# Exploratory Data and Feature Analysis FCDO

This notebook is meant to roughly explore the features and their relation with conflict outbreak.We fit a random forest model to see which value have the most predictive power when predicting conflict. The variables most capable of predicting conflict are also the ones deemed capable of explaining conflict in causal terms. 

In [86]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import logging.config
import difflib as dl
import sys

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

In [87]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)
pd.options.display.float_format = '{:,.4f}'.format

In [88]:
root = './Data/'
file = 'FCDO_data.csv'
data = pd.read_csv(root+file)

## Feature Engineering

In [89]:
class Prepare_Data:
    def __init__(self, data):
        """
        Class function to preprocess data to data that can we used for feature engineering. Imports the raw data and exports data that 
        immediatly fits random forest model. Since we are using random forest model, it is not necessary to scale data. Random forest models
        ares also able to deal with colinearity within features. Also since there are so many columns we do not rename them yet.
        :param :
        :param :
        """
            # Create logger
        log_format = '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s - %(message)s'
        logging.basicConfig(format=log_format, level=logging.INFO, stream=sys.stdout)
        logger = logging.getLogger()
        self.data = data
        self.logger = logging.getLogger(__name__)

    def prepare_data(self):
        """
        This functions starts the pre processing activities for the random forest model
        :return:
        """
        self.logger.info("start preprocessing data")

        # Change the object types
        self.change_object_type()
        # Deal with awkward column string names
        self.deal_with_string_column_names()
        # Add new features based on range
        self.include_range_data()
        # Deal with missing data
        self.deal_with_missing_data()

        return self.data

    def change_object_type(self):
        """
        Since all the data are floats, we convert the data type.
        :return:
        """

        self.logger.info("start initiating data")
        data = self.data

        # Eliminate all rows where there is no sales price present
        data.replace(['--'], [np.nan], inplace=True)  # replace -- by np.nan
        data.loc[:, data.columns != 'ADM3_EN'] = data.loc[:, data.columns != 'ADM3_EN'].apply(pd.to_numeric, errors='raise')
        self.data=data

    def deal_with_string_column_names(self):
        """
        This functions adjusts column names.
        :return:
        """

        self.logger.info("deal with missing data")
        data = self.data
        
        # #Strip symbols from column names for efficient column selection
        data.columns = data.columns.str.replace("'","")
        data.columns = data.columns.str.replace("''","")
        data.columns = data.columns.str.replace("(","")
        data.columns = data.columns.str.replace(")","")

        self.data = data

    def include_range_data(self):
        """
        Create new features based on range between the minimum and the maximum value of that feature.
        :return:
        """

        self.logger.info("Start adding range features.")
        data = self.data
        
        # Find columns minimums and maximums. (Note: there are no min/max columns (without capital letter))
        min_cols = [col for col in data.columns if 'Min' in col]
        max_cols = [col for col in data.columns if 'Max' in col]
        
        # Find which column from min corresponds with max: #https://docs.python.org/3/library/difflib.html#difflib.get_close_matches
        for min_col in min_cols:
            max_col = dl.get_close_matches(min_col, max_cols, n=1)[0]
            rangename=min_col.replace('Min','Range')
            data[rangename] = data[max_col]-data[min_col]

        self.data=data
        
    def deal_with_missing_data(self):
        """
        This functions deals with missing data and replaces it with the nodes. 
        :return:
        """

        self.logger.info("deal with missing data")
        data = self.data
        
        #Replace inf by NaN values
        data.replace([np.inf, -np.inf], np.nan, inplace=True)
        
        #Fill NaN values
        data.fillna(data.mean(), inplace=True)
        
        self.data = data



## Random Forest Model

In [90]:
class ForecastModel:
    def __init__(self, data):
        """
        :param data:
        """
        self.logger = logging.getLogger(__name__)
        self.fatalities_columns = ['Year Made','Machine Size','Model Description','Auctioneer ID', 'Model ID','Coupler']
        self.data = data
        self.dependent_variable = 'total_fatalities, '
        self.non_prediction_variables = ['ADM3_EN','total_fatalities, ']
        self.feature_performance = pd.DataFrame(columns={'Feature_name', 'Feature_importance'})

    def calculate(self):
        """
        This functions starts the individual functions for running and evaluating the machine learning model
        :return:
        """
        self.logger.info("start training and predicting module")

        # Split the data set into train and test
        self.split_data()
        # train the model
        self.fit_model()
        # Make predictions after training
        self.predict_model()
        # Return the R2 ans RMSE performance of the model
        self.return_performance()
        # Return the performance of the individual features
        self.feature_scoring()

        return self.R2, self.RMSE, self.feature_performance

    def split_data(self):
        """
        Splits data into training and test (use 'rule of thumb' 4/5 for training)
        :return:
        """
        self.logger.info("start splitting data")
        X = self.data.loc[:,~(self.data.columns.isin(self.non_prediction_variables))]
        y = self.data.loc[:,self.dependent_variable]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=3)

    def fit_model(self):
        """
        Fit the model
        :return:
        """
        self.logger.info("start fitting model")
        # No additional parameters given
        model = RandomForestRegressor()
        model.fit(self.X_train , self.y_train)
        self.model=model

    def predict_model(self):
        """
        Make predictions with the trained model.
        """
        self.logger.info("start predicting model")
        self.y_test_predict = self.model.predict(self.X_test)

    def return_performance(self):
        """
        Return overal performance in the form of RMSE and R2 score.
        """
        self.logger.info("start returning performance")
        self.RMSE = (np.sqrt(mean_squared_error(self.y_test, self.y_test_predict)))
        self.R2 = (r2_score(self.y_test, self.y_test_predict))

    def feature_scoring(self):
        """
        Return feature performance by inbuilt function.
        """
        self.logger.info("start returning feature scoring")
        feature_performance = self.feature_performance
        feature_performance['Feature_importance'] = self.model.feature_importances_
        for i in range(0,len(feature_performance)):
            feature_performance.loc[i,'Feature_name']=self.X_train.columns[i]

        self.feature_performance=feature_performance



## Applying Models

In [91]:
pre_processing = Prepare_Data(data)
prepared_data = pre_processing.prepare_data()

2021-12-16 16:57:49,151 - __main__ - INFO - prepare_data - start preprocessing data
2021-12-16 16:57:49,153 - __main__ - INFO - change_object_type - start initiating data
2021-12-16 16:57:49,276 - __main__ - INFO - deal_with_string_column_names - deal with missing data
2021-12-16 16:57:49,279 - __main__ - INFO - include_range_data - Start adding range features.


  data.columns = data.columns.str.replace("(","")
  data.columns = data.columns.str.replace(")","")
  data[rangename] = data[max_col]-data[min_col]


2021-12-16 16:57:49,971 - __main__ - INFO - deal_with_missing_data - deal with missing data


  data.fillna(data.mean(), inplace=True)


In [92]:
RF_model = ForecastModel(prepared_data)
R2, RMSE, feature_performance=RF_model.calculate()

2021-12-16 16:57:51,322 - __main__ - INFO - calculate - start training and predicting module
2021-12-16 16:57:51,323 - __main__ - INFO - split_data - start splitting data
2021-12-16 16:57:51,326 - __main__ - INFO - fit_model - start fitting model
2021-12-16 16:57:54,268 - __main__ - INFO - predict_model - start predicting model
2021-12-16 16:57:54,279 - __main__ - INFO - return_performance - start returning performance
2021-12-16 16:57:54,280 - __main__ - INFO - feature_scoring - start returning feature scoring


In [93]:
feature_performance.sort_values('Feature_importance', ascending=False, inplace=True)

In [94]:
feature_performance.head(30)

Unnamed: 0,Feature_name,Feature_importance
381,"fatalities, Battles",0.2752
382,"fatalities, Explosions/Remote violence",0.2484
374,Battles,0.159
375,Explosions/Remote violence,0.0726
255,ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_SD_Mean_Te...,0.0227
378,Strategic developments,0.0135
380,total_event_types,0.0115
249,ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_SD_Mean_Te...,0.0069
384,"fatalities, Riots",0.0062
11,ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Top...,0.0061


## Run Model without Fatalities columns

In [117]:
data2 =data.loc[:, ~data.columns.isin(['fatalities, Riots', 'fatalities, Battles', 'fatalities, Protests'])]

In [118]:
pre_processing = Prepare_Data(data2)
prepared_data = pre_processing.prepare_data()

2021-12-16 17:02:49,611 - __main__ - INFO - prepare_data - start preprocessing data
2021-12-16 17:02:49,612 - __main__ - INFO - change_object_type - start initiating data
2021-12-16 17:02:49,728 - __main__ - INFO - deal_with_string_column_names - deal with missing data
2021-12-16 17:02:49,731 - __main__ - INFO - include_range_data - Start adding range features.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)
  data.columns = data.columns.str.replace("(","")
  data.columns = data.columns.str.replace(")","")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[rangename] = data[max_col]-data[min_col]


2021-12-16 17:02:50,423 - __main__ - INFO - deal_with_missing_data - deal with missing data


  data.fillna(data.mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [119]:
RF_model = ForecastModel(prepared_data)
R2, RMSE, feature_performance=RF_model.calculate()

2021-12-16 17:02:52,260 - __main__ - INFO - calculate - start training and predicting module
2021-12-16 17:02:52,261 - __main__ - INFO - split_data - start splitting data
2021-12-16 17:02:52,264 - __main__ - INFO - fit_model - start fitting model
2021-12-16 17:02:55,290 - __main__ - INFO - predict_model - start predicting model
2021-12-16 17:02:55,300 - __main__ - INFO - return_performance - start returning performance
2021-12-16 17:02:55,302 - __main__ - INFO - feature_scoring - start returning feature scoring


In [120]:
feature_performance.sort_values('Feature_importance', ascending=False, inplace=True)


In [121]:
feature_performance.head(30)

Unnamed: 0,Feature_name,Feature_importance
381,"fatalities, Explosions/Remote violence",0.4772
374,Battles,0.2036
375,Explosions/Remote violence,0.066
380,total_event_types,0.0142
11,ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Top...,0.0109
245,ECMWF_ERA5_LAND_MONTHLY_Skin_SD_Min_Temperature,0.0093
255,ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_SD_Mean_Te...,0.0087
449,NASA_NOAH01_SD_Range_Soil_Moisture_10cm_40cm,0.0069
97,NASA_NOAH01_SD_Min_Latent_Heat_Net_Flux,0.0065
261,ECMWF_ERA5_LAND_MONTHLY_Soil_Level3_SD_Mean_Te...,0.0058
