# Exploratory Data and Feature Analysis FCDO

This notebook roughly explores the features and their relation to conflict outbreak. We try to infer which pre-processing steps are necessary in order to fit a random forest model. We also use correlation coefficients to assess how well the features explain conflict.

In [66]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import logging.config
import difflib as dl

In [62]:
# Notebook-wide pandas display settings: show wide/long frames in full and
# render floats with thousands separators and two decimals.
for option_name, option_value in (
    ('display.max_columns', 500),
    ('display.max_rows', 1000),
    ('display.float_format', '{:,.2f}'.format),
):
    pd.set_option(option_name, option_value)

In [36]:
# Load the raw table from the Data folder next to this notebook.
root = './Data/'
file = 'FCDO_data.csv'
data = pd.read_csv(f'{root}{file}')

## Exploratory Analysis

In [37]:
# First look at the table: five randomly drawn rows (unseeded, so the rows differ between runs).
data.sample(n=5)

Unnamed: 0,ADM3_EN,IDAHO_EPSCOR_TERRACLIMATE_Max_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_Mean_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_SD_Mean_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_Min_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_SD_Min_Climate_Water_Deficit,ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Bare,ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Bare,ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Potential,...,Strategic developments,Violence against civilians,total_event_types,"('fatalities', 'Battles')","('fatalities', 'Explosions/Remote violence')","('fatalities', 'Protests')","('fatalities', 'Riots')","('fatalities', 'Strategic developments')","('fatalities', 'Violence against civilians')","('total_fatalities', '')"
189,Markaz Ana,3136,71.175878,1588.232887,42.992616920492175,169,66.902788,0.0,4.137043297674831e-07,-0.001448,...,11,0,19,0,8,0,0,1,0,9
162,Markaz Al-Kadhimiya,3064,49.875841,1560.738971,11.85578428443833,203,7.116356,-1.489324e-05,6.73731196573838e-07,-0.002253,...,3,9,37,9,5,0,0,0,4,18
221,Markaz Tilkaef,2856,41.614693,1057.869235,38.96659814083428,0,0.0,-0.0002089748,3.542985503604047e-05,-0.001792,...,7,3,24,26,13,0,0,0,5,44
60,Al-Muotasim,3238,123.040995,1547.2914,27.80471970536975,155,44.018793,-1.047505e-05,1.0347014324383652e-05,-0.001601,...,1,1,8,11,0,0,0,0,1,12
76,Al-Saadiya,3335,154.774969,1490.17849,42.72876919029818,37,56.888399,-3.003515e-07,2.0563184139741516e-05,-0.001011,...,4,4,31,10,6,0,0,0,3,19


In [38]:
# Print the index range, number of columns, per-dtype column counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Columns: 389 entries, ADM3_EN to ('total_fatalities', '')
dtypes: float64(203), int64(50), object(136)
memory usage: 814.6+ KB


`data.info()` reports 136 columns of dtype object. These columns are not genuinely non-numeric: they are stored as objects only because they contain the placeholder '--'. We therefore replace '--' with NaN and then convert every column (except the administrative district name) to a numeric dtype:

In [39]:
# '--' is the file's missing-value marker; its presence forces otherwise numeric
# columns to dtype object, so turn it into NaN first.
data = data.replace('--', np.nan)

# Convert every column except the district name to a numeric dtype.
# The frame is rebuilt column by column instead of assigning through
# `data.loc[:, mask] = ...`: on pandas >= 2.0 that form writes the converted
# values into the existing object columns and leaves their dtype as object.
# errors='raise' makes any remaining non-numeric value fail loudly.
data = data.apply(
    lambda col: col if col.name == 'ADM3_EN' else pd.to_numeric(col, errors='raise')
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Columns: 389 entries, ADM3_EN to ('total_fatalities', '')
dtypes: float64(338), int64(50), object(1)
memory usage: 814.6+ KB


In [40]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,IDAHO_EPSCOR_TERRACLIMATE_Max_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_Mean_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_SD_Mean_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_Min_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_SD_Min_Climate_Water_Deficit,ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Bare,ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Bare,ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Potential,ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Potential,...,Strategic developments,Violence against civilians,total_event_types,"('fatalities', 'Battles')","('fatalities', 'Explosions/Remote violence')","('fatalities', 'Protests')","('fatalities', 'Riots')","('fatalities', 'Strategic developments')","('fatalities', 'Violence against civilians')","('total_fatalities', '')"
count,268.0,268.0,267.0,267.0,268.0,268.0,265.0,265.0,265.0,265.0,...,268.0,268.0,268.0,268.0,268.0,268.0,268.0,268.0,268.0,268.0
mean,3078.511194,99.042011,1420.084923,44.954994,117.865672,29.129031,-3.762279e-05,2.892569e-05,-0.001575,0.000399,...,5.44403,3.597015,56.850746,16.626866,10.503731,0.41791,1.772388,0.119403,3.179104,32.619403
std,278.466702,96.542933,342.119412,21.483115,154.351454,29.852595,6.841757e-05,3.287351e-05,0.000628,0.000502,...,11.603332,8.050514,121.477577,43.897509,23.234434,3.140406,11.421957,0.554785,7.53951,68.876081
min,2145.0,20.233786,629.419212,7.863495,0.0,0.0,-0.0003488157,1.328332e-07,-0.003085,4.7e-05,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2974.0,57.420354,1174.810897,29.344222,0.0,0.0,-3.04007e-05,3.799419e-06,-0.002053,0.000162,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,3184.5,85.08801,1543.43356,41.20518,0.0,22.779419,-8.313451e-06,1.153981e-05,-0.001511,0.000278,...,1.0,1.0,14.5,2.0,1.0,0.0,0.0,0.0,1.0,6.0
75%,3250.25,120.8063,1671.400007,55.720365,212.0,45.751868,-9.872019e-08,5.282112e-05,-0.001298,0.000451,...,6.0,3.0,46.5,10.25,8.0,0.0,0.0,0.0,3.0,24.25
max,3757.0,1460.724776,2016.981785,130.977805,536.0,167.854296,0.0,0.0001296128,8e-06,0.003872,...,120.0,60.0,792.0,418.0,155.0,40.0,133.0,5.0,83.0,536.0


In [41]:
# Strip single quotes and parentheses from the column names, otherwise column
# selection is awkward: "('total_fatalities', '')" becomes "total_fatalities, ".
# regex=False treats each character literally; "(" and ")" are regex
# metacharacters, and the default of `regex` differs between pandas versions.
for unwanted_char in ("'", "(", ")"):
    data.columns = data.columns.str.replace(unwanted_char, "", regex=False)

  data.columns = data.columns.str.replace("(","")
  data.columns = data.columns.str.replace(")","")


## Including Ranges

In [42]:
# Collect the columns holding minimums and maximums. The match is case-sensitive
# on purpose: there are no lowercase 'min'/'max' columns.
column_names = data.columns
min_cols = column_names[column_names.str.contains('Min', regex=False)].tolist()
max_cols = column_names[column_names.str.contains('Max', regex=False)].tolist()
print(f'{len(max_cols)} {len(min_cols)}')

122 122


In [43]:
# Add a Range (= Max - Min) feature for every Min column.
# `dl` (difflib) is already imported in the import cell at the top of the notebook.
range_features = {}
for min_col in min_cols:
    # Prefer the exact Min -> Max counterpart. Only fall back to the closest
    # name when no exact counterpart exists:
    # https://docs.python.org/3/library/difflib.html#difflib.get_close_matches
    max_col = min_col.replace('Min', 'Max')
    if max_col not in max_cols:
        closest = dl.get_close_matches(min_col, max_cols, n=1)
        if not closest:
            raise ValueError(f'No matching Max column found for {min_col!r}')
        max_col = closest[0]
    range_features[min_col.replace('Min', 'Range')] = data[max_col] - data[min_col]

# Attach all new columns in one concat: inserting them one by one fragments the
# frame and triggers a pandas PerformanceWarning. Range columns left over from a
# previous run of this cell are dropped first so that re-running stays idempotent.
if range_features:
    data = pd.concat(
        [data.drop(columns=list(range_features), errors='ignore'),
         pd.DataFrame(range_features)],
        axis=1,
    )

  data[rangename] = data[max_col]-data[min_col]


In [44]:
# First five rows; the newly added *_Range columns show up at the right-hand end.
data.head()

Unnamed: 0,ADM3_EN,IDAHO_EPSCOR_TERRACLIMATE_Max_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_Mean_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_SD_Mean_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_Min_Climate_Water_Deficit,IDAHO_EPSCOR_TERRACLIMATE_SD_Min_Climate_Water_Deficit,ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Bare,ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Bare,ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Potential,...,NASA_NOAH01_Range_Surface_Wind_Speed,NASA_NOAH01_SD_Range_Surface_Wind_Speed,inuncoast_historical_nosub_hist_Coastal_Flood_Range,inuncoast_historical_nosub_hist_Coastal_Flood_SD_Range,inunriverine_historical_nosub_hist_Riverine_Flood_Range,inunriverine_historical_nosub_hist_Coastal_Riverine_SD_Range,hazard_ls_arup_Landslide_Range,hazard_ls_arup_Landslide_SD_Range,geonode_gfdrrlab_Heatwave_intensity_returnperiod5y_Range,geonode_gfdrrlab_Heatwave_intensity_returnperiod5y_SD_Range
0,Abi Gharaq,3246,72.455552,1698.439815,24.124774,408,22.849043,-4e-06,3e-06,-0.003034,...,3.390722,-0.037728,0,0.0,4,1.750647,0,0.0,1,0.0
1,Abu Dalf,3254,66.514535,1573.475734,23.370776,27,45.19523,-1e-05,7e-06,-0.001444,...,3.525623,0.120121,0,0.0,2,0.461249,2,0.0,2,0.0
2,Abu Ghraib,3206,96.117233,1602.843255,28.625457,203,44.268045,-1.5e-05,1.1e-05,-0.002253,...,3.60578,0.038813,0,0.0,4,1.323454,0,0.0,1,0.0
3,Agjalare,2771,72.290144,1049.910216,43.778659,0,0.0,-0.000263,6.2e-05,-0.001468,...,1.630532,-0.029991,0,0.0,32,4.062918,2,0.0,3,0.032621
4,Akaika,3239,40.780722,1815.596549,32.822756,391,30.965841,-4e-06,3e-06,-0.002439,...,3.699778,0.080373,0,0.0,0,0.0,0,0.0,4,0.216621


## Correlation Analysis:

In [74]:
# Columns that must not be used as predictors.
# NOTE(review): the conflict event-count and per-event-type fatality columns
# (e.g. 'Battles', 'fatalities, Battles') are not listed here and therefore stay
# in X, although the target is presumably derived from them - confirm this is intended.
non_prediction_variables = [
    'ADM3_EN',             # district name (identifier, not a feature)
    'total_fatalities, ',  # target; the trailing ', ' is left over from the original tuple-style column name
]

In [75]:
# Keep every column that is not listed in non_prediction_variables.
predictor_mask = ~data.columns.isin(non_prediction_variables)
X = data.loc[:, predictor_mask]
X.columns

Index(['IDAHO_EPSCOR_TERRACLIMATE_Max_Climate_Water_Deficit',
       'IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Climate_Water_Deficit',
       'IDAHO_EPSCOR_TERRACLIMATE_Mean_Climate_Water_Deficit',
       'IDAHO_EPSCOR_TERRACLIMATE_SD_Mean_Climate_Water_Deficit',
       'IDAHO_EPSCOR_TERRACLIMATE_Min_Climate_Water_Deficit',
       'IDAHO_EPSCOR_TERRACLIMATE_SD_Min_Climate_Water_Deficit',
       'ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Bare',
       'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Bare',
       'ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Potential',
       'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Potential',
       ...
       'NASA_NOAH01_Range_Surface_Wind_Speed',
       'NASA_NOAH01_SD_Range_Surface_Wind_Speed',
       'inuncoast_historical_nosub_hist_Coastal_Flood_Range',
       'inuncoast_historical_nosub_hist_Coastal_Flood_SD_Range',
       'inunriverine_historical_nosub_hist_Riverine_Flood_Range',
       'inunriverine_historical_nosub_hist_Coastal_Riverine_SD_Range',
       'h

In [56]:
# Pairwise (Pearson) correlations between all numeric columns, used below to find
# the variables most correlated with total_fatalities.
# The district name is dropped explicitly: it is a string column, and pandas >= 2.0
# no longer silently ignores non-numeric columns in corr() but raises instead.
correlations = data.drop(columns=['ADM3_EN']).corr()

In [64]:
# Rank variables by the strength of their correlation with total fatalities.
# The sign is dropped because negative and positive correlations are equally
# informative about how strongly a variable is related to the target.
target_correlation = correlations['total_fatalities, ']
correlations['abs_total_fatalities'] = target_correlation.abs()
correlations['abs_total_fatalities'].sort_values(ascending=False)

total_fatalities,                                                      1.00
fatalities, Battles                                                    0.94
fatalities, Explosions/Remote violence                                 0.90
Battles                                                                0.90
total_event_types                                                      0.70
Explosions/Remote violence                                             0.68
Strategic developments                                                 0.62
IDAHO_EPSCOR_TERRACLIMATE_SD_Min_Precipitation_Accumulation            0.60
ECMWF_ERA5_SD_LAND_MONTHLY_Mean_Skin_Reservoir                         0.47
ECMWF_ERA5_SD_LAND_MONTHLY_Min_Skin_Reservoir                          0.47
fatalities, Violence against civilians                                 0.46
Violence against civilians                                             0.43
ECMWF_ERA5_LAND_MONTHLY_Mean_Skin_Reservoir                            0.41
NASA_NOAH01_