# Regression Model
In this notebook, we try to use nonlinear regression models in order to predict the levels of reaction to an event from different metrics that we have gathered. 

In [12]:
import pandas as pd 
import matplotlib as plt
import seaborn as sns
import numpy as np
import os
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import StandardScaler



In [41]:
# Standardize function : 
def stand(df):
    maxvalue = df.replace(np.inf,0.0).max().max()
    infvalreplace = 2 * maxvalue
    df.replace(np.inf,infvalreplace,inplace=True)
    myindex = df.index
    mycolumns = df.columns
    mylist = df.as_matrix()
    myshape = np.shape(mylist)
    std_scaler = StandardScaler().fit(np.array(mylist).reshape(-1, 1))
    scaled_list = std_scaler.transform(np.array(mylist).reshape(-1, 1)).reshape(myshape)   
    return pd.DataFrame(scaled_list,index=myindex,columns=mycolumns)


In [2]:
# We first define our local path to the Git Hub page repository : 
story_path = '../../../Data-Story/'
data_path = '../../../Project Data'
tweet_path = os.path.join(data_path,'Tweets')

#### Loadinf the Tweets Summary and Normalize them to the average 

In [3]:
event_list = [ev for ev in os.listdir(tweet_path) if ('.DS_Store' not in ev)]
i = 0
for event in event_list:
    
    i += 1
    summary_path = os.path.join(tweet_path,event,'Geocoded/summary.pickle')
    if i == 1 :
        summary_df = pd.read_pickle(summary_path)
        summary_df.columns = [event]
    else :
        event_df = pd.read_pickle(summary_path)
        event_df.columns = [event]
        summary_df = summary_df.join(event_df)
summary_df.fillna(0.0,inplace=True)

event_country = dict(zip(list(summary_df.columns),["FR", "TR",  "NG","US","BE", "PK", "LB", "NG"]))

# Setting 
all_tweets = summary_df.copy()
summary_norm_df = summary_df.copy()

for event, country in event_country.items():
    all_tweets.loc[country,event] = np.nan
for event in list(summary_df.columns):
    summary_norm_df.loc[:,event]  = summary_df.loc[:,event]/ all_tweets.mean(axis=1, skipna = True)
summary_norm_df.sort_index(inplace=True)
summary_norm_df.head()

Unnamed: 0_level_0,Charlie-Hebdo,Istanbul,Nigeria_2015,Orlando,Bruxelles,Pakistan,Lebanon,Nigeria_2016
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AD,4.915663,0.082616,0.0,2.203098,0.784854,0.013769,0.0,0.0
AE,3.310254,0.464227,0.170568,1.586108,0.669964,1.34872,0.436971,0.013188
AF,3.971223,0.141269,0.109876,2.192283,0.627861,0.81622,0.068018,0.07325
AG,3.356098,0.039024,0.039024,4.019512,0.429268,0.117073,0.0,0.0
AI,0.188764,0.0,0.026966,7.640449,0.107865,0.026966,0.0,0.008989


#### Loading additional Informations

In [4]:
country_data_path = os.path.join(data_path,'country_data.pickle')
c_data_raw_df = pd.read_pickle(country_data_path)
c_data_raw_df.reset_index(inplace=True)
country_data = c_data_raw_df[[('name',''),('ISO3',''),('ISO2',''),('Internet users',''),('gdp',''),('gdp_capita',''),('POP',''),('pop_pov','')]]
country_data.columns = country_data.columns.droplevel(level=1)
country_data.sort_values('ISO2',inplace=True)
country_data.reset_index().drop('index',axis=1,inplace=True)
country_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,name,ISO3,ISO2,Internet users,gdp,gdp_capita,POP,pop_pov
6,Andorra,AND,AD,83000.0,4800000000.0,37200.0,85660.0,0.143
7,United Arab Emirates,ARE,AE,5274000.0,375000000000.0,67700.0,5927482.0,0.195
1,Afghanistan,AFG,AF,2690000.0,18400000000.0,2000.0,33332025.0,0.358
13,Antigua and Barbuda,ATG,AG,60000.0,1303000000.0,24100.0,93581.0,
3,Anguilla,AIA,AI,12000.0,175400000.0,12200.0,16752.0,0.23


#### Loading Languages Distances : 

In [43]:
language_df = pd.read_pickle("./../LinkingLanguages/country_dist_languages.pkl")
language_df.sort_index(inplace=True)
language_df.sort_index(axis=1,inplace=True)

# Standardization : 
language_df = stand(language_df)

language_df.head()

Unnamed: 0,AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,...,VN,VU,WF,WS,XK,YE,YT,ZA,ZM,ZW
AD,-1.217014,1.492665,-0.398997,-0.245619,-0.245619,-0.654628,-0.526813,-0.910258,1.492665,-1.01251,...,1.492665,-0.381955,-0.75688,-0.245619,-0.526813,1.492665,-0.75688,-0.296745,-0.245619,-0.313787
AE,1.492665,-1.217014,-0.705754,1.492665,1.492665,1.492665,1.492665,1.492665,1.492665,1.492665,...,1.492665,1.492665,1.492665,1.492665,1.492665,-1.217014,1.492665,1.492665,1.492665,1.492665
AF,-0.398997,-0.705754,-1.217014,-0.450123,-0.450123,-0.859132,-0.731317,-0.194493,1.492665,-0.296745,...,1.492665,-0.313787,-0.143367,-0.450123,-0.731317,-0.705754,-0.143367,-0.501249,-0.450123,-0.552376
AG,-0.245619,1.492665,-0.450123,-1.217014,-1.217014,-0.705754,-0.577939,-0.041115,1.492665,-0.143367,...,1.492665,-1.217014,0.010011,-1.217014,-0.577939,1.492665,0.010011,-1.217014,-1.217014,-1.217014
AI,-0.245619,1.492665,-0.450123,-1.217014,-1.217014,-0.705754,-0.577939,-0.041115,1.492665,-0.143367,...,1.492665,-1.217014,0.010011,-1.217014,-0.577939,1.492665,0.010011,-1.217014,-1.217014,-1.217014


#### Loading Real Distance :

In [44]:
real_distance_df = pd.read_pickle("./../GeoMetrics/real_distance.pickle")
real_distance_df.columns = real_distance_df.columns.droplevel(level=0)
real_distance_df = real_distance_df*151 # Adapting the scale to km 
real_distance_df.sort_index(inplace=True)
real_distance_df.sort_index(axis=1,inplace=True)

# Standardization : 
real_distance_df = stand(real_distance_df)

real_distance_df.head()

Unnamed: 0_level_0,AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,...,VN,VU,WF,WS,XK,YE,YT,ZA,ZM,ZW
ISO2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AD,-1.779986,-0.621329,-0.443515,-0.359882,-0.342409,-1.393642,-0.873033,-0.581714,0.978196,0.316306,...,0.500219,1.873797,2.096938,2.022032,-1.370607,-0.655487,-0.312769,-0.219753,-0.444163,-0.350166
AE,-0.621329,-1.779986,-1.484147,0.634744,0.661789,-0.988737,-1.397871,-0.72015,0.845693,0.956868,...,-0.647637,0.71514,1.031162,1.110193,-0.993826,-1.554836,-0.991556,-0.512312,-0.826798,-0.736734
AF,-0.443515,-1.484147,-1.779986,0.880172,0.905429,-0.828618,-1.338921,-0.425802,1.115788,1.245736,...,-0.822025,0.575438,0.874016,0.951252,-0.845666,-1.264627,-0.740467,-0.23279,-0.543455,-0.457938
AG,-0.359882,0.634744,0.880172,-1.779986,-1.742129,-0.005828,0.493818,0.001052,0.792932,-0.716387,...,1.751003,1.036275,0.68364,0.607504,0.02742,0.505914,0.531795,0.246926,0.243952,0.280601
AI,-0.342409,0.661789,0.905429,-1.742129,-1.779986,0.01474,0.516584,0.036426,0.828815,-0.692257,...,1.779667,1.014965,0.662751,0.586973,0.047517,0.534952,0.565972,0.283821,0.279092,0.316367


#### Loading Hop Matrix

In [45]:
hop_distance_df = pd.read_pickle("./../GeoMetrics/hop_distance.pickle")
hop_distance_df.columns = hop_distance_df.columns.droplevel(level=0)
hop_distance_df.sort_index(inplace=True)
hop_distance_df.sort_index(axis=1,inplace=True)

# Standardization : 
hop_distance_df = stand(hop_distance_df)

hop_distance_df.head()

Unnamed: 0_level_0,AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,...,VN,VU,WF,WS,XK,YE,YT,ZA,ZM,ZW
ISO2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AD,-1.929962,-1.281621,-1.497734,0.663402,0.663402,-1.497734,-1.497734,-1.353659,0.663402,0.663402,...,-1.497734,0.663402,0.663402,0.663402,-1.497734,-1.281621,0.663402,-1.209583,-1.353659,0.663402
AE,-1.281621,-1.929962,-1.64181,0.663402,0.663402,-1.569772,-1.64181,-1.353659,0.663402,0.663402,...,-1.497734,0.663402,0.663402,0.663402,-1.497734,-1.785886,0.663402,-1.209583,-1.353659,0.663402
AF,-1.497734,-1.64181,-1.929962,0.663402,0.663402,-1.64181,-1.785886,-1.281621,0.663402,0.663402,...,-1.785886,0.663402,0.663402,0.663402,-1.569772,-1.64181,0.663402,-1.137545,-1.281621,0.663402
AG,0.663402,0.663402,0.663402,-1.929962,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,...,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402
AI,0.663402,0.663402,0.663402,0.663402,-1.929962,0.663402,0.663402,0.663402,0.663402,0.663402,...,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402


#### Loading Flisht Distances

In [46]:
flight_distance_df = pd.read_pickle("./../GeoMetrics/flight_routes.pickle")
flight_distance_df.columns = flight_distance_df.columns.droplevel(level=0)
flight_distance_df.sort_index(inplace=True)
flight_distance_df.sort_index(axis=1,inplace=True)

# Standardization : 
flight_distance_df = stand(flight_distance_df)

flight_distance_df.head()

Unnamed: 0_level_0,AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,...,VN,VU,WF,WS,XK,YE,YT,ZA,ZM,ZW
ISO2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AD,0.277678,0.277678,0.277678,-3.66751,-3.643886,0.277678,0.277678,0.277678,0.277678,0.277678,...,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678
AE,0.277678,0.277678,-3.671868,0.277678,0.277678,0.277678,-3.685629,-3.685629,0.277678,0.277678,...,-3.682877,0.277678,0.277678,0.277678,0.277678,-3.660858,0.277678,-3.663611,-3.688381,0.277678
AF,0.277678,-3.212139,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,...,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678
AG,-3.634436,0.277678,0.277678,0.277678,-3.634436,0.277678,0.277678,0.277678,0.277678,0.277678,...,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678
AI,-2.897371,0.277678,0.277678,-3.294253,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,...,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678


#### Loading Neiborhood 

In [9]:
neib_distance_df = pd.read_pickle("./../GeoMetrics/neighbor_influence_v2.pickle")
neib_distance_df.columns = neib_distance_df.columns.droplevel(level=0)
neib_distance_df.sort_index(inplace=True)
neib_distance_df.sort_index(axis=1,inplace=True)

# Standardization : 
flight_distance_df = stand(flight_distance_df)

neib_distance_df.head()

Unnamed: 0_level_0,AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,...,VN,VU,WF,WS,XK,YE,YT,ZA,ZM,ZW
ISO2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AG,,,,,,,,,,,...,,,,,,,,,,
AI,,,,,,,,,,,...,,,,,,,,,,


#### Religion Matrix

In [76]:
religion_distance_df = pd.read_pickle("./../GeoMetrics/rel_distance_df.pickle")
religion_distance_df.sort_index(inplace=True)
religion_distance_df.sort_index(axis=1,inplace=True)
religion_distance_df.dropna(axis=0, how='all').dropna(axis=1, how='all',inplace=True)

# Standardization : 
flight_distance_df = stand(flight_distance_df)

religion_distance_df.head()

Unnamed: 0,AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,...,VN,VU,WF,WS,XK,YE,YT,ZA,ZM,ZW
AD,0.0,0.76531,2.234728,0.687905,0.735152,0.579753,2.208188,2.077037,1.0,2.201636,...,2.002668,0.710898,0.99,0.751473,1.0,1.407864,1.0,0.373125,2.146582,2.140826
AE,0.76531,0.0,2.016003,0.967457,0.99719,0.244894,2.300721,2.198,1.259246,2.330086,...,2.143184,0.982383,1.251319,1.009818,1.259246,1.030272,1.259246,0.797648,2.248994,2.24326
AF,2.234728,2.016003,0.0,2.338209,2.352543,2.049274,1.367518,1.143718,1.412094,1.356912,...,1.001346,2.345077,2.444199,2.357694,1.412094,1.000018,1.412094,2.259053,1.265631,1.255845
AG,0.687905,0.967457,2.338209,0.0,0.050961,0.890464,2.017606,2.049255,1.21376,2.267645,...,2.112054,0.04531,1.136201,0.119971,1.21376,1.566938,1.21376,0.317545,2.004819,2.001444
AI,0.735152,0.99719,2.352543,0.050961,0.0,0.928957,2.011527,2.059332,1.241148,2.287629,...,2.128139,0.064498,1.177246,0.126143,1.241148,1.588247,1.241148,0.36632,2.004594,2.000246


## SVR Regression : 

### 1. Event : Orlando Attack : 

In [11]:
c_event = 'US'
event = 'Orlando'
# Y vector output
Y =  list(summary_norm_df.loc[:,event])
# X vector input
X = [];

# Adding Distances vectors to the country of the event :
X = X + list(language_df.loc[c_event,:])
X = X + list(real_distance_df.loc[c_event,:])
X = X + list(hop_distance_df.loc[c_event])
X = X + list(flight_distance_df.loc[:,c_event])
X = X + list(neib_distance_df.loc[:,c_event])
X = X + list(religion_distance_df.loc[:,c_event])

# Adding Other informations on all the countries :
X = X + list(country_data.loc[:])