# Regression Model
In this notebook, we try to use nonlinear regression models in order to predict the levels of reaction to an event from different metrics that we have gathered. 

In [1]:
import pandas as pd 
import matplotlib as plt
import seaborn as sns
import numpy as np
import os
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import StandardScaler



In [2]:
# Standardize function : 
def stand(df):
    maxvalue = df.replace(np.inf,0.0).max().max()
    infvalreplace = 2 * maxvalue
    df.replace(np.inf,infvalreplace,inplace=True)
    myindex = df.index
    mycolumns = df.columns
    mylist = df.as_matrix()
    myshape = np.shape(mylist)
    std_scaler = StandardScaler().fit(np.array(mylist).reshape(-1, 1))
    scaled_list = std_scaler.transform(np.array(mylist).reshape(-1, 1)).reshape(myshape)   
    return pd.DataFrame(scaled_list,index=myindex,columns=mycolumns)


In [3]:
# We first define our local path to the Git Hub page repository : 
story_path = '../../../Data-Story/'
data_path = '../../../Project Data'
tweet_path = os.path.join(data_path,'Tweets')

#### Loadinf the Tweets Summary and Normalize them to the average 

In [4]:
event_list = [ev for ev in os.listdir(tweet_path) if ('.DS_Store' not in ev)]
i = 0
for event in event_list:
    
    i += 1
    summary_path = os.path.join(tweet_path,event,'Geocoded/summary.pickle')
    if i == 1 :
        summary_df = pd.read_pickle(summary_path)
        summary_df.columns = [event]
    else :
        event_df = pd.read_pickle(summary_path)
        event_df.columns = [event]
        summary_df = summary_df.join(event_df)
summary_df.fillna(0.0,inplace=True)

summary_df = summary_df[summary_df.index != 'UM']
summary_df = summary_df[summary_df.index != 'AN']
summary_df = summary_df[summary_df.index != 'BQ']
summary_df = summary_df[summary_df.index != 'CS']
summary_df = summary_df[summary_df.index != 'SH']
summary_df = summary_df[summary_df.index != 'UM']
summary_df = summary_df[summary_df.index != 'YU']


event_country = dict(zip(list(summary_df.columns),["FR", "TR",  "NG","US","BE", "PK", "LB", "NG"]))

# Setting 
all_tweets = summary_df.copy()
summary_norm_df = summary_df.copy()

for event, country in event_country.items():
    all_tweets.loc[country,event] = np.nan
for event in list(summary_df.columns):
    summary_norm_df.loc[:,event]  = summary_df.loc[:,event]/ all_tweets.mean(axis=1, skipna = True)
summary_norm_df.sort_index(inplace=True)
summary_norm_df.fillna(0.0,inplace=True)

print(summary_df.shape)

summary_norm_df.head()

(247, 8)


Unnamed: 0_level_0,Bruxelles,Charlie-Hebdo,Istanbul,Lebanon,Nigeria_2015,Nigeria_2016,Orlando,Pakistan
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AD,0.784854,4.915663,0.082616,0.0,0.0,0.0,2.203098,0.013769
AE,0.669964,3.310254,0.464227,0.436971,0.170568,0.013188,1.586108,1.34872
AF,0.627861,3.971223,0.141269,0.068018,0.109876,0.07325,2.192283,0.81622
AG,0.429268,3.356098,0.039024,0.0,0.039024,0.0,4.019512,0.117073
AI,0.107865,0.188764,0.0,0.0,0.026966,0.008989,7.640449,0.026966


#### Loading additional Informations

In [5]:
country_data_path = os.path.join(data_path,'country_data.pickle')
c_data_raw_df = pd.read_pickle(country_data_path)
c_data_raw_df.reset_index(inplace=True)
country_data = c_data_raw_df[[('name',''),('ISO3',''),('ISO2',''),('Internet users',''),('gdp',''),('gdp_capita',''),('POP',''),('pop_pov','')]]
country_data.columns = country_data.columns.droplevel(level=1)
country_data.sort_values('ISO2',inplace=True)
country_data = country_data.reset_index().drop('index',axis=1)
country_data = country_data[country_data.ISO2 != 'UM']
print(country_data.shape)
country_data.head()


(247, 8)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,name,ISO3,ISO2,Internet users,gdp,gdp_capita,POP,pop_pov
0,Andorra,AND,AD,83000.0,4800000000.0,37200.0,85660.0,0.143
1,United Arab Emirates,ARE,AE,5274000.0,375000000000.0,67700.0,5927482.0,0.195
2,Afghanistan,AFG,AF,2690000.0,18400000000.0,2000.0,33332025.0,0.358
3,Antigua and Barbuda,ATG,AG,60000.0,1303000000.0,24100.0,93581.0,
4,Anguilla,AIA,AI,12000.0,175400000.0,12200.0,16752.0,0.23


In [6]:
# Standaridizing the c_data : 
c_feature_list = ['Internet users','gdp', 'gdp_capita','POP','pop_pov']

for c_feature in c_feature_list : 
    country_data[[c_feature]] = stand(country_data[[c_feature]].replace(np.nan,0.0))
print(country_data.shape)

country_data.head()

(247, 8)


Unnamed: 0,name,ISO3,ISO2,Internet users,gdp,gdp_capita,POP,pop_pov
0,Andorra,AND,AD,-0.240984,-0.205446,0.785482,-0.237876,-0.489764
1,United Arab Emirates,ARE,AE,-0.142697,0.047819,2.149951,-0.190638,-0.226474
2,Afghanistan,AFG,AF,-0.191623,-0.196142,-0.789249,0.030961,0.59884
3,Antigua and Barbuda,ATG,AG,-0.241419,-0.207839,0.199431,-0.237812,-1.213812
4,Anguilla,AIA,AI,-0.242328,-0.20861,-0.332935,-0.238433,-0.049259


#### Loading Languages Distances : 

In [7]:
language_df = pd.read_pickle("./../LinkingLanguages/country_dist_languages.pkl")
language_df.sort_index(inplace=True)
language_df.sort_index(axis=1,inplace=True)

language_df = language_df[language_df.index != 'UM']

# Standardization : 
language_df = stand(language_df)
print(language_df.shape)
language_df.head()

(247, 248)


Unnamed: 0,AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,...,VN,VU,WF,WS,XK,YE,YT,ZA,ZM,ZW
AD,-1.218223,1.49122,-0.400278,-0.246913,-0.246913,-0.655885,-0.528082,-0.911493,1.49122,-1.013736,...,1.49122,-0.383237,-0.758129,-0.246913,-0.528082,1.49122,-0.758129,-0.298034,-0.246913,-0.315075
AE,1.49122,-1.218223,-0.707007,1.49122,1.49122,1.49122,1.49122,1.49122,1.49122,1.49122,...,1.49122,1.49122,1.49122,1.49122,1.49122,-1.218223,1.49122,1.49122,1.49122,1.49122
AF,-0.400278,-0.707007,-1.218223,-0.451399,-0.451399,-0.860372,-0.732568,-0.195791,1.49122,-0.298034,...,1.49122,-0.315075,-0.14467,-0.451399,-0.732568,-0.707007,-0.14467,-0.502521,-0.451399,-0.553642
AG,-0.246913,1.49122,-0.451399,-1.218223,-1.218223,-0.707007,-0.579203,-0.042427,1.49122,-0.14467,...,1.49122,-1.218223,0.008695,-1.218223,-0.579203,1.49122,0.008695,-1.218223,-1.218223,-1.218223
AI,-0.246913,1.49122,-0.451399,-1.218223,-1.218223,-0.707007,-0.579203,-0.042427,1.49122,-0.14467,...,1.49122,-1.218223,0.008695,-1.218223,-0.579203,1.49122,0.008695,-1.218223,-1.218223,-1.218223


#### Loading Real Distance :

In [8]:
real_distance_df = pd.read_pickle("./../GeoMetrics/real_distance.pickle")
real_distance_df.columns = real_distance_df.columns.droplevel(level=0)
real_distance_df = real_distance_df*151 # Adapting the scale to km 
real_distance_df.sort_index(inplace=True)
real_distance_df.sort_index(axis=1,inplace=True)

# Standardization : 
real_distance_df = stand(real_distance_df)
print(real_distance_df.shape)

real_distance_df.head()

(247, 247)


Unnamed: 0_level_0,AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,...,VN,VU,WF,WS,XK,YE,YT,ZA,ZM,ZW
ISO2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AD,-1.779986,-0.621329,-0.443515,-0.359882,-0.342409,-1.393642,-0.873033,-0.581714,0.978196,0.316306,...,0.500219,1.873797,2.096938,2.022032,-1.370607,-0.655487,-0.312769,-0.219753,-0.444163,-0.350166
AE,-0.621329,-1.779986,-1.484147,0.634744,0.661789,-0.988737,-1.397871,-0.72015,0.845693,0.956868,...,-0.647637,0.71514,1.031162,1.110193,-0.993826,-1.554836,-0.991556,-0.512312,-0.826798,-0.736734
AF,-0.443515,-1.484147,-1.779986,0.880172,0.905429,-0.828618,-1.338921,-0.425802,1.115788,1.245736,...,-0.822025,0.575438,0.874016,0.951252,-0.845666,-1.264627,-0.740467,-0.23279,-0.543455,-0.457938
AG,-0.359882,0.634744,0.880172,-1.779986,-1.742129,-0.005828,0.493818,0.001052,0.792932,-0.716387,...,1.751003,1.036275,0.68364,0.607504,0.02742,0.505914,0.531795,0.246926,0.243952,0.280601
AI,-0.342409,0.661789,0.905429,-1.742129,-1.779986,0.01474,0.516584,0.036426,0.828815,-0.692257,...,1.779667,1.014965,0.662751,0.586973,0.047517,0.534952,0.565972,0.283821,0.279092,0.316367


#### Loading Hop Matrix

In [9]:
hop_distance_df = pd.read_pickle("./../GeoMetrics/hop_distance.pickle")
hop_distance_df.columns = hop_distance_df.columns.droplevel(level=0)
hop_distance_df.sort_index(inplace=True)
hop_distance_df.sort_index(axis=1,inplace=True)

# Standardization : 
hop_distance_df = stand(hop_distance_df)
print(hop_distance_df.shape)

hop_distance_df.head()

(247, 247)


Unnamed: 0_level_0,AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,...,VN,VU,WF,WS,XK,YE,YT,ZA,ZM,ZW
ISO2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AD,-1.929962,-1.281621,-1.497734,0.663402,0.663402,-1.497734,-1.497734,-1.353659,0.663402,0.663402,...,-1.497734,0.663402,0.663402,0.663402,-1.497734,-1.281621,0.663402,-1.209583,-1.353659,0.663402
AE,-1.281621,-1.929962,-1.64181,0.663402,0.663402,-1.569772,-1.64181,-1.353659,0.663402,0.663402,...,-1.497734,0.663402,0.663402,0.663402,-1.497734,-1.785886,0.663402,-1.209583,-1.353659,0.663402
AF,-1.497734,-1.64181,-1.929962,0.663402,0.663402,-1.64181,-1.785886,-1.281621,0.663402,0.663402,...,-1.785886,0.663402,0.663402,0.663402,-1.569772,-1.64181,0.663402,-1.137545,-1.281621,0.663402
AG,0.663402,0.663402,0.663402,-1.929962,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,...,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402
AI,0.663402,0.663402,0.663402,0.663402,-1.929962,0.663402,0.663402,0.663402,0.663402,0.663402,...,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402,0.663402


#### Loading Flisht Distances

In [10]:
flight_distance_df = pd.read_pickle("./../GeoMetrics/flight_routes.pickle")
flight_distance_df.columns = flight_distance_df.columns.droplevel(level=0)
flight_distance_df.sort_index(inplace=True)
flight_distance_df.sort_index(axis=1,inplace=True)

# Standardization : 
flight_distance_df = stand(flight_distance_df)
print(flight_distance_df.shape)

flight_distance_df.head()

(247, 247)


Unnamed: 0_level_0,AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,...,VN,VU,WF,WS,XK,YE,YT,ZA,ZM,ZW
ISO2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AD,0.277678,0.277678,0.277678,-3.66751,-3.643886,0.277678,0.277678,0.277678,0.277678,0.277678,...,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678
AE,0.277678,0.277678,-3.671868,0.277678,0.277678,0.277678,-3.685629,-3.685629,0.277678,0.277678,...,-3.682877,0.277678,0.277678,0.277678,0.277678,-3.660858,0.277678,-3.663611,-3.688381,0.277678
AF,0.277678,-3.212139,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,...,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678
AG,-3.634436,0.277678,0.277678,0.277678,-3.634436,0.277678,0.277678,0.277678,0.277678,0.277678,...,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678
AI,-2.897371,0.277678,0.277678,-3.294253,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,...,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678,0.277678


#### Loading Neiborhood 

In [11]:
neib_distance_df = pd.read_pickle("./../GeoMetrics/neighbor_influence_v2.pickle")
neib_distance_df.columns = neib_distance_df.columns.droplevel(level=0)
neib_distance_df.sort_index(inplace=True)
neib_distance_df.sort_index(axis=1,inplace=True)

# Handle the nan

neib_distance_df.replace(np.nan,0.0,inplace=True)

# Standardization : 
neib_distance_df = stand(neib_distance_df)
print(neib_distance_df.shape)

neib_distance_df.head()

(247, 247)


Unnamed: 0_level_0,AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,...,VN,VU,WF,WS,XK,YE,YT,ZA,ZM,ZW
ISO2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AD,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,...,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052
AE,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,...,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052
AF,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,...,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052
AG,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,...,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052
AI,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,...,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052,-0.068052


#### Religion Matrix

In [12]:
religion_distance_df = pd.read_pickle("./../GeoMetrics/rel_distance_df.pickle")
religion_distance_df.sort_index(inplace=True)
religion_distance_df.sort_index(axis=1,inplace=True)
religion_distance_df.replace(np.nan,np.inf,inplace=True)

religion_distance_df = religion_distance_df[religion_distance_df.index != 'UM']


# Standardization : 
religion_distance_df = stand(religion_distance_df)
print(religion_distance_df.shape)
religion_distance_df.head()

(247, 248)


Unnamed: 0,AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,...,VN,VU,WF,WS,XK,YE,YT,ZA,ZM,ZW
AD,-1.131223,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826,...,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826
AE,1.680826,-1.131223,-0.883685,-0.138702,-0.115687,-0.927963,-0.020785,-0.121003,1.680826,-0.017705,...,-0.385963,-0.099547,0.018979,-0.032357,1.680826,-0.889153,1.680826,-0.328744,-0.006915,-0.086573
AF,1.680826,-0.883685,-1.131223,0.095836,0.116934,-0.68045,0.204065,0.11206,1.680826,0.206884,...,-0.152897,0.131733,0.240693,0.193421,1.680826,-1.125364,1.680826,-0.081709,0.216831,0.143632
AG,1.680826,-0.138702,0.095836,-1.131223,-1.097048,-0.327282,-0.964253,-1.104859,1.680826,-0.959235,...,-0.453083,-1.073614,-0.911526,-0.979876,1.680826,0.091193,1.680826,-0.810619,-0.945701,-1.055061
AI,1.680826,-0.115687,0.116934,-1.097048,-1.131223,-0.302168,-0.998428,-1.123412,1.680826,-0.993135,...,-0.419119,-1.107789,-0.945701,-1.014051,1.680826,0.11237,1.680826,-0.776476,-0.979876,-1.089236


###  Store each of the individual dataframes

In [21]:
file_path = os.path.join(os.getcwd(), "StandardizedData")

directory = os.path.dirname(file_path)

if not os.path.exists(directory):
    os.makedirs(directory)
    
dataframes = [religion_distance_df,neib_distance_df,flight_distance_df,hop_distance_df,real_distance_df,language_df]
names = ["religion", "neighbor", "flight", "hop", "real_distance", "language"]
for name, df in zip(names, dataframes) :
    df.to_pickle(name+".pkl")
    
summary_norm_df.to_pickle("summary_norm.pkl")

### Add A full Dataframe : 

In [14]:
full_features = pd.DataFrame(index = religion_distance_df.index)
full_features = pd.merge(full_features,religion_distance_df,right_index=True,left_index=True)
full_features = pd.merge(full_features,neib_distance_df,right_index=True,left_index=True)
full_features = pd.merge(full_features,flight_distance_df,right_index=True,left_index=True)
full_features = pd.merge(full_features,hop_distance_df,right_index=True,left_index=True)
full_features = pd.merge(full_features,real_distance_df,right_index=True,left_index=True)
full_features = pd.merge(full_features,language_df,right_index=True,left_index=True)
full_features = pd.merge(full_features,country_data.set_index('ISO2').loc[:,c_feature_list],right_index=True,left_index=True)

full_features.to_pickle('all_features.pickle')
full_features.head()

Unnamed: 0,AD_x,AE_x,AF_x,AG_x,AI_x,AL_x,AM_x,AO_x,AQ_x,AR_x,...,YE_y,YT_y,ZA_y,ZM_y,ZW_y,Internet users,gdp,gdp_capita,POP,pop_pov
AD,-1.131223,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826,1.680826,...,1.49122,-0.758129,-0.298034,-0.246913,-0.315075,-0.240984,-0.205446,0.785482,-0.237876,-0.489764
AE,1.680826,-1.131223,-0.883685,-0.138702,-0.115687,-0.927963,-0.020785,-0.121003,1.680826,-0.017705,...,-1.218223,1.49122,1.49122,1.49122,1.49122,-0.142697,0.047819,2.149951,-0.190638,-0.226474
AF,1.680826,-0.883685,-1.131223,0.095836,0.116934,-0.68045,0.204065,0.11206,1.680826,0.206884,...,-0.707007,-0.14467,-0.502521,-0.451399,-0.553642,-0.191623,-0.196142,-0.789249,0.030961,0.59884
AG,1.680826,-0.138702,0.095836,-1.131223,-1.097048,-0.327282,-0.964253,-1.104859,1.680826,-0.959235,...,1.49122,0.008695,-1.218223,-1.218223,-1.218223,-0.241419,-0.207839,0.199431,-0.237812,-1.213812
AI,1.680826,-0.115687,0.116934,-1.097048,-1.131223,-0.302168,-0.998428,-1.123412,1.680826,-0.993135,...,1.49122,0.008695,-1.218223,-1.218223,-1.218223,-0.242328,-0.20861,-0.332935,-0.238433,-0.049259


In [15]:
full_positive_features = pd.DataFrame(index = religion_distance_df.index)

# Dealing with the df
for  df in [religion_distance_df,neib_distance_df,flight_distance_df,hop_distance_df,real_distance_df,language_df]:
    temp_df = df - df.min().min()
    full_positive_features = pd.merge(full_positive_features,temp_df,right_index=True,left_index=True)

# Dealing with the arrays of information : 
country_data.set_index('ISO2').loc[:,c_feature_list]

for c_feature in c_feature_list : 
    temp_df = country_data.set_index('ISO2')[[c_feature]] - country_data.set_index('ISO2')[[c_feature]].min().min()
    full_positive_features = pd.merge(full_positive_features,temp_df,right_index=True,left_index=True)

full_positive_features.to_pickle('all_features_pos.pickle')
full_positive_features.head()


Unnamed: 0,AD_x,AE_x,AF_x,AG_x,AI_x,AL_x,AM_x,AO_x,AQ_x,AR_x,...,YE_y,YT_y,ZA_y,ZM_y,ZW_y,Internet users,gdp,gdp_capita,POP,pop_pov
AD,0.0,2.812049,2.812049,2.812049,2.812049,2.812049,2.812049,2.812049,2.812049,2.812049,...,2.709443,0.460094,0.920188,0.97131,0.903148,0.001572,0.003284,1.664205,0.000693,0.724048
AE,2.812049,0.0,0.247539,0.992521,1.015536,0.20326,1.110438,1.01022,2.812049,1.113518,...,0.0,2.709443,2.709443,2.709443,2.709443,0.099858,0.256549,3.028674,0.047931,0.987338
AF,2.812049,0.247539,0.0,1.227059,1.248157,0.450773,1.335288,1.243283,2.812049,1.338107,...,0.511216,1.073553,0.715702,0.766824,0.66458,0.050933,0.012588,0.089473,0.26953,1.812652
AG,2.812049,0.992521,1.227059,0.0,0.034175,0.803941,0.16697,0.026364,2.812049,0.171988,...,2.709443,1.226918,0.0,0.0,0.0,0.001136,0.000891,1.078154,0.000757,0.0
AI,2.812049,1.015536,1.248157,0.034175,0.0,0.829055,0.132795,0.007811,2.812049,0.138088,...,2.709443,1.226918,0.0,0.0,0.0,0.000227,0.00012,0.545788,0.000135,1.164553


## SVR Regression : 

We use the previous features to create a non linear Regression Model. We are going to fit it with multiple outputs : 7 normalized tweets outputs per country. Once the model is trained we compute the error of th estimation  with the 8th output. 

In [16]:
def create_sets(event_list,c_event_list):
    
    
    for i in range(len(event_list)):
        
        # country of the event and its name : 
        c_event = c_event_list[i]
        event = event_list[i]
        
        # Y vector output
        if i == 0 : 
            Y =  np.array(list(summary_norm_df.loc[:,event])).reshape(1,-1)
        else : 
            Y =  np.r_[Y, np.array(list(summary_norm_df.loc[:,event])).reshape(1,-1)]        
        
        
        
        # X vector input
        # Adding Distances vectors to the country of the event :
        X_i = np.array(list(language_df.loc[:,c_event]))
        X_i = np.c_[X_i ,  np.array(list(real_distance_df.loc[:,c_event]))]
        X_i = np.c_[X_i ,  np.array(list(hop_distance_df.loc[:,c_event]))]
        X_i = np.c_[X_i ,  np.array(list(flight_distance_df.loc[:,c_event]))]
        X_i = np.c_[X_i ,  np.array(list(neib_distance_df.loc[:,c_event]))]
        X_i = np.c_[X_i ,  np.array(list(religion_distance_df.loc[:,c_event]))]
        # Adding Other informations on all the countries :
        X_i = np.c_[X_i,  np.array(list(country_data.loc[:,c_feature_list].as_matrix()))].reshape(1,-1)
        
        # Updating the global X vector 
        if i == 0 : 
            X =  X_i
        else :
            X = np.r_[X , X_i]
    
    
            
    return X, Y


In [17]:
event_list = ['Charlie-Hebdo','Istanbul','Nigeria_2015','Orlando','Bruxelles','Pakistan','Lebanon','Nigeria_2016']
c_event_list = ['FR','TR','NG','US','BE','PK','LB','NG']

In [18]:
 X , Y = create_sets(event_list, c_event_list)

We fit the SVR estimator multiple times with different outputs. 

In [19]:
SVRegression = MultiOutputRegressor(SVR(C=1.0, epsilon=0.2),-1)
SVRegression.fit(X[:7,:],Y[:7,:])

MultiOutputRegressor(estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
           n_jobs=-1)

In [20]:
SVRegression.score(X[7,:].reshape(1,-1),Y[7,:].reshape(1,-1))

0.0080971659919028341