# Awareness Modeling


In [None]:
%matplotlib inline
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import seaborn as sns
from scipy import sparse, stats, spatial
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import NMF, LatentDirichletAllocation


## Import the features from the different pickles

In [None]:
# Load the country data. The file is opened in a `with` block so the handle is
# closed as soon as the pickle has been read.
# NOTE(review): unpickling can execute arbitrary code; only load pickles that
# were produced by this project.
with open('../DataEnriching/data.pickle', 'rb') as data_file:
    c_data = pickle.load(data_file)

# Load the per-source feature tables.
lang_feature_df = pd.read_pickle('../LinkingLanguages/stand_country_dist_languages.pkl')
fligh_routes_df = pd.read_pickle('../GeoMetrics/flight_routes_std.pickle')
hop_distance_df = pd.read_pickle('../GeoMetrics/hop_distance_std.pickle')
neighbor_influence_df = pd.read_pickle('../GeoMetrics/neighbor_influence.pickle')
dist_feature_df = pd.read_pickle('../GeoMetrics/real_distance_std.pickle')
religion_and_more_df = pd.read_pickle('../DataEnriching/Pickles for Milestone 3/6_feature_df.pickle')

# Drop the name column.
religion_and_more_df.drop('name', axis=1, inplace=True)

# Shift each standardized table by its global minimum so that its smallest
# value becomes 0 (presumably because the NMF applied later needs non-negative
# input -- confirm).
lang_feature_df = lang_feature_df - lang_feature_df.min().min()
fligh_routes_df = fligh_routes_df - fligh_routes_df.min().min()
hop_distance_df = hop_distance_df - hop_distance_df.min().min()
neighbor_influence_df = neighbor_influence_df - neighbor_influence_df.min().min()
dist_feature_df = dist_feature_df - dist_feature_df.min().min()
# religion_and_more_df is deliberately not shifted as a whole here:
#religion_and_more_df = religion_and_more_df - religion_and_more_df.min().min()

# Careful: the number of countries is not the same in every pickle, so the
# column-wise concatenation can leave missing values in some rows.
features_df = pd.concat(
    [
        lang_feature_df,
        fligh_routes_df,
        hop_distance_df,
        neighbor_influence_df,
        dist_feature_df,
        religion_and_more_df,
    ],
    axis=1,
)
#features_df.sort_index(axis=1,inplace=True)

# Show the first rows that contain at least one missing value.
features_df[features_df.isnull().any(axis=1)].head(10)

In [None]:
def stand(mylist):
    """Standardize a flat sequence of values with a ``StandardScaler``.

    The values are reshaped into a single column, a scaler with default
    settings is fitted on that column, and the transformed column is returned.

    Parameters
    ----------
    mylist : sequence of numbers
        Values to standardize.

    Returns
    -------
    numpy.ndarray
        The scaled values as a column of shape ``(len(mylist), 1)``.
    """
    column = np.array(mylist).reshape(-1, 1)
    return StandardScaler().fit_transform(column)

# Replace every NaN with 0.
features_df.fillna(value=0, inplace=True)
# Replace +inf with a large finite number.
features_df.replace(np.inf, 1E5, inplace=True)

# Shift these two columns so that each one starts at 0.
for shifted_col in ('gov_type_num', 'area'):
    features_df.loc[:, shifted_col] = (
        features_df.loc[:, shifted_col] - features_df.loc[:, shifted_col].min()
    )
# Evaluated but not stored or displayed (it is not the last expression of the cell).
features_df.index.is_unique

# Standardize the raw-scale columns, then shift each one so its minimum is 0.
to_standadize = ['2016_gdp_capita', 'POP', 'area']
for i in to_standadize:
    scaled_column = stand(list(features_df.loc[:, i]))
    features_df.loc[:, i] = scaled_column
    features_df.loc[:, i] -= features_df.loc[:, i].min().min()

features_df.head(20)

In [None]:
# Peek at the first ten entries of the 'religion' column.
features_df['religion'].head(10)

## Applying the NMF

In [None]:
# Plain NumPy array of the feature table (one row per features_df row).
# `DataFrame.as_matrix()` no longer exists in current pandas; `.values` yields
# the same array and also works on older versions.
features_mat = features_df.values


In [None]:
# NOTE(review): axis 1 of features_mat is its column axis, so `n_samples`
# holds the number of columns -- confirm this is the intended count.
n_samples = features_mat.shape[1]
# Not used in the cells visible here.
n_features = 1000
# One NMF component per 15 columns, rounded to the nearest integer.
n_components = round(n_samples / 15)
n_components

In [None]:
# Factorize the feature matrix with NMF; W has one row per row of features_mat
# and n_components columns.
# NOTE(review): newer scikit-learn releases replace the `alpha` argument with
# `alpha_W` / `alpha_H` -- confirm against the installed version.
nmf_model = NMF(
    n_components=n_components,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
)
W = nmf_model.fit_transform(features_mat)

In [None]:
# Dimensions of the NMF embedding.
W.shape

In [None]:
# Distribution of all entries of W, pooled into one flat vector.
# NOTE(review): `distplot` is deprecated in newer seaborn releases -- confirm
# against the installed version.
plt.figure(figsize=(10, 10))
flat_w = W.flatten()
sns.distplot(flat_w)

## Weight Matrix and Graph

In [None]:
# Pairwise cosine distances between the rows of W, expanded from the condensed
# vector returned by pdist into a square matrix.
condensed_distances = spatial.distance.pdist(W, 'cosine')
distances = spatial.distance.squareform(condensed_distances)

plt.figure(figsize=(9, 7))
sns.heatmap(distances)

In [None]:
# Kernel width: the mean over all entries of the distance matrix.
kernel_width = distances.mean()


def kernel(x):
    """Return ``exp(-(x / kernel_width) ** 2)`` element-wise.

    Reads the module-level ``kernel_width`` at call time.
    """
    scaled = x / kernel_width
    return np.exp(-np.power(scaled, 2))


# Turn distances into edge weights.
weights = kernel(distances)
# Remove self-loops by zeroing the diagonal.
np.fill_diagonal(weights, 0)

# Display the weights matrix.
plt.figure(figsize=(9, 7))
sns.heatmap(weights, cmap="Reds")

In [None]:
# Label the weight matrix with the index it was actually computed from:
# features_df -> features_mat -> W -> distances -> weights, so row/column k of
# `weights` corresponds to row k of features_df. Using another table's index
# (e.g. c_data.index) can mislabel or fail when the country sets differ.
weights_df = pd.DataFrame(
    weights,
    index=features_df.index,
    columns=features_df.index,
)

In [None]:
# List the entries of one country's weight column that are at least 0.7.
country = 'France'
country_weights = weights_df[country]
country_weights[country_weights >= 0.7]

In [None]:
# Display the labelled weight matrix.
weights_df