# Load packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# Statsmodels
import statsmodels.formula.api as smf
from statsmodels.api import MNLogit

# Just to print prettier. Uncomment to see all (not important) warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Load data

In [2]:
# Load the raw table and discard the columns flagged as unnecessary
df = pd.read_csv('Redes_2.csv').drop(columns=['Unnamed: 0', 'Subject_num'])
# Preview the first rows
df.head()

Unnamed: 0,Subject_origin,Subject_residence,Mu,Regime,Average_degree,Betweenness,Closeness,Load_centrality,Assortativity,Clustering,...,EMPLOY,EDUCATIO,ASMOKE,MOS,AOS,ACCULTUR,ALEVEL,ego_language,alter_language,FMIG2
0,co,usa,-0.180268,Unclear,34.711111,0.00491,0.839868,0.00491,-0.204603,0.780457,...,14.0,0.31111,0.0,0.0,0.0,0.0,0.0,es,es,0.0
1,co,usa,-0.011112,Unclear,33.863636,0.005059,0.82804,0.005059,-0.02445,0.644311,...,33.0,0.73333,0.0,0.0,0.0,0.0,0.0,es,es,0.0
2,co,usa,0.72707,Standard,4.648649,0.029086,0.203734,0.029086,-0.245212,0.686175,...,30.0,0.66667,0.0,0.0,0.0,0.0,0.0,es,es,0.0
3,co,usa,-0.157215,Unclear,15.422222,0.016655,0.596,0.016655,0.07799,0.651957,...,,34.0,0.75556,0.0,0.0,0.0,0.0,es,es,0.0
4,co,usa,-0.239059,Inverted,29.244444,0.007799,0.768654,0.007799,-0.017031,0.705817,...,,44.0,0.97778,0.0,0.0,0.0,0.0,es,es,0.0


# Prepare and explore data

In [3]:
# Summary statistics for every column; include='all' also reports
# unique/top/freq for the non-numeric columns.
df.describe(include='all')

Unnamed: 0,Subject_origin,Subject_residence,Mu,Regime,Average_degree,Betweenness,Closeness,Load_centrality,Assortativity,Clustering,...,EMPLOY,EDUCATIO,ASMOKE,MOS,AOS,ACCULTUR,ALEVEL,ego_language,alter_language,FMIG2
count,473,473,473.0,473,473.0,473.0,473.0,473.0,473.0,473.0,...,171.0,171.0,152.0,151.0,139.0,139.0,139.0,473,473,473.0
unique,10,2,,3,,,,,,,...,,,,,,,,2,2,
top,do,sp,,Unclear,,,,,,,...,,,,,,,,es,es,
freq,154,282,,222,,,,,,,...,,,,,,,,368,368,
mean,,,-0.74317,,23.921957,0.015368,0.68761,0.014957,-0.018617,0.639269,...,1.909032,0.831709,0.276316,0.078587,0.016147,0.165787,0.197922,,,83.139535
std,,,13.524199,,13.589893,0.011334,0.217223,0.011679,0.213724,0.169064,...,7.332452,5.092979,2.239957,0.350677,0.119575,0.803888,0.740157,,,396.184672
min,,,-294.081935,,2.628571,0.0,0.130665,0.0,-0.695654,0.205819,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
25%,,,-0.299994,,12.818182,0.006953,0.532497,0.005732,-0.139165,0.505203,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
50%,,,-0.111711,,19.066667,0.014024,0.63203,0.013907,-0.004029,0.659861,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
75%,,,0.100436,,40.666667,0.020487,0.938077,0.020487,0.024567,0.770016,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0


Some values of `mu` are way out of range (min = -294). This is clearly from divergences in the model. We mark observations greater than 10 (in absolute value) as `nan`

In [4]:
# Clean estimates for mu: anything outside the closed interval [-10, 10]
# is treated as a diverged fit and replaced by NaN.
df['Mu'] = df['Mu'].where(df['Mu'].between(-10, 10))

In [5]:
# Missing values per column
df.isna().sum()

Subject_origin         0
Subject_residence      0
Mu                     1
Regime                 0
Average_degree         0
                    ... 
ACCULTUR             334
ALEVEL               334
ego_language           0
alter_language         0
FMIG2                  0
Length: 384, dtype: int64

We simply remove the observation that has a `nan` (a more sophisticated approach would be to replace its value with the median `mu` of the individuals in the same class).

In [6]:
# Drop only the observation(s) whose `Mu` was set to NaN in the cleaning step.
# A bare `dropna()` would discard every row with a NaN in *any* column, and
# many covariates (e.g. ACCULTUR, ALEVEL) are missing for most subjects, so
# most of the sample would be lost instead of a single observation.
df.dropna(subset=['Mu'], inplace=True)

In [None]:
# Pairwise relationships between all variables, coloured by origin
g = sns.pairplot(data=df, hue='Subject_origin')
plt.show()

***Notes***
<ul>
    <li> Interesting sigmoid relationship closeness origin ~ mu</li>
    <li> Presence of severe collinearities in the data (this may cause numeric problems in linear models)</li>
    <li> The conditional distributions show clear differences in some of the variables (Number origin, Average degree, clustering)</li>
    
</ul>

### Group some nationalities in `others` group

In [None]:
# Number of observations per origin (several origins have very few)
origin_indicators = pd.get_dummies(df['Subject_origin'])
count_origins = origin_indicators.sum()
print(count_origins)

<p>We keep only classes with at least 50 observations; the remaining origins are pooled into a single <code>aa-other</code> class.</p>

In [None]:
t = 50  # threshold: minimum number of observations for an origin to keep its own class
# Origins with fewer than `t` observations (according to `count_origins`)
# are pooled into a single 'aa-other' class.
# An `isin` mask is used instead of a per-row `count_origins[x]` lookup so the
# cell can be re-run safely: after the first run 'aa-other' is present in the
# column but absent from `count_origins`, and the lookup would raise KeyError.
rare_origins = count_origins.index[count_origins < t]
df['Subject_origin'] = df['Subject_origin'].mask(df['Subject_origin'].isin(rare_origins), 'aa-other')
pd.get_dummies(df['Subject_origin']).sum()

In [None]:
# Same pair plot after regrouping the origins; also written to disk
g = sns.pairplot(data=df, hue='Subject_origin')
plt.savefig('pair_plot.pdf', dpi=600)
plt.show()

In [None]:
# Mu against closeness to origin, coloured by origin
sns.scatterplot(data=df, x='Mu', y='Closeness_origin', hue='Subject_origin')
plt.savefig('sigmoid_mu_clos.pdf', dpi=600)
plt.show()

Interesting sigmoid relationship for further exploring (fitting)

# INFERENCE

Reference for interpreting results

https://stats.idre.ucla.edu/stata/dae/multinomiallogistic-regression/

In [None]:
# Show every available column name
print('List of variables in data. The target is Subject_origin')
df.columns.tolist()

**Note**: For model selection it is important to know how to justify the predictors we use. So far, I have used a somewhat ad-hoc selection. Using all predictors causes numerical errors (most likely due to collinearity). What I did was removing some until I got a model with interesting variables and no numerical errors.

In [None]:
# Hand-picked subset of predictors used by the models below
predictors = [
    'Closeness', 'Closeness_residence', 'Closeness_origin', 'Clustering',
    'Mu', 'Average_degree', 'Assortativity', 'Betweenness',
]

# (Alternative not used here: all columns from position 2 onwards,
#  i.e. excluding Subject_residence.)
plot_columns = predictors + ["Subject_origin"]
g = sns.pairplot(data=df[plot_columns], hue="Subject_origin")
# The figure is not saved here; 'pair_plot.pdf' is written by an earlier cell.
plt.show()

In [None]:
# Summary statistics of the numeric columns in the current state of df
df.describe()

### Fit Multinomial Logistic Model

https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.MNLogit.html

In [None]:
# Build the model formula from the 'predictors' list and fit the
# multinomial logit with Subject_origin as the response.
formula_predictors = ' + '.join(predictors)
formula = 'Subject_origin ~ {}'.format(formula_predictors)
model = MNLogit.from_formula(formula, df)
results = model.fit(maxiter=200)


#### Results

In [None]:
# Full coefficient table of the fitted multinomial logit
print(results.summary())

In [None]:
# Pseudo R-squared of the fitted model, rounded to two decimals
pseudo_r2 = np.round(results.prsquared, 2)
print('pseudo r-squared = {}'.format(pseudo_r2))

"<i>While the R2 index is a more familiar concept to planner who are experienced in OLS, it is not as well behaved as the rho-squared measure, for ML estimation. Those unfamiliar with rho-squared should be forewarned that its values tend to be considerably lower than those of the R2 index...For example, values of 0.2 to 0.4 for rho-squared represent EXCELLENT fit.</i>"

https://stats.stackexchange.com/questions/82105/mcfaddens-pseudo-r2-interpretation

In [None]:
# Column labels for the non-reference classes.
# Note: the reference value is 'aa-other'.
names = ['ar', 'do', 'ma', 'pu', 'se']
# Keep only coefficients with p-value < 0.05 (the rest become NaN) and
# exponentiate them to obtain odds ratios.
significant_params = results.params.where(results.pvalues < 0.05)
estimated_odds_ratios = pd.DataFrame(np.exp(significant_params.values),
                                     columns=names,
                                     index=significant_params.index)
estimated_odds_ratios # i.e. pr(cat=ar) / pr(cat=aa-other)

The values in the table above are the odds ratios for the significant predictors (p-value < 0.05). A one-unit increase in a variable ($x_1$) multiplies by that factor ($e^{\beta_1}$) the odds of the observation belonging to that class (e.g. `ar`) relative to the reference class (`aa-other`), holding the other predictors fixed. $$ \ln\left(\frac{Pr(ar)}{Pr(other)}\right) = \beta_0 + \beta_1 x_1 + \dots $$

**Note:** They can (and should) be further explored to understand the relevant factors that differentiate the nationalities

In [None]:
# p-value of the likelihood-ratio test of the fitted model against the
# model with no predictors
results.llr_pvalue

The above result is the chi-squared probability of getting a log-likelihood ratio statistic greater than llr. llr has a chi-squared distribution with degrees of freedom df_model. The likelihood ratio chi-square with a p-value ~ 0 tells us that our model as a whole fits significantly better than an empty model (i.e., a model with no predictors). See the following links for more details:

https://stats.stackexchange.com/questions/82105/mcfaddens-pseudo-r2-interpretation

https://www.statsmodels.org/devel/generated/statsmodels.discrete.discrete_model.DiscreteResults.prsquared.html#statsmodels.discrete.discrete_model.DiscreteResults.prsquared


# PREDICTION

We train and fit a powerful non-linear (and non-parametric) machine learning classifier to the data: a Random Forest. There are many other alternatives, but tree-based methods are very powerful and there are new techniques to help identify relevant predictors.

In this section, we want to test whether this model can significantly outperform other null (dummy) classifiers. If that is the case (which it is), it confirms the hypothesis that the predictors carry relevant information about the nationalities of the subjects.

### Prepare data

In [None]:
# Separate the independent variables from the target and hold out a test set.

# Same predictors as in the MNL model above.
predictors = [
    'Closeness', 'Closeness_residence', 'Closeness_origin', 'Clustering',
    'Mu', 'Average_degree', 'Assortativity', 'Betweenness',
]

X = df[predictors]        # independent variables
y = df['Subject_origin']  # target variable

test_size = 0.20  # maybe more is needed (20% is standard though)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=0
)


In [None]:
# Standard scaler: statistics are learned on the training split only and
# then applied to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

### Train and tune the model using k-cross fold validation

In [None]:
scoring = 'accuracy' #'f1_macro' # This chooses the metric to optimise during training (there are others!)
njobs=-1                         # This the number of cores used in your cpu (-1 means "all of them")
cv=5                             # the k in k-cross-fold validation
# RANDOM FOREST
print('\nFitting Random Forest\n')

rfc=RandomForestClassifier(random_state=0)
# Parameter combinations to explore (4 * 2 * 4 * 5 * 3 = 480 candidates,
# each fitted `cv` times).
# 'sqrt' replaces the former 'auto' value of max_features: for classifiers the
# two are equivalent, but 'auto' was deprecated in scikit-learn 1.1 and removed
# in 1.3, where every candidate using it fails to fit. Because warnings are
# silenced in this notebook, that failure would go unnoticed and half of the
# grid would be dropped from the search.
param_grid = { 
    'n_estimators': [75, 100,300,1000],
    'max_features': ['sqrt', None],
    'min_samples_split' :[2,6, 10, 14],
    'max_depth' : [10, 15, 30, 50,None],
    'max_samples' : [0.5 ,0.7, None],}


# Exhaustive search over param_grid with k-fold cross-validation on the
# training split; the best estimator is refit on the whole training split.
CV_rfc = GridSearchCV(estimator=rfc, 
                  param_grid=param_grid, 
                  scoring = scoring,
                  verbose=1,
                  n_jobs=njobs,
                  cv= cv)
CV_rfc.fit(X_train, y_train)

print('\nRandom Forest:')
print('Best Score: ', CV_rfc.best_score_)
print('Best Params: ', CV_rfc.best_params_)



### Evaluating the algorithm performance in the test set (unseen data)

In [None]:
# Predictions of the tuned forest on the held-out split, with summary metrics
y_pred = CV_rfc.predict(X_test)
rf_confusion = confusion_matrix(y_test, y_pred)
rf_accuracy = accuracy_score(y_test, y_pred)
print('Confusion Matrix:\n ', rf_confusion, '\n')
print(classification_report(y_test, y_pred), '\n')
print('Accuracy: {0:.2f}'.format(rf_accuracy))


### Compare this performance with  null models

In [None]:
# Share of the whole sample that belongs to each class
rel_prev = y.value_counts().div(len(y))
print(rel_prev)

In [None]:
# Uniform Dummy Classifier (classifies randomly with p = 1/K, K = number of classes)

# K is taken from the data instead of being hard-coded, so the figure stays
# correct if the grouping threshold (and hence the number of classes) changes.
n_classes = y.nunique()
# If the classifier randomly guesses, its expected accuracy is
# sum_c (1/K) * prevalence_c = 1/K
print('Acurracy of uniform dummy classifier: ',(((1/n_classes) * y.value_counts()) / len(y)).sum())

In [None]:
# Stratified Dummy Classifier (classifies randomly with p ~ prevalence of each class)
class_counts = y.value_counts()
stratified_accuracy = (rel_prev * class_counts).sum() / len(y)
print('Acurracy of stratified dummy classifier: ', stratified_accuracy)

In [None]:
# Most frequent Dummy Classifier (always predicts the most frequent class),
# so its accuracy equals the largest class prevalence
most_frequent_accuracy = rel_prev.max()
print('Acurracy of Most freq dummy classifier: ', most_frequent_accuracy)

In [None]:
# scikit-learn versions of the dummy classifiers (to double check the numbers
# above and for their convenience methods)

dummy = "stratified"  # one of: most_frequent, stratified, uniform
dummy_clf = DummyClassifier(random_state=0, strategy=dummy)

# Accuracy of the dummy on the very same train-test split as the RF model.
# (Averaging the score over many refits on the full data would instead
# estimate the expected accuracy of the null model.)
dummy_clf.fit(X_train, y_train)
dummy_score = dummy_clf.score(X_test, y_test)
print('Mean accuracy of null ' + dummy +' model: {0:.2f}'.format(dummy_score),'\n')
rf_test_score = CV_rfc.score(X_test, y_test)
print('Mean accuracy (in test) of RF model: {0:.2f}'.format(rf_test_score),'\n')




In [None]:
# Confusion matrix and classification report of the selected dummy classifier

y_pred_dummy = dummy_clf.predict(X_test)
dummy_confusion = confusion_matrix(y_test, y_pred_dummy)
dummy_accuracy = accuracy_score(y_test, y_pred_dummy)
print('Confusion Matrix:\n\n ', dummy_confusion, '\n')
print(classification_report(y_test, y_pred_dummy), '\n')
print('Accuracy: {0:.2f}'.format(dummy_accuracy))

In [None]:
# For side-by-side reference: the same report for the RF model

y_pred = CV_rfc.predict(X_test)
rf_confusion = confusion_matrix(y_test, y_pred)
rf_accuracy = accuracy_score(y_test, y_pred)
print('Confusion Matrix:\n\n ', rf_confusion, '\n')
print(classification_report(y_test, y_pred), '\n')
print('Accuracy: {0:.2f}'.format(rf_accuracy))

In [None]:
# Classification reports as DataFrames (metrics as rows, classes and
# aggregates as columns) so the two models can be compared arithmetically
dummy_metrics = classification_report(y_test, dummy_clf.predict(X_test), output_dict=True)
dummy_report = pd.DataFrame(dummy_metrics)

rfc_metrics = classification_report(y_test, CV_rfc.predict(X_test), output_dict=True)
rfc_report = pd.DataFrame(rfc_metrics)

In [None]:
# Metrics of the dummy classifier on the test split
dummy_report

In [None]:
# Metrics of the tuned random forest on the test split
rfc_report

#### Increase in prediction power (percentage with respect to null model)

i.e. 100% means twice as good

In [None]:
# Relative improvement of the RF over the dummy, in percent of the dummy's
# value; the 'support' row is not a performance metric and is removed.
relative_gain = rfc_report.sub(dummy_report).mul(100).div(dummy_report)
final_table = relative_gain.drop(index='support').round(2)
final_table

These significant increases further support the claim that the predictors (based on ego-network properties) carry useful information to predict the countries of origin of the individuals.

# Future work (ideas)

<ul>
    <li>Learning curves: could explore how getting more data would impact (improve) prediction results</li>
    <li>We could train other models (neural networks, SVM, etc) but I think the point has been made</li>
    <li>Feature engineering. We should focus on feedback from Jose Luis to see which features are more interesting to explore. The analyses I present here focus on a somewhat arbitrary choice.</li>
    <li>Feature importance and interpretation: Random Forests are amenable to analysing the relative importance of each of the features for each of the labels (see SHAP values, for example). This, together with the results from the multinomial LR, may help to explain the results (and build a narrative)</li>
        
</ul>

In [None]:
# Display the working DataFrame in its final state
df