<a href="https://colab.research.google.com/github/KiraFitzge/IWS2023/blob/main/Statistical_Comparisons.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Data Processing
import pandas as pd
import numpy as np
from numpy import mean

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.linear_model import LogisticRegression
from sklearn import svm

# Example of calculating the mcnemar test
from statsmodels.stats.contingency_tables import mcnemar

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

# Graphing
import matplotlib.pyplot as plt

# for data balancing
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Cross-validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

In [None]:
# Data cleaning: load the BEST dataset, discard the columns that are not used
# as features, and encode the education level as an integer code.
best = pd.read_csv('BEST.csv').drop(
    columns=['PARTICIPANT.NUMBER', 'GENDER', 'HANDEDNESS']
)

# Levels missing from this mapping become NaN (behaviour of Series.map).
best['MAXIMUM.EDUCATIONAL.LEVEL'] = best['MAXIMUM.EDUCATIONAL.LEVEL'].map({
    'High School': 0,
    'University': 1,
    'Postgraduate': 2,
    'Professional Training': 3,
})

# Preview the first ten rows of the cleaned frame.
best.head(10)

# print(best['MAXIMUM.EDUCATIONAL.LEVEL'].unique())
# TODO: create follow-up datasets that separate English from the other
# languages, and work out the best number of features to use.

Unnamed: 0,AGE.IN.YEARS,MAXIMUM.EDUCATIONAL.LEVEL,PERCENTAGE.EXPOSED.SPANISH,PERCENTAGE.EXPOSED.BASQUE,PERCENTAGE.EXPOSED.ENGLISH,SELF.PERCEIVED.LEVEL.SPANISH,SELF.PERCEIVED.LEVEL.BASQUE,SELF.PERCEIVED.LEVEL.ENGLISH,INTERVIEW.MARK.SPANISH,INTERVIEW.MARK.BASQUE,INTERVIEW.MARK.ENGLISH,AGE.OF.ACQUISITION.SPANISH,AGE.OF.ACQUISITION.BASQUE,AGE.OF.ACQUISITION.ENGLISH,PICTURE.NAMING.TEST.SPANISH,PICTURE.NAMING.TEST.BASQUE,PICTURE.NAMING.TEST.ENGLISH,LEXTALE.TEST.SPANISH,LEXTALE.TEST.BASQUE,LEXTALE.TEST.ENGLISH
0,22,0,40,40,20,10,9,5,5,5,3,0,3,5,63,52,33,88.33,89,58.75
1,21,1,70,20,10,10,9,7,5,5,4,0,0,5,65,57,59,85.0,92,72.5
2,30,2,20,70,10,9,10,5,5,5,4,6,0,8,65,65,50,93.33,96,78.75
3,21,1,90,10,0,9,7,3,5,2,2,0,3,3,65,45,39,92.5,88,67.5
4,39,3,50,40,10,9,8,7,5,5,3,0,0,12,65,64,54,84.17,73,60.0
5,20,1,100,0,0,8,7,2,5,4,3,0,6,6,65,57,31,90.83,83,61.25
6,23,1,60,30,10,10,10,7,5,5,3,4,0,6,64,64,57,98.33,89,66.25
7,24,1,50,30,10,9,9,7,5,5,4,0,0,8,65,63,56,98.33,98,76.25
8,36,2,30,60,10,9,9,7,5,5,4,5,0,11,65,65,60,98.33,96,75.0
9,25,1,30,60,10,8,10,7,5,5,3,6,0,6,62,65,50,94.17,97,71.25


In [None]:
# Create the initial binary mappings of early vs. late acquisition (cutoffs 3, 5 and 8)

def earlyClassifier3(n):
  """Binarise an age of acquisition with a cutoff of 3.

  Returns 0 when ``n <= 3`` and 1 for every other value (including values
  for which the comparison is false, such as NaN).
  """
  return 0 if n <= 3 else 1

def earlyClassifier5(n):
  """Binarise an age of acquisition with a cutoff of 5.

  Returns 0 when ``n <= 5`` and 1 for every other value (including values
  for which the comparison is false, such as NaN).
  """
  return 0 if n <= 5 else 1


def earlyClassifier8(n):
  """Binarise an age of acquisition with a cutoff of 8.

  Returns 0 when ``n <= 8`` and 1 for every other value (including values
  for which the comparison is false, such as NaN).
  """
  return 0 if n <= 8 else 1



# Binary early (0) / late (1) labels for the English age of acquisition, one
# list per cutoff (3, 5 and 8).
binaryAcquisition3 = [earlyClassifier3(age) for age in best['AGE.OF.ACQUISITION.ENGLISH']]
binaryAcquisition5 = [earlyClassifier5(age) for age in best['AGE.OF.ACQUISITION.ENGLISH']]
binaryAcquisition8 = [earlyClassifier8(age) for age in best['AGE.OF.ACQUISITION.ENGLISH']]

# Feature frame restricted to the demographic columns and the English-only
# measures. The age of acquisition itself is left out because the labels above
# are derived from it.
EnglishOnly = pd.DataFrame({
    'AGE': best['AGE.IN.YEARS'],
    'EDUCATION': best['MAXIMUM.EDUCATIONAL.LEVEL'],
    'EXPOSURE': best['PERCENTAGE.EXPOSED.ENGLISH'],
    'SELF.PERCEIVED': best['SELF.PERCEIVED.LEVEL.ENGLISH'],
    'INTERVIEW': best['INTERVIEW.MARK.ENGLISH'],
    'PICTURE.NAMING': best['PICTURE.NAMING.TEST.ENGLISH'],
    'LEXTALE.TEST': best['LEXTALE.TEST.ENGLISH'],
})

# Balance the classes by randomly duplicating minority-class rows. The sampler
# is seeded so the resampled data (and everything trained on it) is
# reproducible, in line with the seeded splits below.
# NOTE(review): resampling happens before the train/test split, so a duplicated
# minority row can end up in both the training and the test partition and
# inflate the test scores. Consider splitting first and resampling only the
# training portion.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=1)

# Fit and apply the transform once per cutoff.
X_over3, y_over3 = oversample.fit_resample(EnglishOnly, binaryAcquisition3)
X_over5, y_over5 = oversample.fit_resample(EnglishOnly, binaryAcquisition5)
X_over8, y_over8 = oversample.fit_resample(EnglishOnly, binaryAcquisition8)

# Stratified 80/20 train/test split for each cutoff.
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_over3, y_over3, test_size=0.2, random_state = 1, stratify = y_over3)
X_train5, X_test5, y_train5, y_test5 = train_test_split(X_over5, y_over5, test_size=0.2, random_state = 1, stratify = y_over5)
X_train8, X_test8, y_train8, y_test8 = train_test_split(X_over8, y_over8, test_size=0.2, random_state = 1, stratify = y_over8)


In [None]:
# Model comparison at the age-5 cutoff: a random forest, a linear SVM and a
# logistic regression are each trained on the training split and scored on the
# held-out test split, first with default settings and then after a randomized
# hyper-parameter search. Pairwise exact McNemar tests then compare which test
# samples each model classified correctly.


def _report_metrics(y_true, y_pred):
  """Print accuracy, precision and recall for one set of predictions.

  Returns the scores as an ``(accuracy, precision, recall)`` tuple.
  """
  acc = accuracy_score(y_true, y_pred)
  prec = precision_score(y_true, y_pred)
  rec = recall_score(y_true, y_pred)
  print("Accuracy:", acc)
  print("Precision:", prec)
  print("Recall:", rec)
  return acc, prec, rec


def _mcnemar_report(label, correct_a, correct_b):
  """Print the correctness contingency table and the exact McNemar result.

  ``correct_a`` and ``correct_b`` are per-sample booleans saying whether each
  of the two models classified that test sample correctly.

  Returns the ``(table, result)`` pair.
  """
  table = pd.crosstab(correct_a, correct_b)
  # Force a full 2x2 table: if a model is right (or wrong) on every sample,
  # crosstab omits the empty row/column and mcnemar cannot read the
  # off-diagonal cells.
  table = table.reindex(index=[False, True], columns=[False, True], fill_value=0)
  print(table)
  outcome = mcnemar(table, exact=True)
  print(label)
  print('statistic=%.3f, p-value=%.3f' % (outcome.statistic, outcome.pvalue))
  return table, outcome


X_train5, X_test5, y_train5, y_test5 = train_test_split(X_over5, y_over5, test_size=0.2, random_state = 1, stratify = y_over5)
# No longer used in this cell; retained so any existing reference to `cv`
# keeps working.
cv = KFold(n_splits=5, random_state=1, shuffle=True)

# --- Random forest --------------------------------------------------------
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train5, y_train5)
# Score the model that was just fitted. cross_val_predict on the test split
# would clone and refit on test folds, ignoring the fit above and making the
# baseline incomparable with the tuned model.
y_pred5_rf = rf.predict(X_test5)
accuracy, precision5, recall5 = _report_metrics(y_test5, y_pred5_rf)
y_correct_rf = y_pred5_rf == np.asarray(y_test5)

param_grid = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

# Randomized search over the forest size and depth.
rand_search = RandomizedSearchCV(rf,
                                 param_distributions = param_grid,
                                 n_iter=15,
                                 cv=5,
                                 random_state=1)
rand_search.fit(X_train5, y_train5)
best_rf = rand_search.best_estimator_
# Evaluate the tuned forest on the held-out test split.
y_pred5_rf_opt = best_rf.predict(X_test5)
accuracy, precision5, recall5 = _report_metrics(y_test5, y_pred5_rf_opt)
y_correct_rf_opt = y_pred5_rf_opt == np.asarray(y_test5)

# --- Support vector machine -----------------------------------------------
clf = svm.SVC(kernel='linear')
clf.fit(X_train5, y_train5)
y_pred5_clf = clf.predict(X_test5)
y_correct_clf = y_pred5_clf == np.asarray(y_test5)
accuracy, precision5, recall5 = _report_metrics(y_test5, y_pred5_clf)

# NOTE: `gamma` only affects the 'rbf', 'poly' and 'sigmoid' kernels, so with
# kernel='linear' this search effectively tunes `C` alone.
space = dict()
space['gamma'] = [1, 0.1, 0.01, 0.001]
space['C'] = [0.1, 1, 10, 100]

rand_search = RandomizedSearchCV(clf, space,
                                 n_iter= 10,
                                 cv=3, n_jobs = -1,
                                 random_state=1)
rand_search.fit(X_train5, y_train5)
best_clf = rand_search.best_estimator_
# Evaluate the tuned SVM on the held-out test split.
y_pred5_clf_opt = best_clf.predict(X_test5)
y_correct_clf_opt = y_pred5_clf_opt == np.asarray(y_test5)
accuracy, precision5, recall5 = _report_metrics(y_test5, y_pred5_clf_opt)

# --- Logistic regression --------------------------------------------------
lr = LogisticRegression(max_iter=3000)
lr.fit(X_train5, y_train5)
y_pred5_lr = lr.predict(X_test5)
y_correct_lr = y_pred5_lr == np.asarray(y_test5)
accuracy, precision5, recall5 = _report_metrics(y_test5, y_pred5_lr)

# 3 solvers x 1 penalty x 5 C values = 15 candidates, so n_iter=15 covers the
# whole grid.
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['l2']
space['C'] = [100, 10, 1.0, 0.1, 0.01]

rand_search = RandomizedSearchCV(lr, space,
                                 n_iter=15,
                                 cv=5,
                                 random_state=1)
rand_search.fit(X_train5, y_train5)
best_lr = rand_search.best_estimator_
# Evaluate the tuned logistic regression on the held-out test split.
y_pred5_lr_opt = best_lr.predict(X_test5)
y_correct_lr_opt = y_pred5_lr_opt == np.asarray(y_test5)
accuracy, precision5, recall5 = _report_metrics(y_test5, y_pred5_lr_opt)

# --- McNemar tests: tuned models ------------------------------------------
data_crosstab, result = _mcnemar_report("SVM and Random Forest", y_correct_rf_opt, y_correct_clf_opt)
data_crosstab, result = _mcnemar_report("LR and Random Forest", y_correct_rf_opt, y_correct_lr_opt)
data_crosstab, result = _mcnemar_report("LR and SVM", y_correct_clf_opt, y_correct_lr_opt)

# --- McNemar tests: without optimization ----------------------------------
data_crosstab, result = _mcnemar_report("SVM and Random Forest", y_correct_rf, y_correct_clf)
data_crosstab, result = _mcnemar_report("LR and Random Forest", y_correct_rf, y_correct_lr)
data_crosstab, result = _mcnemar_report("LR and SVM", y_correct_clf, y_correct_lr)

Accuracy: 0.6646341463414634
Precision: 0.6956521739130435
Recall: 0.5853658536585366
Accuracy: 0.7926829268292683
Precision: 0.8076923076923077
Recall: 0.7682926829268293
Accuracy: 0.6097560975609756
Precision: 0.65
Recall: 0.47560975609756095
Accuracy: 0.6951219512195121
Precision: 0.7857142857142857
Recall: 0.5365853658536586
Accuracy: 0.6280487804878049
Precision: 0.647887323943662
Recall: 0.5609756097560976
Accuracy: 0.7073170731707317
Precision: 0.7361111111111112
Recall: 0.6463414634146342
col_0  False  True 
row_0              
False     21     13
True      29    101
SVM and Random Forest
statistic=13.000, p-value=0.020
col_0  False  True 
row_0              
False     23     11
True      25    105
LR and Random Forest
statistic=11.000, p-value=0.029
col_0  False  True 
row_0              
False     41      9
True       7    107
LR and SVM
statistic=7.000, p-value=0.804
col_0  False  True 
row_0              
False     44     11
True      20     89
SVM and Random Forest
statist