# <span style="color: green">TP1 : Apprentissage supervisé</span>

# Selection des données 


In [107]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

dir = ""
featuresFileName = "acsincome_ca_features.csv"
labelsFileName = "acsincome_ca_labels.csv"

features = pd.read_csv(dir+featuresFileName)
labels = pd.read_csv(dir+labelsFileName)


# Standardisation des données

In [108]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_all = sc.fit_transform(features)

# Séparation en train set et test set

In [109]:
X_all, y_all = shuffle(features,labels,random_state = 1)

num_samples = int(len(X_all)*0.01)
X, y = X_all[:num_samples], y_all[:num_samples]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Fonctions pour l'evaluation des models

In [35]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score


def displayAcuracyScore(model, X_train, y_train, X_test, y_test):
    print("Train score : ",model.score(X_train,y_train))
    print("Test score : ",model.score(X_test,y_test))

def displayClassificationReport(model, X_test, y_test):
    y_pred = model.predict(X_test)
    classificationReport = classification_report(y_test,y_pred)
    print("Classification report :\n",classificationReport)

def displayConfusionMatrix(model, X_test, y_test):
    y_pred = model.predict(X_test)
    confusionMatrix = confusion_matrix(y_test,y_pred)
    print("Confusion Matrix :\n ",confusionMatrix)

def displayCrossValidationScore(model, X, y):
    scores = cross_val_score(model, X, np.ravel(y))
    print("Cross-Validation Scores :\n", scores)
    print("Mean Cross-Validation Score :\n", scores.mean())
    print("Standard Deviation of Cross-Validation Scores :\n", scores.std())

# Cas direct du RandomForest et des SVM

### Model Random Forest Classifier 

In [15]:
# import the regressor
from sklearn.ensemble import RandomForestClassifier

In [16]:

# create regressor object
classifier = RandomForestClassifier(n_estimators=110,bootstrap=True)
 
# fit the regressor with x and y data
classifier.fit(X_train, np.ravel(y_train))

### Evaluation 

In [17]:
# Accuracy
displayAcuracyScore(classifier, X_train, y_train, X_test, y_test)

# Classification_report
displayClassificationReport(classifier, X_test, y_test)

# Confusion matrix
displayConfusionMatrix(classifier, X_test, y_test)

# Cross-Validation
displayCrossValidationScore(classifier, X_train, y_train)

Train score :  1.0
Test score :  0.7882653061224489
Classification report :
               precision    recall  f1-score   support

       False       0.83      0.80      0.82       230
        True       0.73      0.77      0.75       162

    accuracy                           0.79       392
   macro avg       0.78      0.79      0.78       392
weighted avg       0.79      0.79      0.79       392

Confusion Matrix :
  [[184  46]
 [ 37 125]]
Cross-Validation Scores :
 [0.79233227 0.78913738 0.79552716 0.80191693 0.75320513]
Mean Cross-Validation Score :
 0.7864237732448595
Standard Deviation of Cross-Validation Scores :
 0.017138622695581494


### Model SVM Classifer

In [21]:
from sklearn import svm

In [29]:
svmClassifierLinear = svm.SVC(kernel = 'linear', random_state = 111, C=1.0)
svmClassifierPoly = svm.SVC(kernel ='poly', random_state = 111, C=10, degree=2)
svmClassifierRbf = svm.SVC(kernel = 'rbf', random_state = 111, C=1.0)
svmClassifierSig = svm.SVC(kernel = 'sigmoid', random_state = 111, C=1.0)

In [30]:
svmClassifierSig.fit(X_train, np.ravel(y_train))
svmClassifierLinear.fit(X_train, np.ravel(y_train))
svmClassifierPoly.fit(X_train, np.ravel(y_train))
svmClassifierRbf.fit(X_train, np.ravel(y_train))

In [37]:
# Sigmoid Kernel
print("Sigmoid Kernel")
displayAcuracyScore(svmClassifierSig, X_train, y_train, X_test, y_test)
displayClassificationReport(svmClassifierSig, X_test, y_test)
displayConfusionMatrix(svmClassifierSig, X_test, y_test)
displayCrossValidationScore(svmClassifierSig, X_train, y_train)

Sigmoid Kernel
Train score :  0.571611253196931
Test score :  0.5663265306122449
Classification report :
               precision    recall  f1-score   support

       False       0.64      0.61      0.62       230
        True       0.48      0.51      0.49       162

    accuracy                           0.57       392
   macro avg       0.56      0.56      0.56       392
weighted avg       0.57      0.57      0.57       392

Confusion Matrix :
  [[140  90]
 [ 80  82]]
Cross-Validation Scores :
 [0.54313099 0.58146965 0.57188498 0.58785942 0.69551282]
Mean Cross-Validation Score :
 0.5959715736872286
Standard Deviation of Cross-Validation Scores :
 0.05207084338012114


In [38]:
# Linear Kernel
print("Linear Kernel")
displayAcuracyScore(svmClassifierLinear, X_train, y_train, X_test, y_test)
displayClassificationReport(svmClassifierLinear, X_test, y_test)
displayConfusionMatrix(svmClassifierLinear, X_test, y_test)
displayCrossValidationScore(svmClassifierLinear, X_train, y_train)

Linear Kernel
Train score :  0.7832480818414322
Test score :  0.7627551020408163
Classification report :
               precision    recall  f1-score   support

       False       0.75      0.89      0.81       230
        True       0.79      0.59      0.67       162

    accuracy                           0.76       392
   macro avg       0.77      0.74      0.74       392
weighted avg       0.77      0.76      0.76       392

Confusion Matrix :
  [[204  26]
 [ 67  95]]
Cross-Validation Scores :
 [0.77635783 0.76357827 0.77955272 0.77955272 0.75961538]
Mean Cross-Validation Score :
 0.7717313836323421
Standard Deviation of Cross-Validation Scores :
 0.00845010139952901


In [39]:
# Poly Kernel
print("Poly Kernel")
displayAcuracyScore(svmClassifierPoly, X_train, y_train, X_test, y_test)
displayClassificationReport(svmClassifierPoly, X_test, y_test)
displayConfusionMatrix(svmClassifierPoly, X_test, y_test)
displayCrossValidationScore(svmClassifierPoly, X_train, y_train)

Poly Kernel
Train score :  0.7199488491048593
Test score :  0.7117346938775511
Classification report :
               precision    recall  f1-score   support

       False       0.76      0.73      0.75       230
        True       0.64      0.68      0.66       162

    accuracy                           0.71       392
   macro avg       0.70      0.71      0.71       392
weighted avg       0.71      0.71      0.71       392

Confusion Matrix :
  [[169  61]
 [ 52 110]]
Cross-Validation Scores :
 [0.69648562 0.69968051 0.74121406 0.68051118 0.74679487]
Mean Cross-Validation Score :
 0.7129372491193577
Standard Deviation of Cross-Validation Scores :
 0.02624409811851844


In [40]:
# Rbf Kernel
print("Rbf Kernel")
displayAcuracyScore(svmClassifierRbf, X_train, y_train, X_test, y_test)
displayClassificationReport(svmClassifierRbf, X_test, y_test)
displayConfusionMatrix(svmClassifierRbf, X_test, y_test)
displayCrossValidationScore(svmClassifierRbf, X_train, y_train)

Rbf Kernel
Train score :  0.7199488491048593
Test score :  0.7066326530612245
Classification report :
               precision    recall  f1-score   support

       False       0.72      0.83      0.77       230
        True       0.69      0.53      0.60       162

    accuracy                           0.71       392
   macro avg       0.70      0.68      0.68       392
weighted avg       0.70      0.71      0.70       392

Confusion Matrix :
  [[191  39]
 [ 76  86]]
Cross-Validation Scores :
 [0.72204473 0.71565495 0.73801917 0.69968051 0.73076923]
Mean Cross-Validation Score :
 0.7212337183583191
Standard Deviation of Cross-Validation Scores :
 0.013183311203030594


# GridSeachCV

### RandomForest

In [12]:
from sklearn.model_selection import GridSearchCV 

In [23]:
param_gridRF = {  
    'n_estimators': [100,110,200],
    'max_depth': [None,10, 20, 30, 40,],
    'max_features': ['log2', 'sqrt'],
     'min_samples_split': [2, 5, 10],
    'bootstrap' : [True, False]
}
modelRF = RandomForestClassifier()
RFGridSeach = GridSearchCV(modelRF,param_gridRF)
RFGridSeach.fit(X_train, np.ravel(y_train))

In [24]:
print("Best Random Forest Estimator \n", RFGridSeach.best_estimator_)

Best Random Forest Estimator 
 RandomForestClassifier(max_features='log2', min_samples_split=10,
                       n_estimators=200)


### SVM

In [9]:
param_gridSVM = {
    #'C': [0.1, 1,2],  
    #'gamma': [1, 0.1, 0.01], 
    'kernel': ['rbf','sigmoid','poly','linear']
}  
modelSVM = svm.SVC()
SVMGrid = GridSearchCV(modelSVM, param_gridSVM) 
SVMGrid.fit(X_train, np.ravel(y_train))

In [11]:
print("Best Random SVM Estimator\n", SVMGrid.best_estimator_)

Best Random SVM Estimator
 SVC(kernel='linear')


### AdaBoost

In [45]:
from sklearn.ensemble import AdaBoostClassifier

In [10]:
param_gridAB = {
    'n_estimators': [10, 20, 30, 50],
    'learning_rate': [0.01, 0.1]
}
modelAB = AdaBoostClassifier()
ABGrid = GridSearchCV(modelAB,param_gridAB)
ABGrid.fit(X_train, np.ravel(y_train))

In [11]:
print("Best Random AdaBoost Estimator\n", ABGrid.best_estimator_)

Best Random AdaBoost Estimator
 AdaBoostClassifier(learning_rate=0.1)


### GradientBoosting

In [44]:
from sklearn.ensemble import GradientBoostingClassifier

In [13]:
param_gridGB = {
    "min_samples_split": np.linspace(0.1, 0.5, 4),
    "min_samples_leaf": np.linspace(0.1, 0.5, 4),
    "max_depth":[3,5,8],
    "n_estimators":[8,10,12,14]
    }
modelGB = GradientBoostingClassifier()
GBGrid = GridSearchCV(modelGB,param_gridGB)
GBGrid.fit(X_train, np.ravel(y_train))

In [14]:
print("Best Random GradientBoosting Estimator\n", GBGrid.best_estimator_)

Best Random GradientBoosting Estimator
 GradientBoostingClassifier(max_depth=5, min_samples_leaf=0.1,
                           min_samples_split=0.1, n_estimators=14)


# Comparing the Best Classifer Found for Each Model

In [41]:
# Best random forest classifier

bestRandomForestClassifier = RandomForestClassifier(max_features='log2', min_samples_split=10, n_estimators=200)
bestRandomForestClassifier.fit(X_train, np.ravel(y_train))

displayAcuracyScore(bestRandomForestClassifier, X_train, y_train, X_test, y_test)
displayClassificationReport(bestRandomForestClassifier, X_test, y_test)
displayConfusionMatrix(bestRandomForestClassifier, X_test, y_test)
displayCrossValidationScore(bestRandomForestClassifier, X_train, y_train)

Train score :  0.928388746803069
Test score :  0.7831632653061225
Classification report :
               precision    recall  f1-score   support

       False       0.82      0.80      0.81       230
        True       0.73      0.75      0.74       162

    accuracy                           0.78       392
   macro avg       0.78      0.78      0.78       392
weighted avg       0.78      0.78      0.78       392

Confusion Matrix :
  [[185  45]
 [ 40 122]]
Cross-Validation Scores :
 [0.7827476  0.77635783 0.82108626 0.8115016  0.76282051]
Mean Cross-Validation Score :
 0.7909027607110674
Standard Deviation of Cross-Validation Scores :
 0.021918166605559338


In [42]:
# Best SVM classifier

bestSVMClassifier = svm.SVC(kernel='linear')
bestSVMClassifier.fit(X_train, np.ravel(y_train))

displayAcuracyScore(bestSVMClassifier, X_train, y_train, X_test, y_test)
displayClassificationReport(bestSVMClassifier, X_test, y_test)
displayConfusionMatrix(bestSVMClassifier, X_test, y_test)
displayCrossValidationScore(bestSVMClassifier, X_train, y_train)

Train score :  0.7832480818414322
Test score :  0.7627551020408163
Classification report :
               precision    recall  f1-score   support

       False       0.75      0.89      0.81       230
        True       0.79      0.59      0.67       162

    accuracy                           0.76       392
   macro avg       0.77      0.74      0.74       392
weighted avg       0.77      0.76      0.76       392

Confusion Matrix :
  [[204  26]
 [ 67  95]]
Cross-Validation Scores :
 [0.77635783 0.76357827 0.77955272 0.77955272 0.75961538]
Mean Cross-Validation Score :
 0.7717313836323421
Standard Deviation of Cross-Validation Scores :
 0.00845010139952901


In [46]:
# Best AdaBoost classifier

bestAdaBoostClassifier = AdaBoostClassifier(learning_rate=0.1)
bestAdaBoostClassifier.fit(X_train, np.ravel(y_train))

displayAcuracyScore(bestAdaBoostClassifier, X_train, y_train, X_test, y_test)
displayClassificationReport(bestAdaBoostClassifier, X_test, y_test)
displayConfusionMatrix(bestAdaBoostClassifier, X_test, y_test)
displayCrossValidationScore(bestAdaBoostClassifier, X_train, y_train)

Train score :  0.8005115089514067
Test score :  0.7882653061224489
Classification report :
               precision    recall  f1-score   support

       False       0.79      0.87      0.83       230
        True       0.79      0.67      0.72       162

    accuracy                           0.79       392
   macro avg       0.79      0.77      0.78       392
weighted avg       0.79      0.79      0.78       392

Confusion Matrix :
  [[201  29]
 [ 54 108]]
Cross-Validation Scores :
 [0.79872204 0.76357827 0.80511182 0.78913738 0.79487179]
Mean Cross-Validation Score :
 0.7902842631277137
Standard Deviation of Cross-Validation Scores :
 0.014329153750333194


In [47]:
# Best GradientBoosting classifier

bestGradientBoostingClassifier = GradientBoostingClassifier(max_depth=5, min_samples_leaf=0.1, min_samples_split=0.1, n_estimators=14)
bestGradientBoostingClassifier.fit(X_train, np.ravel(y_train))

displayAcuracyScore(bestGradientBoostingClassifier, X_train, y_train, X_test, y_test)
displayClassificationReport(bestGradientBoostingClassifier, X_test, y_test)
displayConfusionMatrix(bestGradientBoostingClassifier, X_test, y_test)
displayCrossValidationScore(bestGradientBoostingClassifier, X_train, y_train)

Train score :  0.8056265984654731
Test score :  0.7831632653061225
Classification report :
               precision    recall  f1-score   support

       False       0.81      0.82      0.82       230
        True       0.74      0.73      0.74       162

    accuracy                           0.78       392
   macro avg       0.78      0.78      0.78       392
weighted avg       0.78      0.78      0.78       392

Confusion Matrix :
  [[189  41]
 [ 44 118]]
Cross-Validation Scores :
 [0.79872204 0.77955272 0.8115016  0.78594249 0.79807692]
Mean Cross-Validation Score :
 0.7947591545834358
Standard Deviation of Cross-Validation Scores :
 0.011099466678993844


# Evaluation sur le model Nevada et Colorado 

### Nevada

In [48]:
dir = "TP2data/"
featuresFileNameNe = "acsincome_ne_allfeaturesTP2.csv"
labelsFileNameNe = "acsincome_ne_labelTP2.csv"

featuresNe = pd.read_csv(dir+featuresFileNameNe) 
labelsNe = pd.read_csv(dir+labelsFileNameNe)

X_all = sc.fit_transform(featuresNe)

X_all, y_all = shuffle(featuresNe, labelsNe, random_state = 1)

num_samples = 392
X_test, y_test = X_all[:num_samples], y_all[:num_samples]

In [49]:
displayAcuracyScore(bestRandomForestClassifier, X_train, y_train, X_test, y_test)
displayClassificationReport(bestRandomForestClassifier, X_test, y_test)
displayConfusionMatrix(bestRandomForestClassifier, X_test, y_test)
displayCrossValidationScore(bestRandomForestClassifier, X_train, y_train)

Train score :  0.928388746803069
Test score :  0.7448979591836735
Classification report :
               precision    recall  f1-score   support

           0       0.84      0.73      0.78       242
           1       0.64      0.77      0.70       150

    accuracy                           0.74       392
   macro avg       0.74      0.75      0.74       392
weighted avg       0.76      0.74      0.75       392

Confusion Matrix :
  [[176  66]
 [ 34 116]]
Cross-Validation Scores :
 [0.78913738 0.77316294 0.82108626 0.80191693 0.77884615]
Mean Cross-Validation Score :
 0.7928299336446301
Standard Deviation of Cross-Validation Scores :
 0.01718700300671231


In [50]:
displayAcuracyScore(bestSVMClassifier, X_train, y_train, X_test, y_test)
displayClassificationReport(bestSVMClassifier, X_test, y_test)
displayConfusionMatrix(bestSVMClassifier, X_test, y_test)
displayCrossValidationScore(bestSVMClassifier, X_train, y_train)

Train score :  0.7832480818414322
Test score :  0.7474489795918368
Classification report :
               precision    recall  f1-score   support

           0       0.77      0.84      0.80       242
           1       0.70      0.59      0.64       150

    accuracy                           0.75       392
   macro avg       0.74      0.72      0.72       392
weighted avg       0.74      0.75      0.74       392

Confusion Matrix :
  [[204  38]
 [ 61  89]]
Cross-Validation Scores :
 [0.77635783 0.76357827 0.77955272 0.77955272 0.75961538]
Mean Cross-Validation Score :
 0.7717313836323421
Standard Deviation of Cross-Validation Scores :
 0.00845010139952901


In [51]:
displayAcuracyScore(bestAdaBoostClassifier, X_train, y_train, X_test, y_test)
displayClassificationReport(bestAdaBoostClassifier, X_test, y_test)
displayConfusionMatrix(bestAdaBoostClassifier, X_test, y_test)
displayCrossValidationScore(bestAdaBoostClassifier, X_train, y_train)

Train score :  0.8005115089514067
Test score :  0.7448979591836735
Classification report :
               precision    recall  f1-score   support

           0       0.77      0.84      0.80       242
           1       0.70      0.59      0.64       150

    accuracy                           0.74       392
   macro avg       0.73      0.71      0.72       392
weighted avg       0.74      0.74      0.74       392

Confusion Matrix :
  [[204  38]
 [ 62  88]]
Cross-Validation Scores :
 [0.79872204 0.76357827 0.80511182 0.78913738 0.79487179]
Mean Cross-Validation Score :
 0.7902842631277137
Standard Deviation of Cross-Validation Scores :
 0.014329153750333194


In [52]:
displayAcuracyScore(bestGradientBoostingClassifier, X_train, y_train, X_test, y_test)
displayClassificationReport(bestGradientBoostingClassifier, X_test, y_test)
displayConfusionMatrix(bestGradientBoostingClassifier, X_test, y_test)
displayCrossValidationScore(bestGradientBoostingClassifier, X_train, y_train)

Train score :  0.8056265984654731
Test score :  0.7346938775510204
Classification report :
               precision    recall  f1-score   support

           0       0.80      0.76      0.78       242
           1       0.64      0.69      0.67       150

    accuracy                           0.73       392
   macro avg       0.72      0.73      0.72       392
weighted avg       0.74      0.73      0.74       392

Confusion Matrix :
  [[184  58]
 [ 46 104]]
Cross-Validation Scores :
 [0.79872204 0.77955272 0.8115016  0.78594249 0.79807692]
Mean Cross-Validation Score :
 0.7947591545834358
Standard Deviation of Cross-Validation Scores :
 0.011099466678993844


### Colorado

In [53]:
dir = "TP2data/"
featuresFileNameCo = "acsincome_co_allfeaturesTP2.csv"
labelsFileNameCo = "acsincome_co_labelTP2.csv"

featuresCo = pd.read_csv(dir+featuresFileNameCo)
labelsCo = pd.read_csv(dir+labelsFileNameCo)

X_all = sc.fit_transform(featuresCo)

X_all, y_all = shuffle(featuresCo, labelsCo, random_state = 1)

num_samples = 392
X_test, y_test = X_all[:num_samples], y_all[:num_samples]

In [54]:
displayAcuracyScore(bestRandomForestClassifier, X_train, y_train, X_test, y_test)
displayClassificationReport(bestRandomForestClassifier, X_test, y_test)
displayConfusionMatrix(bestRandomForestClassifier, X_test, y_test)
displayCrossValidationScore(bestRandomForestClassifier, X_train, y_train)

Train score :  0.928388746803069
Test score :  0.7525510204081632
Classification report :
               precision    recall  f1-score   support

           0       0.86      0.71      0.78       239
           1       0.64      0.82      0.72       153

    accuracy                           0.75       392
   macro avg       0.75      0.77      0.75       392
weighted avg       0.78      0.75      0.76       392

Confusion Matrix :
  [[169  70]
 [ 27 126]]
Cross-Validation Scores :
 [0.78913738 0.78594249 0.81789137 0.81469649 0.76602564]
Mean Cross-Validation Score :
 0.7947386745310068
Standard Deviation of Cross-Validation Scores :
 0.01932639296590475


In [55]:
displayAcuracyScore(bestSVMClassifier, X_train, y_train, X_test, y_test)
displayClassificationReport(bestSVMClassifier, X_test, y_test)
displayConfusionMatrix(bestSVMClassifier, X_test, y_test)
displayCrossValidationScore(bestSVMClassifier, X_train, y_train)

Train score :  0.7832480818414322
Test score :  0.7448979591836735
Classification report :
               precision    recall  f1-score   support

           0       0.78      0.82      0.80       239
           1       0.69      0.63      0.66       153

    accuracy                           0.74       392
   macro avg       0.73      0.72      0.73       392
weighted avg       0.74      0.74      0.74       392

Confusion Matrix :
  [[195  44]
 [ 56  97]]
Cross-Validation Scores :
 [0.77635783 0.76357827 0.77955272 0.77955272 0.75961538]
Mean Cross-Validation Score :
 0.7717313836323421
Standard Deviation of Cross-Validation Scores :
 0.00845010139952901


In [56]:
displayAcuracyScore(bestAdaBoostClassifier, X_train, y_train, X_test, y_test)
displayClassificationReport(bestAdaBoostClassifier, X_test, y_test)
displayConfusionMatrix(bestAdaBoostClassifier, X_test, y_test)
displayCrossValidationScore(bestAdaBoostClassifier, X_train, y_train)

Train score :  0.8005115089514067
Test score :  0.7602040816326531
Classification report :
               precision    recall  f1-score   support

           0       0.80      0.81      0.80       239
           1       0.70      0.68      0.69       153

    accuracy                           0.76       392
   macro avg       0.75      0.75      0.75       392
weighted avg       0.76      0.76      0.76       392

Confusion Matrix :
  [[194  45]
 [ 49 104]]
Cross-Validation Scores :
 [0.79872204 0.76357827 0.80511182 0.78913738 0.79487179]
Mean Cross-Validation Score :
 0.7902842631277137
Standard Deviation of Cross-Validation Scores :
 0.014329153750333194


In [57]:
displayAcuracyScore(bestGradientBoostingClassifier, X_train, y_train, X_test, y_test)
displayClassificationReport(bestGradientBoostingClassifier, X_test, y_test)
displayConfusionMatrix(bestGradientBoostingClassifier, X_test, y_test)
displayCrossValidationScore(bestRandomForestClassifier, X_train, y_train)

Train score :  0.8056265984654731
Test score :  0.7678571428571429
Classification report :
               precision    recall  f1-score   support

           0       0.86      0.74      0.79       239
           1       0.66      0.82      0.73       153

    accuracy                           0.77       392
   macro avg       0.76      0.78      0.76       392
weighted avg       0.79      0.77      0.77       392

Confusion Matrix :
  [[176  63]
 [ 28 125]]
Cross-Validation Scores :
 [0.80191693 0.78913738 0.8115016  0.80830671 0.76602564]
Mean Cross-Validation Score :
 0.7953776521667895
Standard Deviation of Cross-Validation Scores :
 0.016555276341491898


# <span style="color: green">TP2 : Explicabilité et Equité </span>

# Explicabilité

In [58]:
import pandas as pd
from sklearn.inspection import permutation_importance


### Chargement des données de Californie

In [112]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Convert boolean labels to numeric
y_train_numeric = y_train.astype(int)

correlations_original_short = X_train.corrwith(y_train_numeric['PINCP'])

In [113]:
# Convert boolean labels to numeric
y_train_numeric = y_all.astype(int)

correlations_original = X_all.corrwith(y_train_numeric['PINCP'])

### Calcul des corrélations entre les caractéristiques et le label prédit

In [70]:
svm_predictions = bestSVMClassifier.predict(X_train)
rf_predictions = bestRandomForestClassifier.predict(X_train)
adaboost_predictions = bestAdaBoostClassifier.predict(X_train)
gb_predictions = bestGradientBoostingClassifier.predict(X_train)

correlations_svm_predicted = pd.DataFrame(X_train).apply(lambda x: x.corr(pd.Series(svm_predictions)))
correlations_rf_predicted = pd.DataFrame(X_train).apply(lambda x: x.corr(pd.Series(rf_predictions)))
correlations_adaboost_predicted = pd.DataFrame(X_train).apply(lambda x: x.corr(pd.Series(adaboost_predictions)))
correlations_gb_predicted = pd.DataFrame(X_train).apply(lambda x: x.corr(pd.Series(gb_predictions)))


### Évaluation de l'importance de chaque caractéristique avec la méthode permutation_importance

In [71]:
svm_permutation_importance = permutation_importance(bestSVMClassifier, X_train, y_train, n_repeats=30, random_state=42)
rf_permutation_importance = permutation_importance(bestRandomForestClassifier, X_train, y_train, n_repeats=30, random_state=42)
adaboost_permutation_importance = permutation_importance(bestAdaBoostClassifier, X_train, y_train, n_repeats=30, random_state=42)
gb_permutation_importance = permutation_importance(bestGradientBoostingClassifier, X_train, y_train, n_repeats=30, random_state=42)

### Affichage des résultats

In [118]:
print("Corrélations originales pour la partie du dataset utilisée pour les entrainements:")
print(correlations_original_short)
print()
print("Corrélations originales pour tout le dataset:")
print(correlations_original)

Corrélations originales pour la partie du dataset utilisée pour les entrainements:
AGEP     0.260720
COW      0.069211
SCHL     0.350517
MAR     -0.306983
OCCP    -0.368594
POBP    -0.077086
RELP    -0.239775
WKHP     0.342052
SEX     -0.106358
RAC1P   -0.084953
dtype: float64

Corrélations originales pour tout le dataset:
AGEP     0.266304
COW      0.053772
SCHL     0.350178
MAR     -0.266002
OCCP    -0.341304
POBP    -0.085919
RELP    -0.227610
WKHP     0.338208
SEX     -0.118209
RAC1P   -0.100685
dtype: float64


In [76]:

print("\nCorrélations avec label prédit - SVM:")
print(correlations_svm_predicted)


Corrélations avec label prédit - SVM:
AGEP     0.079171
COW      0.391351
SCHL     0.177750
MAR     -0.143740
OCCP    -0.019392
POBP     0.125191
RELP     0.161165
WKHP     0.085394
SEX     -0.106600
RAC1P    0.210267
dtype: float64


In [77]:

print("\nCorrélations avec label prédit - Random Forest:")
print(correlations_rf_predicted)


Corrélations avec label prédit - Random Forest:
AGEP    -0.010025
COW      0.015082
SCHL     0.194945
MAR     -0.063706
OCCP     0.391392
POBP     0.259314
RELP     0.285714
WKHP     0.273309
SEX     -0.188982
RAC1P    0.553134
dtype: float64


In [78]:
print("\nCorrélations avec label prédit - Adaboost:")
print(correlations_adaboost_predicted)


Corrélations avec label prédit - Adaboost:
AGEP    -0.026525
COW      0.279330
SCHL     0.113047
MAR     -0.067420
OCCP     0.041310
POBP     0.037303
RELP     0.188982
WKHP     0.158600
SEX     -0.200000
RAC1P    0.270421
dtype: float64


In [79]:
print("\nCorrélations avec label prédit - Gradient Boosting:")
print(correlations_gb_predicted)


Corrélations avec label prédit - Gradient Boosting:
AGEP    -0.026525
COW      0.279330
SCHL     0.113047
MAR     -0.067420
OCCP     0.041310
POBP     0.037303
RELP     0.188982
WKHP     0.158600
SEX     -0.200000
RAC1P    0.270421
dtype: float64


In [80]:
print("\nImportance des caractéristiques - SVM:")
print(svm_permutation_importance.importances_mean)


Importance des caractéristiques - SVM:
[0.0235081  0.0014919  0.06033674 0.01466326 0.05566922 0.00464621
 0.0081202  0.05511509 0.00850384 0.00407076]


In [81]:

print("\nImportance des caractéristiques - Random Forest:")
print(rf_permutation_importance.importances_mean)


Importance des caractéristiques - Random Forest:
[0.07779199 0.02425405 0.09249787 0.03552856 0.11632566 0.04034527
 0.05769395 0.10206735 0.03910912 0.01815857]


In [82]:
print("\nImportance des caractéristiques - Adaboost:")
print(adaboost_permutation_importance.importances_mean)


Importance des caractéristiques - Adaboost:
[0.01950128 0.         0.04654731 0.00664962 0.05833333 0.
 0.01896846 0.05571185 0.00142796 0.        ]


In [83]:
print("\nImportance des caractéristiques - Gradient Boosting:")
print(gb_permutation_importance.importances_mean)


Importance des caractéristiques - Gradient Boosting:
[0.00748082 0.         0.02542626 0.00720375 0.09904092 0.00127877
 0.00948423 0.04109122 0.00773657 0.        ]


# Equité

In [133]:
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import ClassificationMetric

### La matrice de confusion pour chaque valeur du SEX (et pour chaque modèle)

In [86]:
models = [bestSVMClassifier, bestRandomForestClassifier, bestAdaBoostClassifier, bestGradientBoostingClassifier]
sex_values = [1, 2]

for model in models:
    for sex_value in sex_values:
        # Sous-ensembles basés sur la valeur de 'SEX'
        X_test_subset = X_test[X_test['SEX'] == sex_value]
        y_test_subset = y_test[X_test['SEX'] == sex_value]

        # Prédiction du modèle
        predictions = model.predict(X_test_subset)

        # Calcul de la matrice de confusion
        cm = confusion_matrix(y_test_subset, predictions)
        print(f"Matrice de confusion pour {model.__class__.__name__} avec SEX='{sex_value}':")
        print(cm)

Matrice de confusion pour SVC avec SEX='1':
[[101  15]
 [ 37  63]]
Matrice de confusion pour SVC avec SEX='2':
[[103  11]
 [ 30  32]]
Matrice de confusion pour RandomForestClassifier avec SEX='1':
[[87 29]
 [20 80]]
Matrice de confusion pour RandomForestClassifier avec SEX='2':
[[98 16]
 [20 42]]
Matrice de confusion pour AdaBoostClassifier avec SEX='1':
[[102  14]
 [ 35  65]]
Matrice de confusion pour AdaBoostClassifier avec SEX='2':
[[99 15]
 [19 43]]
Matrice de confusion pour GradientBoostingClassifier avec SEX='1':
[[87 29]
 [24 76]]
Matrice de confusion pour GradientBoostingClassifier avec SEX='2':
[[102  12]
 [ 20  42]]


### Calcul de 2 métriques d'équité statistique en train et en test pour l'attribut sensible étant le SEX

In [171]:
# Fonction de Calcul de 2 métriques d'équité statistique en train et en test pour un attribut sensible
def calculate_fairness_metrics(X, y, type, sensitive_feature = "SEX"):
    dataset = BinaryLabelDataset(df=pd.concat([X, y], axis=1),
                                  label_names=['PINCP'], protected_attribute_names=[sensitive_feature])
   
    metric = BinaryLabelDatasetMetric(dataset, unprivileged_groups=[{sensitive_feature: 1}], privileged_groups=[{sensitive_feature: 2}])
    print(f"\nMétriques d'équité statistique en {type}:")
    print("Disparate Impact:", metric.disparate_impact())
    print("Statistical Parity Difference:", metric.statistical_parity_difference())

In [172]:
# Application
calculate_fairness_metrics(X_train, y_train, "train", 'SEX')
calculate_fairness_metrics(X_test, y_test, "test", 'SEX')


Métriques d'équité statistique en train:
Disparate Impact: 1.3371189917936694
Statistical Parity Difference: 0.11379544026790528

Métriques d'équité statistique en test:
Disparate Impact: 1.1975308641975309
Statistical Parity Difference: 0.07407407407407407


### Apprentissage sans la feature 'SEX' et calcul des métriques d'équité

In [149]:
# Retrait de l'attribut SEX dans la data
# Remarque : Nous pouvons directement utilisé DROP sans avoir à relire les fichiers CSV

X_train_no_sex = X_train.drop('SEX', axis=1)
X_test_no_sex = X_test.drop('SEX', axis=1)

In [123]:
# Réentraînement les modèles sans la feature 'SEX'
svm_model_no_sex = svm.SVC(kernel='linear')
svm_model_no_sex.fit(X_train_no_sex, np.ravel(y_train))

rf_model_no_sex = RandomForestClassifier(max_features='log2', min_samples_split=10, n_estimators=200)
rf_model_no_sex.fit(X_train_no_sex, np.ravel(y_train))

adaboost_model_no_sex = AdaBoostClassifier(learning_rate=0.1)
adaboost_model_no_sex.fit(X_train_no_sex, np.ravel(y_train))

gb_model_no_sex = GradientBoostingClassifier(max_depth=5, min_samples_leaf=0.1, min_samples_split=0.1, n_estimators=14)
gb_model_no_sex.fit(X_train_no_sex, np.ravel(y_train))

In [179]:
# Calcul des métriques d'équité sans la feature 'SEX'
# Exemple de la confusion matrix
displayConfusionMatrix(svm_model_no_sex, X_test_no_sex, y_test)
displayConfusionMatrix(rf_model_no_sex, X_test_no_sex, y_test)
displayConfusionMatrix(adaboost_model_no_sex, X_test_no_sex, y_test)
displayConfusionMatrix(gb_model_no_sex, X_test_no_sex, y_test)

Confusion Matrix :
  [[157  72]
 [115  48]]
Confusion Matrix :
  [[134  95]
 [ 94  69]]
Confusion Matrix :
  [[144  85]
 [112  51]]
Confusion Matrix :
  [[134  95]
 [104  59]]


### La matrice de confusion pour chaque valeur du RAC1P (et pour chaque modèle)

In [182]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=450)

In [164]:
models = [bestSVMClassifier, bestRandomForestClassifier, bestAdaBoostClassifier, bestGradientBoostingClassifier]
race_values = [1, 2, 3, 4, 5, 6, 7, 8, 9]

for model in models:
    for race_value in race_values:
        # Sous-ensembles basés sur la valeur de 'RAC1P'
        X_test_subset = X_all[X_all['RAC1P'] == race_value]
        y_test_subset = y_all[X_all['RAC1P'] == race_value]

        if(y_test_subset.shape[0] > 0):
            # Prédiction du modèle
            predictions = model.predict(X_test_subset)

            # Calcul de la matrice de confusion
            cm = confusion_matrix(y_test_subset, predictions)
            print(f"Matrice de confusion pour {model.__class__.__name__} avec RAC1P='{race_value}':")
            print(cm)

Matrice de confusion pour SVC avec RAC1P='1':
[[59646  7710]
 [20535 33115]]
Matrice de confusion pour SVC avec RAC1P='2':
[[4918  688]
 [1463 1488]]
Matrice de confusion pour SVC avec RAC1P='3':
[[835  92]
 [184 183]]
Matrice de confusion pour SVC avec RAC1P='4':
[[6 2]
 [1 4]]
Matrice de confusion pour SVC avec RAC1P='5':
[[324  30]
 [ 56  40]]
Matrice de confusion pour SVC avec RAC1P='6':
[[14086  2906]
 [ 5128 10589]]
Matrice de confusion pour SVC avec RAC1P='7':
[[399  48]
 [108  82]]
Matrice de confusion pour SVC avec RAC1P='8':
[[17462   903]
 [ 2836  1592]]
Matrice de confusion pour SVC avec RAC1P='9':
[[4663  612]
 [1083 1848]]
Matrice de confusion pour RandomForestClassifier avec RAC1P='1':
[[55361 11995]
 [12165 41485]]
Matrice de confusion pour RandomForestClassifier avec RAC1P='2':
[[4582 1024]
 [ 891 2060]]
Matrice de confusion pour RandomForestClassifier avec RAC1P='3':
[[792 135]
 [108 259]]
Matrice de confusion pour RandomForestClassifier avec RAC1P='4':
[[6 2]
 [0 5]]

### Calcul de 2 métriques d'équité statistique en train et en test pour l'attribut sensible étant le RAC1P

In [174]:
# Fonction de Calcul de 2 métriques d'équité statistique en train et en test pour un attribut sensible
def calculate_fairness_metrics(X, y, type, sensitive_feature = "RAC1P"):
    dataset = BinaryLabelDataset(df=pd.concat([X, y], axis=1),
                                  label_names=['PINCP'], protected_attribute_names=[sensitive_feature])
   
    unprivileged_groups=[
        {sensitive_feature: 2},
        {sensitive_feature: 3},
        {sensitive_feature: 4},
        {sensitive_feature: 5},
        {sensitive_feature: 6},
        {sensitive_feature: 8},
        {sensitive_feature: 9}
    ]
    privileged_groups=[
        {sensitive_feature: 1}
    ]

    metric = BinaryLabelDatasetMetric(dataset, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
    print(f"\nMétriques d'équité statistique en {type}:")
    print("Disparate Impact:", metric.disparate_impact())
    print("Statistical Parity Difference:", metric.statistical_parity_difference())

In [176]:
# Application
calculate_fairness_metrics(X_train, y_train, "train", 'RAC1P')
calculate_fairness_metrics(X_test, y_test, "test", 'RAC1P')


Métriques d'équité statistique en train:
Disparate Impact: 0.8605857230018304
Statistical Parity Difference: -0.05898296334537945

Métriques d'équité statistique en test:
Disparate Impact: 0.9793988517392773
Statistical Parity Difference: -0.008652482269503547


### Apprentissage sans la feature 'RAC1P' et calcul des métriques d'équité

In [183]:
# Retrait de l'attribut RAC1P dans la data
# Remarque : Nous pouvons directement utilisé DROP sans avoir à relire les fichiers CSV

X_train_no_race = X_train.drop('RAC1P', axis=1)
X_test_no_race = X_test.drop('RAC1P', axis=1)

In [184]:
# Réentraînement les modèles sans la feature 'RAC1P'
svm_model_no_race = svm.SVC(kernel='linear')
svm_model_no_race.fit(X_train_no_race, np.ravel(y_train))

rf_model_no_race = RandomForestClassifier(max_features='log2', min_samples_split=10, n_estimators=200)
rf_model_no_race.fit(X_train_no_race, np.ravel(y_train))

adaboost_model_no_race = AdaBoostClassifier(learning_rate=0.1)
adaboost_model_no_race.fit(X_train_no_race, np.ravel(y_train))

gb_model_no_race = GradientBoostingClassifier(max_depth=5, min_samples_leaf=0.1, min_samples_split=0.1, n_estimators=14)
gb_model_no_race.fit(X_train_no_race, np.ravel(y_train))

In [186]:
# Calcul des métriques d'équité sans la feature 'RAC1P'
# Exemple de la confusion matrix
displayConfusionMatrix(svm_model_no_race, X_test_no_race, y_test)
displayConfusionMatrix(rf_model_no_race, X_test_no_race, y_test)
displayConfusionMatrix(adaboost_model_no_race, X_test_no_race, y_test)
displayConfusionMatrix(gb_model_no_race, X_test_no_race, y_test)

Confusion Matrix :
  [[193  36]
 [ 69  94]]
Confusion Matrix :
  [[180  49]
 [ 44 119]]
Confusion Matrix :
  [[190  39]
 [ 56 107]]
Confusion Matrix :
  [[188  41]
 [ 55 108]]
