In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import feature_selection
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.inspection import permutation_importance

In [2]:
X_all = pd.read_csv("./acsincome_ca_features.csv")
y_all = pd.read_csv("./acsincome_ca_labels.csv")

X_all, y_all = shuffle(X_all, y_all, random_state=1)

num_samples = int(len(X_all)*0.01)
X, y = X_all[:num_samples], y_all[:num_samples]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
y_train = y_train['PINCP'].values

X_test = scaler.transform(X_test)
y_test = y_test['PINCP'].values

#  1 - Explicabilité des modèles

### Coefficient de corrélation entre features et label

In [3]:
correlation_label = np.corrcoef(X_test, y_test, rowvar=False)[: -1, -1]
correlation_df = pd.DataFrame({"Feature" : X.columns, "correlation" : correlation_label})

In [4]:
print("Correlation: ")
print(correlation_df)

Correlation: 
  Feature  correlation
0    AGEP     0.264362
1     COW     0.128665
2    SCHL     0.376713
3     MAR    -0.283184
4    OCCP    -0.329998
5    POBP     0.008443
6    RELP    -0.200241
7    WKHP     0.411818
8     SEX    -0.085003
9   RAC1P    -0.040897


### Coefficient de corrélation entre features et label prédit

#### GradientBoosting

In [5]:
GdB_model = GradientBoostingClassifier(criterion= 'friedman_mse', learning_rate= 0.01, loss= 'deviance', n_estimators=500)
GdB_model.fit(X_train, y_train)
y_predict = GdB_model.predict(X_test)

In [6]:
correlation_gdb = np.corrcoef(X_test, y_predict, rowvar=False)[: -1, -1]
correlation_df_gdb = pd.DataFrame({"Feature" : X.columns, "correlation" : correlation_gdb})

In [7]:
print(f"Correlation gdb: \n {correlation_df_gdb}")

Correlation gdb: 
   Feature  correlation
0    AGEP     0.204070
1     COW     0.042642
2    SCHL     0.471732
3     MAR    -0.265245
4    OCCP    -0.589933
5    POBP    -0.119144
6    RELP    -0.273018
7    WKHP     0.414855
8     SEX    -0.051640
9   RAC1P    -0.149184


#### AdaBoosting

In [8]:
AdB_model = AdaBoostClassifier(learning_rate= 0.5, n_estimators= 50)
AdB_model.fit(X_train, y_train)
y_predict = AdB_model.predict(X_test)

In [9]:
correlation_adb = np.corrcoef(X_test, y_predict, rowvar=False)[: -1, -1]
correlation_df_adb = pd.DataFrame({"Feature" : X.columns, "correlation" : correlation_adb})

In [10]:
print(f"Correlation Adb: \n {correlation_df_adb}")

Correlation Adb: 
   Feature  correlation
0    AGEP     0.214495
1     COW     0.087596
2    SCHL     0.488184
3     MAR    -0.284715
4    OCCP    -0.602938
5    POBP    -0.123028
6    RELP    -0.238265
7    WKHP     0.395772
8     SEX    -0.049350
9   RAC1P    -0.179422


#### SVM

In [11]:
svm_model = SVC(C = 1, gamma= 'scale', kernel= 'rbf')
svm_model.fit(X_train, y_train)
y_predict = svm_model.predict(X_test)

In [12]:
correlation_svm = np.corrcoef(X_test, y_predict, rowvar=False)[: -1, -1]
correlation_df_svm = pd.DataFrame({"Feature" : X.columns, "correlation" : correlation_svm})

In [13]:
print(f"Correlation svm: \n {correlation_df_svm}")

Correlation svm: 
   Feature  correlation
0    AGEP     0.207672
1     COW     0.024466
2    SCHL     0.437549
3     MAR    -0.330393
4    OCCP    -0.563285
5    POBP    -0.140495
6    RELP    -0.350600
7    WKHP     0.393914
8     SEX    -0.083008
9   RAC1P    -0.154656


#### Random Forest

In [14]:
rdf_model = RandomForestClassifier(criterion= 'gini', max_depth= 100, min_samples_split= 10, n_estimators= 500)
rdf_model.fit(X_train, y_train)
y_predict = rdf_model.predict(X_test)

In [15]:
correlation_rdf = np.corrcoef(X_test, y_predict, rowvar=False)[: -1, -1]
correlation_df_rdf = pd.DataFrame({"Feature" : X.columns, "correlation" : correlation_rdf})

In [16]:
print(f"Correlation Random Forest: \n {correlation_df_rdf}")

Correlation Random Forest: 
   Feature  correlation
0    AGEP     0.183343
1     COW     0.021266
2    SCHL     0.450501
3     MAR    -0.317877
4    OCCP    -0.569221
5    POBP    -0.154960
6    RELP    -0.283800
7    WKHP     0.408049
8     SEX    -0.057461
9   RAC1P    -0.138398


In [17]:
correlation_df_all = pd.DataFrame({"Feature" : X.columns, 
                                   "label" : correlation_label, 
                                   "Grandient Boosting" : correlation_gdb,
                                   "AdaBoost": correlation_adb,
                                   "SVM": correlation_svm,
                                   "Random Forest" : correlation_rdf})
print(f"Correlation coefficient: \n {correlation_df_all}")

Correlation coefficient: 
   Feature     label  Grandient Boosting  AdaBoost       SVM  Random Forest
0    AGEP  0.264362            0.204070  0.214495  0.207672       0.183343
1     COW  0.128665            0.042642  0.087596  0.024466       0.021266
2    SCHL  0.376713            0.471732  0.488184  0.437549       0.450501
3     MAR -0.283184           -0.265245 -0.284715 -0.330393      -0.317877
4    OCCP -0.329998           -0.589933 -0.602938 -0.563285      -0.569221
5    POBP  0.008443           -0.119144 -0.123028 -0.140495      -0.154960
6    RELP -0.200241           -0.273018 -0.238265 -0.350600      -0.283800
7    WKHP  0.411818            0.414855  0.395772  0.393914       0.408049
8     SEX -0.085003           -0.051640 -0.049350 -0.083008      -0.057461
9   RAC1P -0.040897           -0.149184 -0.179422 -0.154656      -0.138398


### permutation_importance

#### SVM

In [18]:
svm_model = SVC(C = 1, gamma= 'auto', kernel= 'rbf')
svm_model.fit(X_train, y_train)
permutation_result = permutation_importance(svm_model, X_test, y_test)

In [19]:
mean_permutation_result_svm = pd.DataFrame({"Feature" : X.columns, "permutation" :permutation_result.importances_mean})
print(f"Mean of feature importance over 5 repeats \n {mean_permutation_result_svm}")

Mean of feature importance over 5 repeats 
   Feature   permutation
0    AGEP -4.081633e-03
1     COW -9.693878e-03
2    SCHL  4.540816e-02
3     MAR  1.530612e-02
4    OCCP  2.806122e-02
5    POBP -5.612245e-03
6    RELP  1.173469e-02
7    WKHP  5.459184e-02
8     SEX  1.377551e-02
9   RAC1P -6.661338e-17


#### TO DO : the same for the AdaBost, GradientBoost and RandomForest

#### AdaBoost

In [20]:
adb_model = AdaBoostClassifier(learning_rate= 0.5, n_estimators= 50)
adb_model.fit(X_train, y_train)
permutation_result = permutation_importance(adb_model, X_test, y_test)

In [21]:
mean_permutation_result_adb = pd.DataFrame({"Feature" : X.columns, "permutation" :permutation_result.importances_mean})
print(f"Mean of feature importance over 5 repeats \n {mean_permutation_result_adb}")

Mean of feature importance over 5 repeats 
   Feature  permutation
0    AGEP     0.008163
1     COW    -0.004082
2    SCHL     0.019898
3     MAR     0.002551
4    OCCP     0.009694
5    POBP     0.004082
6    RELP     0.002041
7    WKHP     0.043367
8     SEX     0.002551
9   RAC1P     0.000000


#### GradientBoosting

In [22]:
gdb_model = GradientBoostingClassifier(criterion= 'friedman_mse', learning_rate= 0.01, loss= 'deviance', n_estimators=500)
gdb_model.fit(X_train, y_train)
permutation_result = permutation_importance(gdb_model, X_test, y_test)

In [23]:
mean_permutation_result_gdb = pd.DataFrame({"Feature" : X.columns, "permutation" :permutation_result.importances_mean})
print(f"Mean of feature importance over 5 repeats \n {mean_permutation_result_gdb}")

Mean of feature importance over 5 repeats 
   Feature  permutation
0    AGEP     0.007653
1     COW     0.008673
2    SCHL     0.042857
3     MAR     0.003061
4    OCCP     0.044898
5    POBP    -0.003061
6    RELP     0.010204
7    WKHP     0.050510
8     SEX     0.003061
9   RAC1P     0.000000


#### RandomForest

In [24]:
rdf_model = RandomForestClassifier(criterion= 'gini', max_depth= 100, min_samples_split= 10, n_estimators= 500)
rdf_model.fit(X_train, y_train)
permutation_result = permutation_importance(rdf_model, X_test, y_test)

In [25]:
mean_permutation_result_rdf = pd.DataFrame({"Feature" : X.columns, "permutation" :permutation_result.importances_mean})
print(f"Mean of feature importance over 5 repeats \n {mean_permutation_result_rdf}")

Mean of feature importance over 5 repeats 
   Feature  permutation
0    AGEP     0.009184
1     COW     0.004592
2    SCHL     0.036224
3     MAR     0.002551
4    OCCP     0.054082
5    POBP     0.003061
6    RELP     0.015816
7    WKHP     0.050000
8     SEX    -0.004592
9   RAC1P     0.000510


In [26]:
permutation_df_all = pd.DataFrame({"Feature" : X.columns,  
                                   "Grandient Boosting" : mean_permutation_result_gdb.permutation,
                                   "AdaBoost": mean_permutation_result_adb.permutation,
                                   "SVM": mean_permutation_result_svm.permutation,
                                   "Random Forest" : mean_permutation_result_rdf.permutation})
print(f"Permutation importance: \n {permutation_df_all}")

Permutation importance: 
   Feature  Grandient Boosting  AdaBoost           SVM  Random Forest
0    AGEP            0.007653  0.008163 -4.081633e-03       0.009184
1     COW            0.008673 -0.004082 -9.693878e-03       0.004592
2    SCHL            0.042857  0.019898  4.540816e-02       0.036224
3     MAR            0.003061  0.002551  1.530612e-02       0.002551
4    OCCP            0.044898  0.009694  2.806122e-02       0.054082
5    POBP           -0.003061  0.004082 -5.612245e-03       0.003061
6    RELP            0.010204  0.002041  1.173469e-02       0.015816
7    WKHP            0.050510  0.043367  5.459184e-02       0.050000
8     SEX            0.003061  0.002551  1.377551e-02      -0.004592
9   RAC1P            0.000000  0.000000 -6.661338e-17       0.000510


# 2 - Equité des modèles

In [27]:
def calcul_rappel(confusion_matrix):
    return confusion_matrix[1][1]/(confusion_matrix[1][1]+confusion_matrix[1,0])

In [28]:
def calcul_precision(confusion_matrix):
    return confusion_matrix[1][1]/(confusion_matrix[1][1]+confusion_matrix[0][1])

In [29]:
def cal_equite(confusion_matrix):
    print(f"Rappel = { calcul_rappel(confusion_matrix) }" )
    print(f"Précision = { calcul_precision(confusion_matrix) }" )
    

### Data without male

In [30]:
X_all = pd.read_csv("acsincome_ca_features.csv")
y_all = pd.read_csv("acsincome_ca_labels.csv")
index_male = X_all[X_all['SEX'] == 1].index

X_all_without_male = X_all.drop(index_male)
y_all_without_male = y_all.drop(index_male)

X_all, y_all = shuffle(X_all_without_male, y_all_without_male, random_state=1)

num_samples = int(len(X_all)*0.02)
X, y = X_all[:num_samples], y_all[:num_samples]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
y_train = y_train['PINCP'].values

X_test = scaler.transform(X_test)
y_test = y_test['PINCP'].values

### SVM

In [31]:
svm_model = SVC(C = 1, gamma= 'auto', kernel= 'rbf')
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
confusion_matrix_without_male = confusion_matrix(y_test,y_pred)
print("Confusion matrix on test: ")
print(confusion_matrix_without_male)
cal_equite(confusion_matrix_without_male)

print("--------------------------------------------")

y_pred = svm_model.predict(X_train)
confusion_matrix_without_male = confusion_matrix(y_train,y_pred)
print("Confusion matrix on train: ")
print(confusion_matrix_without_male)
cal_equite(confusion_matrix_without_male)

Confusion matrix on test: 
[[215  35]
 [ 36  84]]
Rappel = 0.7
Précision = 0.7058823529411765
--------------------------------------------
Confusion matrix on train: 
[[845 107]
 [146 379]]
Rappel = 0.7219047619047619
Précision = 0.779835390946502


__Rappel__ : True Positive Rate (Rappel élévé : beaucoup de vrais positifs)

Rappel = TP/(TP+FN) 

__Précision__ : (Précision élevé : peu de faux positifs)

Précision = TP/(TP+FP)

Dans notre cas on a :

Rappel = 87/(87+33) = 0,73

Précision = 87/(87+27) = 0,76

### Random Forest

In [32]:
rdf_model = RandomForestClassifier(criterion= 'gini', max_depth= 100, min_samples_split= 10, n_estimators= 500)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
confusion_matrix_without_male = confusion_matrix(y_test,y_pred)
print("Confusion matrix on test: ")
print(confusion_matrix_without_male)
cal_equite(confusion_matrix_without_male)

print("--------------------------------------------")

y_pred = svm_model.predict(X_train)
confusion_matrix_without_male = confusion_matrix(y_train,y_pred)
print("Confusion matrix on train: ")
print(confusion_matrix_without_male)
cal_equite(confusion_matrix_without_male)

Confusion matrix on test: 
[[215  35]
 [ 36  84]]
Rappel = 0.7
Précision = 0.7058823529411765
--------------------------------------------
Confusion matrix on train: 
[[845 107]
 [146 379]]
Rappel = 0.7219047619047619
Précision = 0.779835390946502


### Gradient Boosting

In [33]:
gdb_model = GradientBoostingClassifier(criterion= 'friedman_mse', learning_rate= 0.01, loss= 'deviance', n_estimators=500)
gdb_model.fit(X_train, y_train)
y_pred = gdb_model.predict(X_test)
confusion_matrix_without_male = confusion_matrix(y_test,y_pred)
print("Confusion matrix on test: ")
print(confusion_matrix_without_male)
cal_equite(confusion_matrix_without_male)

print("--------------------------------------------")

y_pred = gdb_model.predict(X_train)
confusion_matrix_without_male = confusion_matrix(y_train,y_pred)
print("Confusion matrix on train: ")
print(confusion_matrix_without_male)
cal_equite(confusion_matrix_without_male)

Confusion matrix on test: 
[[223  27]
 [ 39  81]]
Rappel = 0.675
Précision = 0.75
--------------------------------------------
Confusion matrix on train: 
[[866  86]
 [126 399]]
Rappel = 0.76
Précision = 0.822680412371134


### AdaBoost

In [34]:
adb_model = AdaBoostClassifier(learning_rate= 0.5, n_estimators= 50)
adb_model.fit(X_train, y_train)
y_pred = adb_model.predict(X_test)
confusion_matrix_without_male = confusion_matrix(y_test,y_pred)
print("Confusion matrix on test: ")
print(confusion_matrix_without_male)
cal_equite(confusion_matrix_without_male)

print("--------------------------------------------")

y_pred = adb_model.predict(X_train)
confusion_matrix_without_male = confusion_matrix(y_train,y_pred)
print("Confusion matrix on train: ")
print(confusion_matrix_without_male)
cal_equite(confusion_matrix_without_male)

Confusion matrix on test: 
[[225  25]
 [ 38  82]]
Rappel = 0.6833333333333333
Précision = 0.7663551401869159
--------------------------------------------
Confusion matrix on train: 
[[834 118]
 [137 388]]
Rappel = 0.7390476190476191
Précision = 0.766798418972332


### Data without female

In [35]:
X_all = pd.read_csv("acsincome_ca_features.csv")
y_all = pd.read_csv("acsincome_ca_labels.csv")
index_female = X_all[X_all['SEX'] == 2].index

X_all_without_female = X_all.drop(index_female)
y_all_without_female = y_all.drop(index_female)

X_all, y_all = shuffle(X_all_without_female, y_all_without_female, random_state=1)

num_samples = int(len(X_all)*0.02)
X, y = X_all[:num_samples], y_all[:num_samples]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
y_train = y_train['PINCP'].values

X_test = scaler.transform(X_test)
y_test = y_test['PINCP'].values

### SVM

In [36]:
svm_model = SVC(C = 1, gamma= 'auto', kernel= 'rbf')
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
confusion_matrix_without_female = confusion_matrix(y_test,y_pred)
print("Confusion matrix on test: ")
print(confusion_matrix_without_female)
cal_equite(confusion_matrix_without_female)

print("--------------------------------------------")

y_pred = svm_model.predict(X_train)
confusion_matrix_without_female = confusion_matrix(y_train,y_pred)
print("Confusion matrix on train: ")
print(confusion_matrix_without_female)
cal_equite(confusion_matrix_without_female)

Confusion matrix on test: 
[[180  46]
 [ 40 148]]
Rappel = 0.7872340425531915
Précision = 0.7628865979381443
--------------------------------------------
Confusion matrix on train: 
[[777 123]
 [147 605]]
Rappel = 0.8045212765957447
Précision = 0.8310439560439561


### Random Forest

In [37]:
rdf_model = RandomForestClassifier(criterion= 'gini', max_depth= 100, min_samples_split= 10, n_estimators= 500)
rdf_model.fit(X_train, y_train)
y_pred = rdf_model.predict(X_test)
confusion_matrix_without_female = confusion_matrix(y_test,y_pred)
print("Confusion matrix on test: ")
print(confusion_matrix_without_female)
cal_equite(confusion_matrix_without_female)

print("--------------------------------------------")

y_pred = rdf_model.predict(X_train)
confusion_matrix_without_female = confusion_matrix(y_train,y_pred)
print("Confusion matrix on train: ")
print(confusion_matrix_without_female)
cal_equite(confusion_matrix_without_female)

Confusion matrix on test: 
[[186  40]
 [ 42 146]]
Rappel = 0.776595744680851
Précision = 0.7849462365591398
--------------------------------------------
Confusion matrix on train: 
[[848  52]
 [ 62 690]]
Rappel = 0.9175531914893617
Précision = 0.9299191374663073


### Gradient Boost

In [38]:
gdb_model = GradientBoostingClassifier(criterion= 'friedman_mse', learning_rate= 0.01, loss= 'deviance', n_estimators=500)
gdb_model.fit(X_train, y_train)
y_pred = gdb_model.predict(X_test)
confusion_matrix_without_female = confusion_matrix(y_test,y_pred)
print("Confusion matrix on test: ")
print(confusion_matrix_without_female)
cal_equite(confusion_matrix_without_female)

print("--------------------------------------------")

y_pred = gdb_model.predict(X_train)
confusion_matrix_without_female = confusion_matrix(y_train,y_pred)
print("Confusion matrix on train: ")
print(confusion_matrix_without_female)
cal_equite(confusion_matrix_without_female)

Confusion matrix on test: 
[[187  39]
 [ 43 145]]
Rappel = 0.7712765957446809
Précision = 0.7880434782608695
--------------------------------------------
Confusion matrix on train: 
[[774 126]
 [132 620]]
Rappel = 0.824468085106383
Précision = 0.8310991957104558


### AdaBoost

In [39]:
adb_model = AdaBoostClassifier(learning_rate= 0.5, n_estimators= 50)
adb_model.fit(X_train, y_train)
y_pred = adb_model.predict(X_test)
confusion_matrix_without_female = confusion_matrix(y_test,y_pred)
print("Confusion matrix on test: ")
print(confusion_matrix_without_female)
cal_equite(confusion_matrix_without_female)

print("--------------------------------------------")

y_pred = adb_model.predict(X_train)
confusion_matrix_without_female = confusion_matrix(y_train,y_pred)
print("Confusion matrix on train: ")
print(confusion_matrix_without_female)
cal_equite(confusion_matrix_without_female)

Confusion matrix on test: 
[[181  45]
 [ 42 146]]
Rappel = 0.776595744680851
Précision = 0.7643979057591623
--------------------------------------------
Confusion matrix on train: 
[[749 151]
 [138 614]]
Rappel = 0.8164893617021277
Précision = 0.8026143790849674


### Métrique d'équité statique = Rappel(Taux de vrai positif) et Précision 

### Data without 'SEX'

In [40]:
X_all = pd.read_csv("acsincome_ca_features.csv")
y_all = pd.read_csv("acsincome_ca_labels.csv")

X_all_without_SEX = X_all.drop(columns=['SEX'])
y_all_without_SEX = y_all

X_all, y_all = shuffle(X_all_without_SEX, y_all_without_SEX, random_state=1)

num_samples = int(len(X_all)*0.02)
X, y = X_all[:num_samples], y_all[:num_samples]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
y_train = y_train['PINCP'].values

X_test = scaler.transform(X_test)
y_test = y_test['PINCP'].values

#### SVM

In [41]:
svm_model = SVC(C = 1, gamma= 'auto', kernel= 'rbf')
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
confusion_matrix_without_SEX = confusion_matrix(y_test,y_pred)
print("Confusion matrix on test: ")
print(confusion_matrix_without_SEX)
cal_equite(confusion_matrix_without_SEX)

print("--------------------------------------------")

y_pred = svm_model.predict(X_train)
confusion_matrix_without_SEX = confusion_matrix(y_train,y_pred)
print("Confusion matrix on train: ")
print(confusion_matrix_without_SEX)
cal_equite(confusion_matrix_without_SEX)

Confusion matrix on test: 
[[396  64]
 [ 85 238]]
Rappel = 0.7368421052631579
Précision = 0.7880794701986755
--------------------------------------------
Confusion matrix on train: 
[[1551  294]
 [ 294  991]]
Rappel = 0.7712062256809339
Précision = 0.7712062256809339


#### Random Forest

In [42]:
rdf_model = RandomForestClassifier(criterion= 'gini', max_depth= 100, min_samples_split= 10, n_estimators= 500)
rdf_model.fit(X_train, y_train)
y_pred = rdf_model.predict(X_test)
confusion_matrix_without_SEX = confusion_matrix(y_test,y_pred)
print("Confusion matrix on test: ")
print(confusion_matrix_without_SEX)
cal_equite(confusion_matrix_without_SEX)

print("--------------------------------------------")

y_pred = rdf_model.predict(X_train)
confusion_matrix_without_SEX = confusion_matrix(y_train,y_pred)
print("Confusion matrix on train: ")
print(confusion_matrix_without_SEX)
cal_equite(confusion_matrix_without_SEX)

Confusion matrix on test: 
[[402  58]
 [ 87 236]]
Rappel = 0.7306501547987616
Précision = 0.8027210884353742
--------------------------------------------
Confusion matrix on train: 
[[1742  103]
 [ 103 1182]]
Rappel = 0.9198443579766536
Précision = 0.9198443579766536


#### Gradient Boosting

In [43]:
gdb_model = GradientBoostingClassifier(criterion= 'friedman_mse', learning_rate= 0.01, loss= 'deviance', n_estimators=500)
gdb_model.fit(X_train, y_train)
y_pred = gdb_model.predict(X_test)
confusion_matrix_without_SEX = confusion_matrix(y_test,y_pred)
print("Confusion matrix on test: ")
print(confusion_matrix_without_SEX)
cal_equite(confusion_matrix_without_SEX)

print("--------------------------------------------")

y_pred = gdb_model.predict(X_train)
confusion_matrix_without_SEX = confusion_matrix(y_train,y_pred)
print("Confusion matrix on train: ")
print(confusion_matrix_without_SEX)
cal_equite(confusion_matrix_without_SEX)

Confusion matrix on test: 
[[399  61]
 [104 219]]
Rappel = 0.6780185758513931
Précision = 0.7821428571428571
--------------------------------------------
Confusion matrix on train: 
[[1604  241]
 [ 299  986]]
Rappel = 0.7673151750972763
Précision = 0.8035859820700897


#### AdaBoost

In [44]:
adb_model = AdaBoostClassifier(learning_rate= 0.5, n_estimators= 50)
adb_model.fit(X_train, y_train)
y_pred = adb_model.predict(X_test)
confusion_matrix_without_SEX = confusion_matrix(y_test,y_pred)
print("Confusion matrix on test: ")
print(confusion_matrix_without_SEX)
cal_equite(confusion_matrix_without_SEX)

print("--------------------------------------------")

y_pred = adb_model.predict(X_train)
confusion_matrix_without_SEX = confusion_matrix(y_train,y_pred)
print("Confusion matrix on train: ")
print(confusion_matrix_without_SEX)
cal_equite(confusion_matrix_without_SEX)

Confusion matrix on test: 
[[400  60]
 [112 211]]
Rappel = 0.653250773993808
Précision = 0.7785977859778598
--------------------------------------------
Confusion matrix on train: 
[[1578  267]
 [ 319  966]]
Rappel = 0.7517509727626459
Précision = 0.7834549878345499
