In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

# Préparation des données

X_all = pd.read_csv("acsincome_ca_features.csv")
y_all = pd.read_csv("acsincome_ca_labels.csv")

X_all, y_all = shuffle(X_all, y_all, random_state=1)

# only use the first N samples to limit training time
num_samples = int(len(X_all)*0.01)
X, y = X_all[:num_samples], y_all[:num_samples]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
y_train = y_train['PINCP'].values

X_test = scaler.transform(X_test)
y_test = y_test['PINCP'].values

# 1 - Comparaison des différents modèles

## SVM

In [2]:
svm_model = SVC()

kf = KFold(n_splits=5, shuffle=True, random_state=1)

parameters = {
    'C': [ 0.1, 1, 10, 10],
    'kernel' : [ 'poly', 'rbf', 'sigmoid'],
    'gamma' : ['scale', 'auto']
}

grid_search_svm = GridSearchCV(estimator=svm_model, param_grid=parameters, scoring='accuracy', cv=kf)

grid_search_svm.fit(X_train, y_train)

# Meilleurs hyperparamètres 
print("Meilleurs hyperparamètres : ", grid_search_svm.best_params_)

# Prédication avec les meilleurs paramètres
y_predict = grid_search_svm.predict(X_test)

#Accuracy
accuracy = accuracy_score(y_test, y_predict)
print("Accuracy : ", accuracy)

#Rapport de classification
class_report = classification_report(y_test, y_predict)
print(" Rapport de classification : ")
print(class_report)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_predict)
print("Confusion matrix : ")
print(conf_matrix)

Meilleurs hyperparamètres :  {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Accuracy :  0.7551020408163265
 Rapport de classification : 
              precision    recall  f1-score   support

       False       0.76      0.81      0.79       220
        True       0.74      0.68      0.71       172

    accuracy                           0.76       392
   macro avg       0.75      0.75      0.75       392
weighted avg       0.75      0.76      0.75       392

Confusion matrix : 
[[179  41]
 [ 55 117]]


## Random Forest

In [3]:
rdf_model = RandomForestClassifier()

kf = KFold(n_splits=5, shuffle=True, random_state=1)

parameters = {
    'n_estimators': [10, 100, 500],  
    'criterion': ['gini', 'entropy'],  
    'max_depth': [10, 100, 500],
    'min_samples_split': [2, 10, 50, 100] 
}

grid_search_rdf = GridSearchCV(estimator=rdf_model, param_grid=parameters, scoring='accuracy', cv=kf, )

grid_search_rdf.fit(X_train, y_train)

# Meilleurs hyperparamètres 
print("Meilleurs hyperparamètres : ", grid_search_rdf.best_params_)

# Résultats avec Train
y_predict = grid_search_rdf.predict(X_test)

#Accuracy
accuracy = accuracy_score(y_test, y_predict)
print("Accuracy : ", accuracy)

#Rapport de classification
class_report = classification_report(y_test, y_predict)
print(" Rapport de classification : ")
print(class_report)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_predict)
print("Confusion matrix : ")
print(conf_matrix)

Meilleurs hyperparamètres :  {'criterion': 'gini', 'max_depth': 500, 'min_samples_split': 10, 'n_estimators': 100}
Accuracy :  0.7780612244897959
 Rapport de classification : 
              precision    recall  f1-score   support

       False       0.79      0.82      0.81       220
        True       0.76      0.72      0.74       172

    accuracy                           0.78       392
   macro avg       0.78      0.77      0.77       392
weighted avg       0.78      0.78      0.78       392

Confusion matrix : 
[[181  39]
 [ 48 124]]


## AdaBoost

In [4]:
AdB_model = AdaBoostClassifier()

kf = KFold(n_splits=5, shuffle=True, random_state=1)

parameters = {
    'n_estimators': [10, 50, 200],
    'learning_rate': [0.5, 1.0, 10.0],
}

grid_search_adb = GridSearchCV(estimator=AdB_model, param_grid=parameters, scoring='accuracy', cv=kf, )

grid_search_adb.fit(X_train, y_train)

# Meilleurs hyperparamètres 
print("Meilleurs hyperparamètres : ", grid_search_adb.best_params_)

# Résultats avec Train
y_predict = grid_search_adb.predict(X_test)

#Accuracy
accuracy = accuracy_score(y_test, y_predict)
print("Accuracy : ", accuracy)

#Rapport de classification
class_report = classification_report(y_test, y_predict)
print(" Rapport de classification : ")
print(class_report)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_predict)
print("Confusion matrix : ")
print(conf_matrix)

Meilleurs hyperparamètres :  {'learning_rate': 0.5, 'n_estimators': 50}
Accuracy :  0.7678571428571429
 Rapport de classification : 
              precision    recall  f1-score   support

       False       0.77      0.84      0.80       220
        True       0.76      0.68      0.72       172

    accuracy                           0.77       392
   macro avg       0.77      0.76      0.76       392
weighted avg       0.77      0.77      0.77       392

Confusion matrix : 
[[184  36]
 [ 55 117]]


## GrandientBoosting

In [5]:
GdB_model = GradientBoostingClassifier()

kf = KFold(n_splits=5, shuffle=True, random_state=1)

parameters = {
    'loss': ['deviance', 'exponential'],
    'n_estimators': [50, 200, 500],
    'learning_rate': [0.01, 0.1, 0.5],
    'criterion': ['friedman_mse', 'mse'], 
}

grid_search_gdb = GridSearchCV(estimator=GdB_model, param_grid=parameters, scoring='accuracy', cv=kf)


from sklearn.exceptions import FitFailedWarning
import warnings
warnings.filterwarnings('error', category=FitFailedWarning)

try:
    grid_search_gdb.fit(X_train, y_train)
except FitFailedWarning as e:
    print("FitFailedWarning:", e)

# Meilleurs hyperparamètres 
print("Meilleurs hyperparamètres : ", grid_search_gdb.best_params_)

# Résultats avec Train
y_predict = grid_search_gdb.predict(X_test)

#Accuracy
accuracy = accuracy_score(y_test, y_predict)
print("Accuracy : ", accuracy)

#Rapport de classification
class_report = classification_report(y_test, y_predict)
print(" Rapport de classification : ")
print(class_report)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_predict)
print("Confusion matrix : ")
print(conf_matrix)

Meilleurs hyperparamètres :  {'criterion': 'friedman_mse', 'learning_rate': 0.01, 'loss': 'deviance', 'n_estimators': 500}
Accuracy :  0.7806122448979592
 Rapport de classification : 
              precision    recall  f1-score   support

       False       0.79      0.84      0.81       220
        True       0.77      0.71      0.74       172

    accuracy                           0.78       392
   macro avg       0.78      0.77      0.77       392
weighted avg       0.78      0.78      0.78       392

Confusion matrix : 
[[184  36]
 [ 50 122]]


# 2 - Application des modèles entrâinés sur d'autres dataset

## Jeu de données de Nevada

In [6]:
# Préparation des données
X_all = pd.read_csv("./acsincome_ne_allfeaturesTP2.csv")
y_all = pd.read_csv("./acsincome_ne_labelTP2.csv")

X_all, y_all = shuffle(X_all, y_all, random_state=1)

# only use the first N samples to limit training time
num_samples = int(len(X_all)*0.01)
X, y = X_all[:num_samples], y_all[:num_samples]

X_scaled_ne = scaler.transform(X)
y_scaled_ne = y['PINCP'].values

In [7]:
y_predict_svm_ne = grid_search_svm.predict(X_scaled_ne)
y_predict_rdf_ne = grid_search_rdf.predict(X_scaled_ne)
y_predict_adb_ne = grid_search_adb.predict(X_scaled_ne)
y_predict_gdb_ne = grid_search_gdb.predict(X_scaled_ne)

#Accuracy
accuracy = accuracy_score(y_scaled_ne, y_predict_svm_ne)
print("SVM Accuracy : ", accuracy)
accuracy = accuracy_score(y_scaled_ne, y_predict_rdf_ne)
print("RDF Accuracy : ", accuracy)
accuracy = accuracy_score(y_scaled_ne, y_predict_adb_ne)
print("ADB Accuracy : ", accuracy)
accuracy = accuracy_score(y_scaled_ne, y_predict_gdb_ne)
print("GDB Accuracy : ", accuracy)

# Confusion matrix
accuracy = confusion_matrix(y_scaled_ne, y_predict_svm_ne)
print("SVM Confusion matrix : ")
print(conf_matrix)
accuracy = confusion_matrix(y_scaled_ne, y_predict_rdf_ne)
print("RDF Confusion matrix : ")
print(conf_matrix)
accuracy = confusion_matrix(y_scaled_ne, y_predict_adb_ne)
print("ADB Confusion matrix : ")
print(conf_matrix)
accuracy = confusion_matrix(y_scaled_ne, y_predict_gdb_ne)
print("GDB Confusion matrix : ")
print(conf_matrix)



SVM Accuracy :  0.7850467289719626
RDF Accuracy :  0.7476635514018691
ADB Accuracy :  0.7383177570093458
GDB Accuracy :  0.7383177570093458
SVM Confusion matrix : 
[[184  36]
 [ 50 122]]
RDF Confusion matrix : 
[[184  36]
 [ 50 122]]
ADB Confusion matrix : 
[[184  36]
 [ 50 122]]
GDB Confusion matrix : 
[[184  36]
 [ 50 122]]


## Jeu de données de Colorado

In [8]:
# Préparation des données
X_all = pd.read_csv("acsincome_co_allfeaturesTP2.csv")
y_all = pd.read_csv("acsincome_co_labelTP2.csv")

X_all, y_all = shuffle(X_all, y_all, random_state=1)

# only use the first N samples to limit training time
num_samples = int(len(X_all)*0.01)
X, y = X_all[:num_samples], y_all[:num_samples]

X_scaled_co = scaler.transform(X)
y_scaled_co = y['PINCP'].values

In [9]:
y_predict_svm_co = grid_search_svm.predict(X_scaled_co)
y_predict_rdf_co = grid_search_rdf.predict(X_scaled_co)
y_predict_adb_co = grid_search_adb.predict(X_scaled_co)
y_predict_gdb_co = grid_search_gdb.predict(X_scaled_co)

#Accuracy
accuracy = accuracy_score(y_scaled_co, y_predict_svm_co)
print("SVM Accuracy : ", accuracy)
accuracy = accuracy_score(y_scaled_co, y_predict_rdf_co)
print("RDF Accuracy : ", accuracy)
accuracy = accuracy_score(y_scaled_co, y_predict_adb_co)
print("ADB Accuracy : ", accuracy)
accuracy = accuracy_score(y_scaled_co, y_predict_gdb_co)
print("GDB Accuracy : ", accuracy)

# Confusion matrix
accuracy = confusion_matrix(y_scaled_co, y_predict_svm_co)
print("SVM Confusion matrix : ")
print(conf_matrix)
accuracy = confusion_matrix(y_scaled_co, y_predict_rdf_co)
print("RDF Confusion matrix : ")
print(conf_matrix)
accuracy = confusion_matrix(y_scaled_co, y_predict_adb_co)
print("ADB Confusion matrix : ")
print(conf_matrix)
accuracy = confusion_matrix(y_scaled_co, y_predict_gdb_co)
print("GDB Confusion matrix : ")
print(conf_matrix)

SVM Accuracy :  0.7507987220447284
RDF Accuracy :  0.7507987220447284
ADB Accuracy :  0.7380191693290735
GDB Accuracy :  0.7539936102236422
SVM Confusion matrix : 
[[184  36]
 [ 50 122]]
RDF Confusion matrix : 
[[184  36]
 [ 50 122]]
ADB Confusion matrix : 
[[184  36]
 [ 50 122]]
GDB Confusion matrix : 
[[184  36]
 [ 50 122]]
