In [None]:
# ===========================================================================
# DIANI Mamoudou Sékou
#
# Janvier 2022
#===========================================================================

In [None]:
# Importation des bibliothèques nécessaires
from sklearn import preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Load the prepared dataset from "C:/Users/diani/Desktop/Apprntissage_auto/DataPrepa1.csv"
# (semicolon-separated CSV, rows indexed by the "id" column) and preview its first 5 rows.
# NOTE(review): the path is absolute and machine-specific; adjust it to run elsewhere.
pd.set_option('display.max_columns', None)  # do not truncate columns in the preview
Pollution_data = pd.read_csv(
    'C:/Users/diani/Desktop/Apprntissage_auto/DataPrepa1.csv',
    sep=';',
    index_col="id",
)
Pollution_data.head(n=5)

In [None]:
# Z-score normalisation of the feature columns listed below.
# NOTE(review): the scaler is fitted on every row of Pollution_data, i.e. before the
# train/test split that happens later in the notebook.
features = [
    "tLengthOfScreenName",
    "tLengthOfDescriptionInUserProfile",
    "tLongevityOfTheAccount",
    "tNumerOfFollowings",
    "tNumberOfFollowers",
    "tSeriesOfNumberOfFollowings",
    "Following.followers",
    "tNumberOfTweets",
    "tNumberOfTweetsByDays",
    "Tweets.theLongevityOfTheAccount",
    "TweetURL.tNumberOfTweets",
    "tNbMoyenUrlByTweet",
    "UsernameInTweet.tNumberOfTweets",
    "TimeMoyenBet2ConseTweet",
    "ValueOfTimeMaxBet2ConseTweet",
]
std_scale = preprocessing.StandardScaler()
# fit_transform fits the scaler on the selected columns and returns the scaled values.
normalized_data = std_scale.fit_transform(Pollution_data[features])
# Target labels, taken from the 'Status' column as a NumPy array.
labels = np.array(Pollution_data['Status'])

In [None]:
# Split the data into a training set (80 %) and a test set (20 %) with a fixed seed.
# The split is performed on the raw feature columns and the z-score scaler is then
# fitted on the training rows only, so that test-set statistics do not leak into the
# preprocessing (`normalized_data` was produced by a scaler fitted on all rows, test
# rows included, which is why it is not used here).
raw_train_features, raw_test_features, train_labels, test_labels = train_test_split(
    Pollution_data[features], labels, test_size=0.2, random_state=42
)
train_scaler = preprocessing.StandardScaler().fit(raw_train_features)
train_features = train_scaler.transform(raw_train_features)
test_features = train_scaler.transform(raw_test_features)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

In [None]:
# Performance metrics helper: True Positive Rate (TPR), False Positive Rate (FPR)
# and area under the ROC (Receiver Operating Characteristic) curve.
def TPR_FPR(test_labels, predicted_labels):
    """Compute per-class TPR and FPR, plus the ROC AUC for positive label 1.

    Parameters
    ----------
    test_labels : array-like
        True class labels.
    predicted_labels : array-like
        Predicted values, passed both to ``confusion_matrix`` and, as the score,
        to ``roc_curve``. The callers in this notebook pass hard class predictions.

    Returns
    -------
    tuple
        ``(TPR, FPR, aire)`` where ``TPR`` and ``FPR`` are float arrays with one
        one-vs-rest entry per class, in the row order of the confusion matrix,
        and ``aire`` is the area under the ROC curve computed with ``pos_label=1``.
        A rate whose denominator is zero (a class with no positive, respectively
        no negative, samples) is returned as NaN.

    Notes
    -----
    When ``predicted_labels`` holds hard 0/1 predictions, the ROC curve has a single
    interior point, so ``aire`` equals ``(TPR + 1 - FPR) / 2`` for class 1.
    """
    cnf_matrix = confusion_matrix(test_labels, predicted_labels)

    # One-vs-rest counts per class (rows = true labels, columns = predictions).
    TP = np.diag(cnf_matrix).astype(float)
    FP = cnf_matrix.sum(axis=0).astype(float) - TP
    FN = cnf_matrix.sum(axis=1).astype(float) - TP
    TN = float(cnf_matrix.sum()) - (FP + FN + TP)

    # Guarded division: a zero denominator yields NaN without a RuntimeWarning.
    positives = TP + FN
    negatives = FP + TN
    TPR = np.divide(TP, positives, out=np.full_like(TP, np.nan), where=positives != 0)
    FPR = np.divide(FP, negatives, out=np.full_like(FP, np.nan), where=negatives != 0)

    fpr, tpr, _ = roc_curve(test_labels, predicted_labels, pos_label=1)
    aire = metrics.auc(fpr, tpr)
    return (TPR, FPR, aire)

In [None]:
# Prefixes printed in front of each metric value by the model-evaluation cells.
str_TPR, str_FPR, str_Aire = (
    "True Positive Rate: ",
    "False Positive Rate: ",
    "l'aire sous la courbe ROC: ",
)

In [None]:
# Decision tree: train on the training set, predict on the test set, then report
# the TPR, FPR, area under the ROC curve and F1-score.
ad_classifier = DecisionTreeClassifier(random_state=42)
d_tree1 = ad_classifier.fit(train_features, train_labels)
predicted_labels = d_tree1.predict(test_features)

f1 = f1_score(test_labels, predicted_labels, pos_label=1, average='binary')
tpr, fpr, ad_auc = TPR_FPR(test_labels, predicted_labels)
# Index 1 selects the second class of the per-class arrays
# (label 1, assuming the labels are 0/1 -- TODO confirm).
Val_ad_tpr, Val_ad_fpr = tpr[1], fpr[1]

print(str_TPR, Val_ad_tpr)
print(str_FPR, Val_ad_fpr)
print(str_Aire, ad_auc)
print("F1-score of decision tree: ", f1)
# Output recorded by the original author:
# True Positive Rate:  0.9237988826815643
# False Positive Rate:  0.09019607843137255
# l'aire sous la courbe ROC:  0.9168014021250959
# F1-score of decision tree: 0.923386196113469

In [None]:
# Random forest: train on the training set, predict on the test set, then report
# the TPR, FPR, area under the ROC curve and F1-score.
# The seed is fixed (same value as the split and the decision tree) so that the
# reported metrics are reproducible from one run to the next.
rnd_forest = RandomForestClassifier(random_state=42)
rnd_forest.fit(train_features, train_labels)
predicted_labels = rnd_forest.predict(test_features)

tpr, fpr, rf_auc = TPR_FPR(test_labels, predicted_labels)
# Index 1 selects the second class of the per-class arrays
# (label 1, assuming the labels are 0/1 -- TODO confirm).
Val_rf_tpr = tpr[1]
Val_rf_fpr = fpr[1]
f1 = f1_score(test_labels, predicted_labels, average='binary', pos_label=1)
print(str_TPR, Val_rf_tpr)
print(str_FPR, Val_rf_fpr)
print(str_Aire, rf_auc)
print("F1-score of random forrest: ", f1)
# Output recorded by the original author from an unseeded run
# (values may differ slightly now that the seed is fixed):
# True Positive Rate:  0.9557541899441341
# False Positive Rate:  0.058823529411764705
# l'aire sous la courbe ROC:  0.9484653302661846
# F1-score of random forrest: 0.9536438600401158

In [None]:
# Bagging: train on the training set, predict on the test set, then report
# the TPR, FPR, area under the ROC curve and F1-score.
# The seed is fixed (same value as the split and the decision tree) so that the
# reported metrics are reproducible from one run to the next.
bagging = BaggingClassifier(random_state=42)
bagging_classifier = bagging.fit(train_features, train_labels)
predicted_labels = bagging_classifier.predict(test_features)

tpr, fpr, bag_auc = TPR_FPR(test_labels, predicted_labels)
# Index 1 selects the second class of the per-class arrays
# (label 1, assuming the labels are 0/1 -- TODO confirm).
Val_bag_tpr = tpr[1]
Val_bag_fpr = fpr[1]
print(str_TPR, Val_bag_tpr)
print(str_FPR, Val_bag_fpr)
print(str_Aire, bag_auc)
f1 = f1_score(test_labels, predicted_labels, average='binary', pos_label=1)
print("F1-score of bagging: ", f1)
# Output recorded by the original author from an unseeded run
# (values may differ slightly now that the seed is fixed):
# True Positive Rate:  0.9448044692737431
# False Positive Rate:  0.05908496732026144
# l'aire sous la courbe ROC:  0.9428597509767407
# F1-score of bagging: 0.9475810977663038

In [None]:
# AdaBoost: train on the training set, predict on the test set, then report
# the TPR, FPR, area under the ROC curve and F1-score.
# The seed is fixed (same value as the split and the decision tree) so that the
# reported metrics are reproducible from one run to the next.
adaB = AdaBoostClassifier(random_state=42)
adaB_classifier = adaB.fit(train_features, train_labels)
predicted_labels = adaB_classifier.predict(test_features)

tpr, fpr, adaB_auc = TPR_FPR(test_labels, predicted_labels)
# Index 1 selects the second class of the per-class arrays
# (label 1, assuming the labels are 0/1 -- TODO confirm).
Val_adaB_tpr = tpr[1]
Val_adaB_fpr = fpr[1]
print(str_TPR, Val_adaB_tpr)
print(str_FPR, Val_adaB_fpr)
print(str_Aire, adaB_auc)
f1 = f1_score(test_labels, predicted_labels, average='binary', pos_label=1)
print("F1-score of adaBoost: ", f1)
# Output recorded by the original author from an unseeded run
# (values may differ slightly now that the seed is fixed):
# True Positive Rate:  0.9452513966480447
# False Positive Rate:  0.06745098039215686
# l'aire sous la courbe ROC:  0.9389002081279441
# F1-score of adaBoost: 0.9438803971884413

In [None]:
# Gaussian Naive Bayes: train on the training set, predict on the test set, then
# report the TPR, FPR, area under the ROC curve and F1-score.
bayes_class = GaussianNB()
bayes_classifier = bayes_class.fit(train_features, train_labels)
predicted_labels = bayes_classifier.predict(test_features)

tpr, fpr, nb_auc = TPR_FPR(test_labels, predicted_labels)
# Index 1 selects the second class of the per-class arrays
# (label 1, assuming the labels are 0/1 -- TODO confirm).
Val_nb_tpr, Val_nb_fpr = tpr[1], fpr[1]

print(str_TPR, Val_nb_tpr)
print(str_FPR, Val_nb_fpr)
print(str_Aire, nb_auc)
f1 = f1_score(test_labels, predicted_labels, pos_label=1, average='binary')
print("F1-score of Naive bayes: ", f1)
# Output recorded by the original author:
# True Positive Rate:  0.6538547486033519
# False Positive Rate:  0.08418300653594771
# l'aire sous la courbe ROC:  0.7848358710337021
# F1-score of Naive bayes: 0.7577366308429367