In [None]:
# ===========================================================================
# DIANI Mamoudou Sékou
#
# Février 2022
#===========================================================================


In [None]:
# Importation des bibliothèques nécessaires
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_selection import mutual_info_classif
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Show every column when a DataFrame is displayed (global pandas option).
pd.set_option('display.max_columns', None)

# Load the prepared dataset from the local CSV file (';'-separated, indexed by
# the "id" column) and display its first 5 rows.
# NOTE(review): the path below is an absolute, machine-specific location.
Pollution_data = pd.read_csv(
    'C:/Users/diani/Desktop/Apprntissage_auto/DataPrepa1.csv',
    sep=';',
    index_col="id",
)
Pollution_data.head(5)

In [None]:
# Z-score normalisation of the input features.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed later in the notebook.
features = [
    "tLengthOfScreenName",
    "tLengthOfDescriptionInUserProfile",
    "tLongevityOfTheAccount",
    "tNumerOfFollowings",
    "tNumberOfFollowers",
    "tSeriesOfNumberOfFollowings",
    "Following.followers",
    "tNumberOfTweets",
    "tNumberOfTweetsByDays",
    "Tweets.theLongevityOfTheAccount",
    "TweetURL.tNumberOfTweets",
    "tNbMoyenUrlByTweet",
    "UsernameInTweet.tNumberOfTweets",
    "TimeMoyenBet2ConseTweet",
    "ValueOfTimeMaxBet2ConseTweet",
]

# Fit the scaler and scale the selected columns in a single step.
std_scale = preprocessing.StandardScaler()
normalized_data = std_scale.fit_transform(Pollution_data[features])

# Target vector taken from the 'Status' column.
labels = np.array(Pollution_data['Status'])

In [None]:
# Keep the 7 most informative features, ranked by mutual information
# (information gain) between each scaled feature and the label.
N_BEST_FEATURES = 7

# Always start from the full scaled matrix: this cell overwrites
# `normalized_data` with the reduced matrix, so reading `normalized_data` here
# would make a second execution of the cell pair 7 scores with the first 7
# of the 15 feature names.
all_scaled = std_scale.transform(Pollution_data[features])

# random_state fixes the estimator's randomness so that the ranking, and
# therefore the selected features, are the same on every run.
mutuals = mutual_info_classif(all_scaled, labels, random_state=42)

# Pair every feature name with its score and sort from most to least informative.
mutual_info = [[name, score] for name, score in zip(features, mutuals)]
res = sorted(mutual_info, key=lambda pair: pair[1], reverse=True)

# Names of the best features, in ranking order.
liste_newFeatures = [name for name, _ in res[:N_BEST_FEATURES]]
print(liste_newFeatures)

# Column index of each selected feature in `features`, in ranking order.
position = [features.index(name) for name in liste_newFeatures]
print(position)

# Reduced data set: only the selected columns, kept in their original column
# order. The result is a 2-D ndarray of shape (n_samples, N_BEST_FEATURES).
normalized_data = all_scaled[:, sorted(position)]



In [None]:
# Split the data into a training set (80 %) and a test set (20 %) with a fixed seed.
train_features, test_features, train_labels, test_labels = train_test_split(
    normalized_data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Performance metrics shared by all classifiers evaluated in this notebook.
def TPR_FPR(test_labels, predicted_labels):
    """Compute per-class TPR and FPR plus the area under the ROC curve.

    Parameters
    ----------
    test_labels : array-like
        Ground-truth labels.
    predicted_labels : array-like
        Labels predicted by a classifier for the same samples.

    Returns
    -------
    tuple
        ``(TPR, FPR, aire)`` where ``TPR`` and ``FPR`` are arrays holding one
        value per row/column of the confusion matrix (each class treated as
        the positive class in turn), and ``aire`` is the area under the ROC
        curve built from ``predicted_labels`` with ``pos_label=1``.
    """
    # Cast once to float so the ratios below are float divisions.
    cnf_matrix = confusion_matrix(test_labels, predicted_labels).astype(float)

    true_pos = np.diag(cnf_matrix)
    false_pos = cnf_matrix.sum(axis=0) - true_pos
    false_neg = cnf_matrix.sum(axis=1) - true_pos
    true_neg = cnf_matrix.sum() - (false_pos + false_neg + true_pos)

    TPR = true_pos / (true_pos + false_neg)
    FPR = false_pos / (false_pos + true_neg)

    # The ROC curve is built from the predicted labels that were passed in
    # (not from probability scores).
    roc_fpr, roc_tpr, _ = roc_curve(test_labels, predicted_labels, pos_label=1)
    aire = metrics.auc(roc_fpr, roc_tpr)
    return (TPR, FPR, aire)

In [None]:
# Prefixes used when printing the metrics of each classifier.
str_TPR = "True Positive Rate: "
str_FPR = "False Positive Rate: "
str_Aire = "l'aire sous la courbe ROC: "

In [None]:
# Decision tree: train on the training set, predict on the test set, then
# report TPR, FPR, the area under the ROC curve and the F1-score.
ad_classifier = DecisionTreeClassifier(random_state=42)
d_tree1 = ad_classifier.fit(train_features, train_labels)
predicted_labels = d_tree1.predict(test_features)

# TPR_FPR returns per-class arrays; index 1 is read as the positive class.
tpr, fpr, ad_auc = TPR_FPR(test_labels, predicted_labels)
Val_ad_tpr = tpr[1]
Val_ad_fpr = fpr[1]
# Computed once (the original cell evaluated the same F1-score twice).
f1 = f1_score(test_labels, predicted_labels, average='binary', pos_label=1)

print(str_TPR, Val_ad_tpr)
print(str_FPR, Val_ad_fpr)
print(str_Aire, ad_auc)
print("F1-score of decision tree: ", f1)
# Output recorded from a previous run:
# True Positive Rate:  0.9068156424581005
# False Positive Rate:  0.10431372549019607
# l'aire sous la courbe ROC:  0.9012509584839523
# F1-score of decision tree: 0.9086430810568741

In [None]:
# Random forest: train on the training set, predict on the test set, then
# report TPR, FPR, the area under the ROC curve and the F1-score.
# random_state is fixed so that the forest, and therefore the metrics, are
# reproducible from one run to the next.
rnd_forest = RandomForestClassifier(random_state=42)
rnd_forest.fit(train_features, train_labels)
predicted_labels = rnd_forest.predict(test_features)

# TPR_FPR returns per-class arrays; index 1 is read as the positive class.
tpr, fpr, rf_auc = TPR_FPR(test_labels, predicted_labels)
Val_rf_tpr = tpr[1]
Val_rf_fpr = fpr[1]
# Computed once (the original cell evaluated the same F1-score twice).
f1 = f1_score(test_labels, predicted_labels, average='binary', pos_label=1)

print(str_TPR, Val_rf_tpr)
print(str_FPR, Val_rf_fpr)
print(str_Aire, rf_auc)
print("F1-score of random forrest: ", f1)
# Output recorded from an earlier run made before the seed was fixed
# (the values obtained now may differ slightly):
# True Positive Rate:  0.9494972067039106
# False Positive Rate:  0.07581699346405228
# l'aire sous la courbe ROC:  0.9368401066199291
# F1-score of random forrest: 0.9427273734352498

In [None]:
# Bagging: train on the training set, predict on the test set, then report
# TPR, FPR, the area under the ROC curve and the F1-score.
# random_state is fixed so that the ensemble, and therefore the metrics, are
# reproducible from one run to the next.
bagging = BaggingClassifier(random_state=42)
bagging_classifier = bagging.fit(train_features, train_labels)
predicted_labels = bagging_classifier.predict(test_features)

# TPR_FPR returns per-class arrays; index 1 is read as the positive class.
tpr, fpr, bag_auc = TPR_FPR(test_labels, predicted_labels)
Val_bag_tpr = tpr[1]
Val_bag_fpr = fpr[1]
print(str_TPR, Val_bag_tpr)
print(str_FPR, Val_bag_fpr)
print(str_Aire, bag_auc)
f1 = f1_score(test_labels, predicted_labels, average='binary', pos_label=1)
print("F1-score of bagging: ", f1)
# Output recorded from an earlier run made before the seed was fixed
# (the values obtained now may differ slightly):
# True Positive Rate:  0.9356424581005587
# False Positive Rate:  0.07111111111111111
# l'aire sous la courbe ROC:  0.9322656734947238
# F1-score of bagging: 0.9343603482920295

In [None]:
# AdaBoost: train on the training set, predict on the test set, then report
# TPR, FPR, the area under the ROC curve and the F1-score.
# random_state is fixed so that the ensemble, and therefore the metrics, are
# reproducible from one run to the next.
adaB = AdaBoostClassifier(random_state=42)
adaB_classifier = adaB.fit(train_features, train_labels)
predicted_labels = adaB_classifier.predict(test_features)

# TPR_FPR returns per-class arrays; index 1 is read as the positive class.
tpr, fpr, adaB_auc = TPR_FPR(test_labels, predicted_labels)
Val_adaB_tpr = tpr[1]
Val_adaB_fpr = fpr[1]
print(str_TPR, Val_adaB_tpr)
print(str_FPR, Val_adaB_fpr)
print(str_Aire, adaB_auc)
f1 = f1_score(test_labels, predicted_labels, average='binary', pos_label=1)
print("F1-score of adaBoost: ", f1)
# Output recorded from an earlier run made before the seed was fixed
# (the values obtained now may differ slightly):
# True Positive Rate:  0.9325139664804469
# False Positive Rate:  0.07477124183006537
# l'aire sous la courbe ROC:  0.9288713623251909
# F1-score of adaBoost: 0.9341840161182

In [None]:
# Gaussian Naive Bayes: train on the training set, predict on the test set,
# then report TPR, FPR, the area under the ROC curve and the F1-score.
bayes_class = GaussianNB()
bayes_classifier = bayes_class.fit(train_features, train_labels)
predicted_labels = bayes_classifier.predict(test_features)

# TPR_FPR returns per-class arrays; index 1 is read as the positive class.
tpr, fpr, nb_auc = TPR_FPR(test_labels, predicted_labels)
Val_nb_tpr, Val_nb_fpr = tpr[1], fpr[1]

print(str_TPR, Val_nb_tpr)
print(str_FPR, Val_nb_fpr)
print(str_Aire, nb_auc)

f1 = f1_score(
    test_labels,
    predicted_labels,
    average='binary',
    pos_label=1,
)
print("F1-score of Naive bayes: ", f1)
# Output recorded from a previous run:
# True Positive Rate:  0.7707262569832403
# False Positive Rate:  0.13333333333333333
# l'aire sous la courbe ROC:  0.8186964618249535
# F1-score of Naive bayes: 0.8178800094854162