In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from matplotlib.colors import ListedColormap
from sklearn.model_selection import cross_val_predict
from sklearn import neighbors
import warnings
warnings.filterwarnings('ignore')
from plot_boundaries import plot_boundaries
from IPython.core.display import HTML
# Stylesheet emitted as this cell's rich output. The rule targets the
# `.output_png` class — presumably the notebook's image-output container, so
# that figures appear centered (NOTE(review): confirm for the front-end in use).
output_png_css = """
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
"""
HTML(output_png_css)


# Lecture des datasets

In [None]:
# Read the tab-separated Flame file. Column names are supplied explicitly,
# so pandas treats every line of the file as data.
flame_columns = ["V1", "V2", "class"]
flame = pd.read_csv("./Datasets/flame.txt", delimiter="\t", names=flame_columns)

In [None]:
# Load the credit-card dataset.
# The path is written with forward slashes so it resolves on Windows, Linux
# and macOS alike; the previous backslash-separated form was only a valid
# directory path on Windows.
creditcard = pd.read_csv("Enonce/Datasets_projet_MLDS_M1/creditcard.csv")

In [None]:
# `Dataset` is the working alias used by every following cell; here it points
# at the Flame frame. Print its shape, column dtypes and first rows.
Dataset = flame
print("Les dimensions du dataset")
print(Dataset.shape, "\n--------------------------", sep="")
print("Les types des variables du dataset")
print(Dataset.dtypes, "\n-------------------------", sep="")
print("Head du dataset")
print(Dataset.head())

## Analyse univariée

In [None]:
# Summary statistics of the two feature columns (the label column is left out).
Dataset.loc[:, ["V1", "V2"]].describe()

In [None]:
# Number of rows carrying each of the two class labels.
for label in (1, 2):
    rows_with_label = (Dataset["class"] == label).sum()
    print("Nombre d'attributs de class " + str(label) + " : " + str(rows_with_label))

In [None]:
# Per-class mean of every remaining column.
Dataset.groupby(by="class").agg("mean")

In [None]:
# Overlaid histograms of V1 and V2, drawn on an explicitly created axes and
# written to disk.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=Dataset[["V1", "V2"]], ax=ax)
ax.set_title("Flame Histogramme")
fig.savefig("./Graphs/Flame/Flame_Histogramme.png")

In [None]:
# Kernel-density plot of V1 and V2.
# `sns.displot` is a figure-level function: it always builds its own figure
# (wrapped in a FacetGrid) and ignores any figure created beforehand. The
# previous code saved a separately created, empty figure, so the PNG on disk
# was blank. The figure is now taken from the grid itself, and the intended
# 12x6 size is requested through `height`/`aspect`.
density_grid = sns.displot(Dataset[["V1", "V2"]], kind="kde", fill=True,
                           height=6, aspect=2)
density_grid.ax.set_title('Figure 1 : Flame - Graphe de densité')
fig1 = density_grid.ax.figure  # the figure that actually holds the plot
fig1.savefig(r'./Graphs/Flame/Flame_Density.png')

In [None]:
# Side-by-side box plots of the two features, saved to disk.
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(data=Dataset[["V1", "V2"]], ax=ax)
ax.set_title("Figure 1 : Flame - Box Plot")
fig.savefig("./Graphs/Flame/Flame_BoxPlot.png")

In [None]:
# Pie chart of the class balance (row count per class, read from column V1).
# The axes is created up front and handed to pandas: without an explicit `ax`
# pandas opens a new figure of its own, which left the previously created
# `plt.figure(0)` behind as an empty extra figure in the output.
class_counts = Dataset.groupby('class').count()
fig1, pie_ax = plt.subplots(figsize=(5, 5))
# `explode` has one entry per wedge, so this assumes exactly two classes.
class_counts.plot.pie(y='V1', ax=pie_ax, explode=(0, 0.1), shadow=True,
                      startangle=90, autopct='%1.1f%%')
pie_ax.set_title('Figure 2 : Flame - Pie Plot')
pie_ax.set_ylabel("")
fig1.savefig(r'./Graphs/Flame/Flame_PiePlot1.png')

## Analyse bivariée

In [None]:
# Scatter of V2 against V1, one marker series per class so each class gets
# its own colour and legend entry.
plt.figure(figsize=(12, 6))
marker_only = dict(marker="o", linestyle="")
for name, group in Dataset.groupby("class"):
    plt.plot(group["V1"], group["V2"], label=name, **marker_only)
plt.xlabel("V1")
plt.ylabel("V2")
plt.title("Figure 3 : Flame - Nuage de points")
plt.legend()
plt.savefig("./Graphs/Flame/Flame_ScatterPlot.png")

## Partitionnement du Dataset

In [None]:
# 70 % / 30 % train/validation split with a fixed seed.
seed = 50
features = Dataset[["V1", "V2"]]
target = Dataset[["class"]]  # kept as a one-column DataFrame
split_parts = train_test_split(features, target, test_size=0.3, random_state=seed)
# Renumber every part from 0 so the shuffled original row labels are dropped.
X_train, X_validation, Y_train, Y_validation = (
    part.reset_index(drop=True) for part in split_parts
)

## Application des algorithmes de classification

### Application du KNN en faisant varier la valeur de K

In [None]:
# Grid search over K = 1..20 for KNN, scored by accuracy with 10-fold CV
# on the training split.
num_folds = 10
scoring = 'accuracy'
voisins = list(range(1, 21))
param_grid = {"n_neighbors": voisins}
model = KNeighborsClassifier()
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")

# Per-candidate mean / standard deviation of the CV score; `means` and
# `params` are reused by the plotting cell that follows.
search_summary = grid_result.cv_results_
means = search_summary['mean_test_score']
stds = search_summary['std_test_score']
params = search_summary['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

In [None]:
# Mean cross-validated score as a function of K.
# The frame is built column-wise so K keeps its integer dtype; stacking the
# two sequences as rows and transposing forced both columns to float.
# Indexing the key directly (instead of `.get(..., "")`) makes a missing
# `n_neighbors` entry fail loudly rather than inject an empty string.
k_values = [param["n_neighbors"] for param in params]
df = pd.DataFrame({"K": k_values, "score moyen du test": means})
plt.figure(figsize=(15, 8))
sns.lineplot(data=df, x="K", y="score moyen du test")
plt.title("Figure 4 : Scores moyens de précision en variant le K")
plt.savefig(r'./Graphs/Flame/Flame_ScoresVarK.png')

In [None]:
n_neighbors = 1

# Training features and label side by side as one NumPy array: the first two
# columns are the features, the remaining column is the label. The `2:` slice
# keeps `y` two-dimensional (a single column). `X` and `y` are reused by the
# later decision-boundary cells.
flameTrainList = pd.concat([X_train, Y_train.reindex(X_train.index)], axis=1).to_numpy()
X, y = flameTrainList[:, :2], flameTrainList[:, 2:]

clf_knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights="distance").fit(X, y)

# `plot_boundaries` comes from the local `plot_boundaries` module; presumably
# it draws the classifier's decision regions and writes them to the given
# path — verify against that module.
plot_boundaries(
    X, y, clf_knn,
    "Figure 5 : Frontières de décision du KNN",
    "V1", "V2",
    "./Graphs/Flame/Flame_LimitesDicKnn.png",
)

In [None]:
# Score the fitted KNN model on the validation split.
Y_pred_knn = clf_knn.predict(X_validation)
knn_accuracy = accuracy_score(y_true=Y_validation, y_pred=Y_pred_knn)
knn_confusion = confusion_matrix(Y_validation, Y_pred_knn)
knn_report = classification_report(Y_validation, Y_pred_knn)
print("- Accuracy score \n", knn_accuracy, sep="")
print("\n- Confusion matrix \n ", knn_confusion, sep="")
print("\n- Classification report \n ", knn_report, sep="")

### Application du classifieur bayésien naïf

In [None]:
# Gaussian naive Bayes: fit on the training split, score on the validation split.
clf_BN = GaussianNB().fit(X_train, Y_train)
Y_pred_BN = clf_BN.predict(X_validation)

nb_accuracy = accuracy_score(Y_validation, Y_pred_BN)
nb_confusion = confusion_matrix(Y_validation, Y_pred_BN)
nb_report = classification_report(Y_validation, Y_pred_BN)
print("- Accuracy score \n", nb_accuracy, sep="")
print("\n- Confusion matrix \n ", nb_confusion, sep="")
print("\n- Classification report \n ", nb_report, sep="")

In [None]:
# Refit naive Bayes on the NumPy training arrays and pass it to the
# boundary-plot helper.
clf_BN = GaussianNB().fit(X, y)

plot_boundaries(
    X, y, clf_BN,
    "Figure 6 : Frontières de décision du BN",
    "V1", "V2",
    "./Graphs/Flame/Flame_LimitesDicBN.png",
)

### Application de CART

In [None]:
# CART: fit on the training split and predict the validation split.
# `random_state` pins the estimator's internal randomness so the tree and
# its scores are the same on every Restart & Run All; `seed` is the value
# defined in the partitioning cell.
clf_CART = DecisionTreeClassifier(random_state=seed)
clf_CART.fit(X_train, Y_train)
Y_pred2 = clf_CART.predict(X_validation)

In [None]:
# Draw the fitted CART tree and save it.
# The title is set BEFORE the figure is written: previously `savefig` ran
# first, so the PNG on disk had no title.
from sklearn.tree import plot_tree  # NOTE(review): belongs in the top import cell

feature_cols = ["V1", "V2"]
fig, ax = plt.subplots(figsize=(25, 20))
# class_names are given in ascending label order (1, then 2).
plot_tree(clf_CART, feature_names=feature_cols, class_names=["1", "2"],
          filled=True, ax=ax)
ax.set_title("CART Plot")
fig.savefig(r'./Graphs/Flame/Flame_ArbreCart.png')

In [None]:
# Refit CART on the NumPy training arrays for the decision-boundary figure.
# `random_state=seed` makes the fitted tree — and therefore the saved figure —
# reproducible from one run to the next.
clf_CART = DecisionTreeClassifier(random_state=seed)
clf_CART.fit(X, y)

plot_boundaries(X, y, clf_CART, "Figure 7 : Frontières de décision de CART",
                "V1", "V2", r'./Graphs/Flame/Flame_LimitesDicCART.png')

In [None]:
# Validation metrics for CART. `Y_pred2` holds the predictions computed in an
# earlier cell by the tree that was fitted on X_train.
cart_accuracy = accuracy_score(y_true=Y_validation, y_pred=Y_pred2)
cart_confusion = confusion_matrix(Y_validation, Y_pred2)
cart_report = classification_report(Y_validation, Y_pred2)
print("- Accuracy score \n", cart_accuracy, sep="")
print("\n- Confusion matrix \n ", cart_confusion, sep="")
print("\n- Classification report \n ", cart_report, sep="")

### Application du RF

In [None]:
# Random forest fitted on the NumPy training arrays, then passed to the
# boundary-plot helper.
# `random_state=seed` fixes the bootstrap samples and feature draws so the
# forest, its figure and the validation scores computed from `clf_RF`
# afterwards are reproducible.
clf_RF = RandomForestClassifier(random_state=seed)
clf_RF.fit(X, y)

# Title typo fixed ("Fontières" -> "Frontières") to match Figures 5-7.
plot_boundaries(X, y, clf_RF, "Figure 8 : Frontières de décision de RF",
                "V1", "V2", r'./Graphs/Flame/Flame_LimitesDicRf.png')

In [None]:
# Score the fitted random forest on the validation split.
Y_pred = clf_RF.predict(X_validation)
rf_accuracy = accuracy_score(y_true=Y_validation, y_pred=Y_pred)
rf_confusion = confusion_matrix(Y_validation, Y_pred)
rf_report = classification_report(Y_validation, Y_pred)
print("- Accuracy score \n", rf_accuracy, sep="")
print("\n- Confusion matrix \n ", rf_confusion, sep="")
print("\n- Classification report \n ", rf_report, sep="")

### Application du LDA

In [None]:
# LDA fitted on the NumPy training arrays, then passed to the boundary-plot
# helper.
clf_LDA = LinearDiscriminantAnalysis()
clf_LDA.fit(X, y)

# Title typo fixed ("Fontières" -> "Frontières") to match Figures 5-7.
plot_boundaries(X, y, clf_LDA, "Figure 9 : Frontières de décision de LDA",
                "V1", "V2", r'./Graphs/Flame/Flame_LimitesDecLDA.png')

In [None]:
# LDA: fit on the training split, score on the validation split.
clf_LDA = LinearDiscriminantAnalysis().fit(X_train, Y_train)
Y_pred_LDA = clf_LDA.predict(X_validation)

lda_accuracy = accuracy_score(Y_validation, Y_pred_LDA)
lda_confusion = confusion_matrix(Y_validation, Y_pred_LDA)
lda_report = classification_report(Y_validation, Y_pred_LDA)
print("- Accuracy score \n", lda_accuracy, sep="")
print("\n- Confusion matrix \n ", lda_confusion, sep="")
print("\n- Classification report \n ", lda_report, sep="")

### Comparaison des algorithmes de classification

In [None]:
# 10-fold cross-validated accuracy of every classifier on the training split.
num_folds = 10
scoring = 'accuracy'

# CART and RF are seeded with the notebook-wide `seed` so their fold scores
# are reproducible; the other three estimators are built with their defaults.
models = [
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('RF', RandomForestClassifier(random_state=seed)),
    ('LDA', LinearDiscriminantAnalysis()),
]

# Built once: an unshuffled KFold yields the same folds for every model,
# so there is no need to recreate it inside the loop.
kfold = KFold(n_splits=num_folds)

results = []  # one array of per-fold scores per model
names = []    # model labels, in the same order as `results`
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


In [None]:
# One column per model, one row per CV fold.
results_df = pd.DataFrame(results, index=names).T
results_df

In [None]:
# Box plot of the per-fold scores, one box per model.
palette = sns.color_palette("Set2")
fig, ax = plt.subplots(figsize=(15, 8))
bplot = sns.boxplot(data=results_df, width=0.5, palette=palette, ax=ax)
# Title spelling fixed: the English "Comparison" became the French
# "Comparaison", matching the section header and the rest of the notebook.
bplot.set_title('Figure 10 : Comparaison des algorithmes')
ax.set_ylabel("Scores de précision par test")
fig.savefig(r'./Graphs/Flame/Flame_compAlgos.png')