In [None]:
# --- Data handling and static plotting ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Modelling, resampling and metrics ---
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import neighbors
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import precision_recall_curve
# plot_precision_recall_curve no longer exists in recent scikit-learn releases;
# importing it unconditionally would abort this cell (and therefore the whole
# notebook). Keep the name defined either way so old and new versions both run.
try:
    from sklearn.metrics import plot_precision_recall_curve
except ImportError:
    plot_precision_recall_curve = None
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import ClusterCentroids
from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict

# --- Silence library warnings for the rest of the notebook ---
import warnings
warnings.filterwarnings('ignore')
import io

# --- Interactive plotting (plotly, notebook mode) ---
import plotly.offline as py #visualization
py.init_notebook_mode(connected=True) #visualization
import plotly.graph_objs as go #visualization
import plotly.tools as tls #visualization
import plotly.figure_factory as ff #visualization
# (seaborn is already imported as `sns` at the top of this cell.)

# Centre PNG outputs in the rendered notebook. This must stay the last
# expression of the cell so that the HTML object is actually displayed.
from IPython.core.display import HTML
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
""")

# Lecture des datasets

In [None]:
# Read the raw transactions. `Dataset` is an independent copy rather than a
# second name for the same object: later cells overwrite columns of `Dataset`
# ("Class", "Amount"), and `creditcard` should keep the values read from disk.
creditcard = pd.read_csv(r"./Datasets/creditcard.csv")
Dataset = creditcard.copy()
creditcard.head()

# Dataset overview

In [None]:
# Structural summary of the frame: size, column names, total number of missing
# cells, number of distinct "Time" values and the dtype of every column.
n_rows, n_cols = Dataset.shape
overview = (
    ("Rows     : ", n_rows),
    ("Columns  : ", n_cols),
    ("\nFeatures : \n", Dataset.columns.tolist()),
    ("\nMissing values :  ", Dataset.isnull().sum().values.sum()),
    ("\nUnique values :  \n", Dataset[["Time"]].nunique()),
    ("\nTypes values :  \n", Dataset.dtypes),
)
for caption, value in overview:
    print(caption, value)

### Head du Dataset

In [None]:
# First five rows, columns 0 to 15 (the frame is too wide to show at once).
Dataset.head(5).iloc[:, :16]

In [None]:
# First five rows, columns 16 to 30 (second half of the wide frame).
Dataset.head(5).iloc[:, 16:31]

# Data preprocessing

In [None]:
# Recode the 1/0 target as "Yes"/"No" labels; the comparisons and plots of the
# exploratory section below filter on these strings.
class_labels = {1: "Yes", 0: "No"}
Dataset["Class"] = Dataset["Class"].replace(class_labels)

# Amount to categorical column
def amount_lab(data):
    """Return the amount-bracket label for one transaction.

    Parameters
    ----------
    data : mapping or pandas.Series
        Any object indexable by ``"Amount"`` that yields a number.

    Returns
    -------
    str or None
        The label of the half-open bracket ``[lower, upper)`` containing the
        amount, e.g. ``"Amount_51-100"`` for 51 <= amount < 101. Amounts of
        3501 and above map to ``"Amount_gt_3500"``. ``None`` is returned when
        no comparison succeeds (a NaN amount).
    """
    amount = data["Amount"]

    # (exclusive upper bound, label), in increasing order. The first bracket
    # ends strictly below 51 so that an amount of exactly 51 lands in
    # "Amount_51-100", as its label says, instead of "Amount_0-50".
    brackets = (
        (51, "Amount_0-50"),
        (101, "Amount_51-100"),
        (501, "Amount_101-500"),
        (1001, "Amount_501-1000"),
        (1501, "Amount_1001-1500"),
        (2001, "Amount_1501-2000"),
        (2501, "Amount_2001-2500"),
        (3001, "Amount_2501-3000"),
        (3501, "Amount_3001-3500"),
    )
    for upper_bound, label in brackets:
        if amount < upper_bound:
            return label

    # Explicit test rather than a bare fallback: NaN fails every comparison
    # and must keep returning None instead of being labelled as a large amount.
    if amount >= 3501:
        return "Amount_gt_3500"
    return None
    
# Keep the numeric amounts: later cells restore them from this copy once the
# categorical view is no longer needed.
SauvAmount = Dataset["Amount"].copy()

# Label every transaction with its amount bracket. amount_lab only reads the
# "Amount" field, so apply it to that single column instead of building a
# full-width row object for every record of the frame.
Dataset["Amount"] = Dataset[["Amount"]].apply(amount_lab, axis=1)

# Separating fraudulent and non-fraudulent transactions
fraud = Dataset[Dataset["Class"] == "Yes"]
NonFraud = Dataset[Dataset["Class"] == "No"]

# Separating categorical and numerical columns
target_col = ["Class"]
cat_cols = ["Amount"]
num_cols = ["Time"]

# Analyse univariée

In [None]:
# Bar chart of the class balance, each bar annotated with its share of rows.
df = pd.DataFrame(Dataset["Class"].replace({1:"Yes",0:"No"}).value_counts())

# Create the figure and axes explicitly and hand the axes to pandas. A bare
# plt.figure() followed by DataFrame.plot leaves an empty 12x8 figure behind,
# because the DataFrame plot opens a figure of its own.
fig, ax = plt.subplots(figsize=(12, 8))

# Plot the single count column by position: depending on the pandas version
# value_counts() names it "Class" or "count", so y='Class' is not portable.
# Plotting it as a Series also applies the two colours one per bar.
df.iloc[:, 0].plot.bar(ax=ax, rot=0,
                       color=[(0.8, 0.4, 0.2, 0.6), (0.2, 0.4, 0.6, 0.6)])
ax.set_title("Figure 36 : Répartition des classes")
# The bars are the fraud label; same axis name as the resampled-class figure.
ax.set_xlabel("Fraud")
ax.set_ylabel("count")

# Write the percentage of all rows in the middle of each bar.
rects = ax.patches
for r in rects:
    height = r.get_height()
    ax.text(r.get_x() + r.get_width() / 2,
            height / 2, str(round(height/Dataset.shape[0]*100, 2))+"%",
            ha='center', va='bottom')

fig.savefig(r'./Graphs/CartCred/CartCred_repClasses.png')
plt.show()

# Analyse bivariée

### Fraudes en fonction des variables qualitatives

In [None]:
def plot_pie(column):
    """Display two side-by-side donut charts of `column`.

    The left donut shows the value counts of `column` among the rows of the
    module-level `fraud` frame, the right one among the rows of `NonFraud`.
    The figure is rendered with plotly's offline `iplot`; nothing is returned.

    Parameters
    ----------
    column : str
        Name of the column to summarise; it is also appended to the title.
    """
    border = dict(line=dict(width=2, color="rgb(243,243,243)"))

    def _donut(frame, trace_name, x_span):
        # One donut: category counts of `column` within `frame`.
        counts = frame[column].value_counts()
        return go.Pie(
            values=counts.values.tolist(),
            labels=counts.keys().tolist(),
            hoverinfo="label+percent+name",
            domain=dict(x=x_span),
            name=trace_name,
            marker=border,
            hole=.6,
        )

    def _caption(text, x_pos):
        # Text annotation placed at mid-height of a donut.
        return dict(text=text, font=dict(size=13), showarrow=False,
                    x=x_pos, y=.5)

    layout = go.Layout(
        dict(
            title="Répartition du " + column,
            plot_bgcolor="rgb(243,243,243)",
            paper_bgcolor="rgb(243,243,243)",
            title_x=0.5,
            annotations=[_caption("Fraud", .15), _caption("NonFraud", .88)],
        )
    )
    traces = [
        _donut(fraud, "Fraud", [0, .48]),
        _donut(NonFraud, "NonFraud", [.52, 1]),
    ]
    py.iplot(go.Figure(data=traces, layout=layout))
# -------------------

# Draw the fraud / non-fraud donut pair for each categorical column.
for categorical_column in cat_cols:
    plot_pie(categorical_column)

### Corrélation

In [None]:
# Put the numeric amounts back before computing correlations.
Dataset["Amount"] = SauvAmount.copy()

# Pairwise correlations over the first 30 columns of the frame.
quantitative = Dataset.iloc[:, :30]
corrMatrix = quantitative.corr()
axis_labels = quantitative.columns.tolist()

trace1 = go.Heatmap(x=axis_labels,
                    y=list(axis_labels),
                    z=corrMatrix.values.tolist())
data = [trace1]

# Both axes share the same styling; each gets its own copy of the settings.
axis_style = dict(gridcolor='rgb(255, 255, 255)',
                  zerolinewidth=1,
                  ticklen=5,
                  gridwidth=2)
layout = go.Layout(
    dict(
        title="Figure 37 : Heatmap sur les variables quantitatives",
        plot_bgcolor="rgb(255,255,255)",
        paper_bgcolor="rgb(255,255,255)",
        title_x=0.5,
        title_y=0.86,
        title_font_color="rgb(0, 0, 0)",
        xaxis=dict(axis_style),
        yaxis=dict(axis_style),
    )
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)


# Séparation des variables explicatives (X) et de la cible (Y)

In [None]:
# Back to a numeric target for modelling. The explicit astype(int) pins the
# dtype: replacing strings in an object column otherwise relies on pandas
# silently downcasting the result, and an object-typed target is not something
# the estimators below should be handed. It also makes the line safe to re-run
# when the column already holds 0/1.
Dataset["Class"] = Dataset["Class"].replace({"Yes":1,"No":0}).astype(int)
# Restore the numeric amounts (the categorical labels were only for plotting).
Dataset["Amount"] = SauvAmount.copy()

# X: every column except the target. Y: the target, kept as a one-column frame
# because later cells index it with Y["Class"]-style access.
X = Dataset.loc[:, Dataset.columns != "Class"]
Y = Dataset[["Class"]]

# Normalisation

In [None]:
# Drop "Time" and standardise "Amount"; all other columns are left untouched.
# errors="ignore" lets the cell be re-run after "Time" has already been removed.
X = X.drop(columns=['Time'], errors="ignore")
X_num = X[["Amount"]].copy()
rescaledX = X.drop(['Amount'], axis=1)

# NOTE(review): the scaler is fitted on every row, before the train/validation
# split further down, so validation rows contribute to the mean and variance.
# Confirm whether this is acceptable for the reported scores.
scaler = StandardScaler().fit(X_num)
rescaledX_num = scaler.transform(X_num)
# Reuse the original row labels: pd.concat(axis=1) aligns on the index, so a
# fresh 0..n-1 index would introduce NaN rows whenever X is not indexed 0..n-1.
rescaledX_num = pd.DataFrame(rescaledX_num, columns=["Amount"], index=X_num.index)
rescaledX = pd.concat([rescaledX, rescaledX_num], axis=1)

X = rescaledX.copy()

# Equilibrage des classes

In [None]:
# Balance the classes with combined over- and under-sampling (SMOTE + ENN).
# The resampler is stochastic: fix its seed so that the resampled data, and
# every score computed from it, is identical from one run to the next.
# 50 is the same value the train/validation split uses for its `seed`.
# NOTE(review): resampling happens before the train/validation split, so the
# validation set is drawn from resampled data rather than original rows only.
# Confirm this is intended.
overUnderSampling = SMOTEENN(random_state=50)
X_nm, y_nm = overUnderSampling.fit_resample(X, Y)

In [None]:
# Bar chart of the class balance after resampling, annotated with percentages.
df = pd.DataFrame(y_nm["Class"].replace({1:"Yes",0:"No"}).value_counts())

# Create the figure and axes explicitly and hand the axes to pandas. A bare
# plt.figure() followed by DataFrame.plot leaves an empty 7x7 figure behind,
# because the DataFrame plot opens a figure of its own.
fig, ax = plt.subplots(figsize=(7,7))

# Plot the single count column by position: depending on the pandas version
# value_counts() names it "Class" or "count", so y='Class' is not portable.
# Plotting it as a Series also applies the two colours one per bar.
df.iloc[:, 0].plot.bar(ax=ax, rot=0,
                       color=[(0.8, 0.4, 0.2, 0.6), (0.2, 0.4, 0.6, 0.6)])
ax.set_title("Figure 38 : Nouvelle répartition des classes")
ax.set_xlabel("Fraud")
ax.set_ylabel("count")

# Write the percentage of all resampled rows in the middle of each bar.
rects = ax.patches
for r in rects:
    height = r.get_height()
    ax.text(r.get_x() + r.get_width() / 2,
            height / 2, str(round(height/y_nm.shape[0]*100, 2))+"%",
            ha='center', va='bottom')

fig.savefig(r'./Graphs/CartCred/CartCred_rep50.png')
plt.show()

In [None]:
# Make sure both resampled objects are DataFrames with the expected column
# names (same feature names as X, a single "Class" column for the target).
feature_names = X.columns.to_list()
X_nm = pd.DataFrame(X_nm, columns=feature_names)
y_nm = pd.DataFrame(y_nm, columns=['Class'])

# Partitionnement

In [None]:
seed = 50

# 70 % / 30 % split of the resampled data. Each of the four parts is
# re-indexed from 0 so that rows are addressed by position afterwards.
X_train, X_validation, Y_train, Y_validation = (
    part.reset_index(drop=True)
    for part in train_test_split(X_nm, y_nm, test_size=0.3, random_state=seed)
)

# Application des algorithmes de classification

### Application du KNN

In [None]:
n_neighbors = 1

# Fit a distance-weighted k-nearest-neighbours classifier on the training
# part, then score its predictions on the validation part.
clf_KNN = KNeighborsClassifier(n_neighbors=n_neighbors, weights="distance")
clf_KNN.fit(X_train, Y_train)
Y_pred_KNN = clf_KNN.predict(X_validation)

knn_report = (
    ("- Accuracy score \n", accuracy_score(Y_validation, Y_pred_KNN)),
    ("\n- Confusion matrix \n ", confusion_matrix(Y_validation, Y_pred_KNN)),
    ("\n- Classification report \n ", classification_report(Y_validation, Y_pred_KNN)),
)
for heading, metric in knn_report:
    print(heading + str(metric))

### Application du classifieur bayésien naïf

In [None]:
# Gaussian naive Bayes: fit on the training part, evaluate on validation.
clf_BN = GaussianNB().fit(X_train, Y_train)
Y_pred_BN = clf_BN.predict(X_validation)

bn_accuracy = accuracy_score(Y_validation, Y_pred_BN)
bn_confusion = confusion_matrix(Y_validation, Y_pred_BN)
bn_report = classification_report(Y_validation, Y_pred_BN)

print(f"- Accuracy score \n{bn_accuracy!s}")
print(f"\n- Confusion matrix \n {bn_confusion!s}")
print(f"\n- Classification report \n {bn_report!s}")

### Application de CART

In [None]:
# Decision tree (CART): fit on the training part, evaluate on validation.
# The tree breaks ties between equally good splits at random, so it is seeded
# with the notebook's `seed` to make the reported scores reproducible.
clf_CART = DecisionTreeClassifier(random_state=seed)
clf_CART.fit(X_train,Y_train)
Y_pred_CART = clf_CART.predict(X_validation)

print("- Accuracy score \n" + str(accuracy_score(Y_validation, Y_pred_CART)))
print("\n- Confusion matrix \n " + str(confusion_matrix(Y_validation, Y_pred_CART)))
print("\n- Classification report \n " + str(classification_report(Y_validation, Y_pred_CART)))

### Application du RF

In [None]:
# Random forest: fit on the training part, evaluate on validation.
# Bootstrap sampling and feature sub-sampling are random, so the forest is
# seeded with the notebook's `seed` to make the reported scores reproducible.
clf_RF = RandomForestClassifier(random_state=seed)
# Pass the target as a flat 1-D array: a one-column frame makes the forest
# emit a DataConversionWarning (currently hidden by the global warnings filter).
clf_RF.fit(X_train, Y_train.values.ravel())
Y_pred_RF = clf_RF.predict(X_validation)

print("- Accuracy score \n" + str(accuracy_score(Y_validation, Y_pred_RF)))
print("\n- Confusion matrix \n " + str(confusion_matrix(Y_validation, Y_pred_RF)))
print("\n- Classification report \n " + str(classification_report(Y_validation, Y_pred_RF)))

### Application du LDA

In [None]:
# Linear discriminant analysis: fit on the training part, evaluate on validation.
clf_LDA = LinearDiscriminantAnalysis()
clf_LDA.fit(X_train, Y_train)
Y_pred_LDA = clf_LDA.predict(X_validation)

lda_accuracy = accuracy_score(Y_validation, Y_pred_LDA)
lda_confusion = confusion_matrix(Y_validation, Y_pred_LDA)
lda_report = classification_report(Y_validation, Y_pred_LDA)

print("- Accuracy score \n", lda_accuracy, sep="")
print("\n- Confusion matrix \n ", lda_confusion, sep="")
print("\n- Classification report \n ", lda_report, sep="")

### Comparaison des algorithmes de classification

In [None]:
# 10-fold cross-validated accuracy of each classifier on the training part.
num_folds = 10
scoring = 'accuracy'

# The tree-based models are randomised; seed them with the notebook's `seed`
# so that the table and the boxplot built from `results` are reproducible.
models = [
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('RF', RandomForestClassifier(random_state=seed)),
    ('LDA', LinearDiscriminantAnalysis()),
]
results = []
names = []

# The splitter does not depend on the model (unshuffled folds), so build it
# once: every model is then scored on exactly the same folds.
kfold = KFold(n_splits=num_folds)
# Flat 1-D target: a one-column frame triggers a conversion warning in several
# estimators on every fold (currently hidden by the global warnings filter).
y_train_1d = Y_train.values.ravel()

for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train_1d, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # name: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


In [None]:
# One column per model, one row per cross-validation fold.
results_df = pd.DataFrame(results, index=names).T
results_df

In [None]:
# Distribution of the per-fold accuracies, one box per model.
accuracy_fig, accuracy_ax = plt.subplots(figsize=(15,8))
bplot = sns.boxplot(data=results_df, width=0.5, ax=accuracy_ax)
bplot.set_title("Figure 39 : Comparison des algorithmes en terme d'Accuracy")
bplot.set_ylabel('Accuracy')
accuracy_fig.savefig(r'./Graphs/CartCred/CartCred_AccuracyCompAlgos.png')

In [None]:
methods = [clf_KNN, clf_BN, clf_CART, clf_RF]
methodsName = ['KNN', 'BN', 'CART', 'RF']

# One precision-recall curve per fitted classifier, computed on the validation
# part from the predicted probability of the second class (column 1).
for fitted_model, curve_label in zip(methods, methodsName):
    positive_scores = fitted_model.predict_proba(X_validation)[:, 1]
    precision, recall, _ = precision_recall_curve(Y_validation, positive_scores)
    plt.plot(recall, precision, marker='.', label=curve_label)

# The legend only needs to be built once, after all curves are drawn.
plt.legend()
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Figure 40 : AUPRC Comparaison')
plt.savefig(r'./Graphs/CartCred/CartCred_PrRcCompAlgos.png')
plt.show()