In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import neighbors
from imblearn.over_sampling import SMOTENC
from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict
import warnings
warnings.filterwarnings('ignore')
import io
import plotly.offline as py #visualization
py.init_notebook_mode(connected=True) #visualization
import plotly.graph_objs as go #visualization
import plotly.tools as tls #visualization
import plotly.figure_factory as ff #visualization
from IPython.core.display import HTML
# Inject a small CSS rule for the `.output_png` class so that PNG outputs are
# laid out as a centred table cell (horizontally and vertically).
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
""")

# Lecture des datasets

In [None]:
# Read the tab-delimited Visa Premier file into a DataFrame; `Dataset` is the
# working name used by the rest of the notebook.
VisaPremier = pd.read_csv(r"./Datasets/VisaPremier.txt", delimiter="\t")
Dataset = VisaPremier

# Dataset overview

In [None]:
# Work on an independent copy: the preprocessing cell writes into
# Dataset["cartevpr"] in place, which would otherwise also alter the raw
# `VisaPremier` frame because both names pointed at the same object.
Dataset = VisaPremier.copy()

# Quick structural summary: size, column names, total number of missing
# cells, number of distinct values per column and the dtype of each column.
n_rows, n_columns = Dataset.shape
print ("Rows     : ", n_rows)
print ("Columns  : ", n_columns)
print ("\nFeatures : \n", Dataset.columns.tolist())
print ("\nMissing values :  ", Dataset.isnull().sum().values.sum())
print ("\nUnique values :  \n", Dataset.nunique())
print ("\nTypes values :  \n", Dataset.dtypes)

### Head du Dataset

In [None]:
# First five rows, columns 0 to 15.
Dataset.head(5).iloc[:, :16]

In [None]:
# First five rows, columns 16 to 30.
Dataset.head(5).iloc[:, 16:31]

In [None]:
# First five rows, columns 31 to the end.
Dataset.head(5).iloc[:, 31:]

# Data preprocessing

In [None]:
# Recode the binary target as "Yes"/"No" labels for the exploratory plots.
Dataset["cartevpr"] = Dataset["cartevpr"].replace({1: "Yes", 0: "No"})

# Remove columns that are not used in the analysis.
# NOTE(review): 'sexe' and 'cartevp' look like duplicates of 'sexer' and
# 'cartevpr' - confirm against the data dictionary.
Dataset = Dataset.drop(['sexe'], axis=1)
Dataset = Dataset.drop(['cartevp'], axis=1)

# Drop column that contains only zeros
Dataset = Dataset.drop(['nbimpaye'], axis=1)

# Correct wrong values: anything that cannot be parsed as a number becomes 0.
# Each column is converted from ITSELF. The previous code rebuilt 'departem'
# from 'agemvt', silently replacing the department with a copy of 'agemvt'.
for numeric_col in ("nbpaiecb", "agemvt", "departem"):
    Dataset[numeric_col] = pd.to_numeric(Dataset[numeric_col], errors='coerce').fillna(0)

#Age to categorical column
def age_lab(data) :
    """Return the age-bracket label of one record.

    Parameters
    ----------
    data : mapping or pandas.Series
        Record exposing a numeric value under the key "age".

    Returns
    -------
    str or None
        "Age_15-24" (age below 25), "Age_25-34", "Age_35-44", "Age_45-54" or
        "Age_gt_55" (age 55 and above). None when the age compares false
        against every bound (e.g. NaN).
    """
    age = data["age"]

    # Brackets are contiguous half-open intervals. The first bound used to be
    # `<= 24`, so a fractional age such as 24.5 matched no branch and yielded None.
    if age < 25:
        return "Age_15-24"
    elif age < 35:
        return "Age_25-34"
    elif age < 45:
        return "Age_35-44"
    elif age < 55:
        return "Age_45-54"
    elif age >= 55:
        return "Age_gt_55"
    # Reached only when every comparison is False (NaN age).
    return None

# Keep the numeric ages aside (they are restored before modelling), then
# replace the column by its bracket label, computed row by row.
SauvAge = Dataset["age"].copy()
Dataset["age"] = Dataset.apply(age_lab, axis=1)

# Card holders vs. non-holders, used by the bivariate plots.
is_holder = Dataset["cartevpr"] == "Yes"
is_non_holder = Dataset["cartevpr"] == "No"
possCarte = Dataset[is_holder]
NonPossCarte = Dataset[is_non_holder]

# Column roles: identifier, target, categorical and numerical features.
Id_col     = ['matricul']
target_col = ["cartevpr"]

cat_cols = [
    "ptvente", "age", "sexer", "sitfamil", "csp", "codeqlt",
]
num_cols = [
    "departem", "anciente", "mtrejet", "nbopguic", "moycred3", "aveparmo",
    "endette", "engagemt", "engagemc", "engagemm", "nbcptvue", "moysold3",
    "moycredi", "agemvt", "nbop", "mtfactur", "engageml", "nbvie", "mtvie",
    "nbeparmo", "mteparmo", "nbeparlo", "mteparlo", "nblivret", "mtlivret",
    "nbeparlt", "mteparlt", "nbeparte", "mteparte", "nbbon", "mtbon",
    "nbpaiecb", "nbcb", "nbcbptar", "avtscpte", "aveparfi", "nbjdebit",
]

# Analyse univariée

In [None]:
# Bar chart of the target classes, annotated with each class's share (%).
#
# The counts are put in a column explicitly named "count": the name pandas
# gives to the value_counts() result changed in pandas 2.0, so selecting it
# as y='cartevpr' is not portable across versions.
df = (
    Dataset["cartevpr"]
    .replace({1: "Yes", 0: "No"})
    .value_counts()
    .to_frame(name="count")
)

# Draw on an explicit Axes. The previous plt.figure(...) call opened an empty
# figure that pandas then ignored, leaving a blank output and wrong size.
fig, ax = plt.subplots(figsize=(12, 8))
df.plot.bar(y="count", ax=ax, rot=0, legend=False,
            color=[(0.8, 0.4, 0.2, 0.6), (0.2, 0.4, 0.6, 0.6)])
ax.set_title("Figure 31 : Répartition des classes")
ax.set_xlabel("Possesssion carte Visa Premier")
ax.set_ylabel("count")

# Write the percentage of the whole dataset at mid-height of each bar.
rects = ax.patches
for r in rects:
    height = r.get_height()
    ax.text(r.get_x() + r.get_width() / 2,
            height / 2, str(round(height / Dataset.shape[0] * 100, 2)) + "%",
            ha='center', va='bottom')

# NOTE(review): the file name mentions "ScoresVarK" although the figure shows
# the class distribution - confirm the intended output name.
fig.savefig(r'./Graphs/VISA/VISA_ScoresVarK.png')
plt.show()

# Analyse bivariée

### Possession de la carte en fonction des variables qualitatives

In [None]:
def plot_pie(column) :
    """Show two side-by-side donut charts of `column`.

    The left donut is built from the value counts of `column` in `possCarte`,
    the right one from the value counts in `NonPossCarte`. The figure is
    rendered inline with plotly.

    Parameters
    ----------
    column : str
        Name of the column whose distribution is plotted.
    """
    outline = dict(line = dict(width = 2, color = "rgb(243,243,243)"))

    def _donut(frame, x_range, trace_name):
        # One donut trace from the value counts of `column` in `frame`.
        counts = frame[column].value_counts()
        return go.Pie(values    = counts.values.tolist(),
                      labels    = counts.keys().tolist(),
                      hoverinfo = "label+percent+name",
                      domain    = dict(x = x_range),
                      name      = trace_name,
                      marker    = outline,
                      hole      = .6)

    holders_trace     = _donut(possCarte, [0, .48], "Clients possCarte")
    non_holders_trace = _donut(NonPossCarte, [.52, 1], "Clients NonPossCarte")

    # Caption written in the hole of each donut.
    captions = [
        dict(text = "Clients PossCarte", font = dict(size = 13),
             showarrow = False, x = .15, y = .5),
        dict(text = "Clients NonPossCarte", font = dict(size = 13),
             showarrow = False, x = .88, y = .5),
    ]

    layout = go.Layout(dict(title = "Répartition du " + column,
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            title_x = 0.5,
                            annotations = captions))

    py.iplot(go.Figure(data = [holders_trace, non_holders_trace], layout = layout))
# -------------------

# Draw the pair of donut charts for every categorical column.
for cat_col in cat_cols:
    plot_pie(cat_col)

### Possession de la carte en fonction des variables quantitatives

In [None]:
def histogram(column) :
    """Show overlaid percentage histograms of `column` for both client groups.

    One trace is built from `possCarte[column]`, the other from
    `NonPossCarte[column]`; both use the "percent" normalisation. The figure
    is rendered inline with plotly.

    Parameters
    ----------
    column : str
        Name of the column whose distribution is plotted.
    """
    def _bars(frame, trace_name):
        # Percentage histogram of `column` for one group of clients.
        return go.Histogram(x        = frame[column],
                            histnorm = "percent",
                            name     = trace_name,
                            marker   = dict(line = dict(width = .5, color = "black")),
                            opacity  = .9)

    def _axis(axis_title):
        # Shared axis styling; only the title differs between x and y.
        return dict(gridcolor = 'rgb(255, 255, 255)',
                    title = axis_title,
                    zerolinewidth = 1,
                    ticklen = 5,
                    gridwidth = 2)

    traces = [_bars(possCarte, "Clients PossCarte"),
              _bars(NonPossCarte, "Clients NonPossCarte")]

    layout = go.Layout(dict(title = "Répartition du " + column,
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            title_x = 0.5,
                            xaxis = _axis(column),
                            yaxis = _axis("percent")))

    py.iplot(go.Figure(data = traces, layout = layout))
    
# Draw the histogram pair for every numerical column.
for num_col in num_cols:
    histogram(num_col)

### Corrélation

In [None]:
# Correlation matrix of the numerical columns, leaving out the first entry of
# `num_cols`, displayed as a plotly heatmap.
heat_cols = num_cols[1:]
corrMatrix = Dataset[heat_cols].corr()

trace1 = go.Heatmap(x = heat_cols,
                    y = heat_cols,
                    z = corrMatrix.values.tolist())
data = [trace1]

# Both axes share the same grid/tick styling.
axis_style = dict(gridcolor = 'rgb(255, 255, 255)',
                  zerolinewidth = 1,
                  ticklen = 5,
                  gridwidth = 2)

layout = go.Layout(dict(title = "Figure 32 : Heatmap sur les variables quantitatives",
                        plot_bgcolor  = "rgb(255, 255, 255)",
                        paper_bgcolor = "rgb(255, 255, 255)",
                        title_x = 0.5,
                        title_y = 0.86,
                        title_font_color = "rgb(0, 0, 0)",
                        xaxis = dict(axis_style),
                        yaxis = dict(axis_style)))

fig = go.Figure(data = data, layout = layout)
py.iplot(fig)


### Drop columns with high correlation

In [None]:
# Select the strict upper triangle of the correlation matrix so that each pair
# of columns is inspected once. The builtin `bool` is used as dtype: the
# `np.bool` alias was deprecated and then removed from NumPy.
upper_mask = np.triu(np.ones(corrMatrix.shape), k=1).astype(bool)
upper = corrMatrix.where(upper_mask)

# Find features whose correlation with an earlier column exceeds 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Keep a snapshot with all columns, then drop the redundant features from the
# working frame and from the list of numerical columns.
oldData = Dataset.copy()
Dataset = Dataset.drop(columns=to_drop)
num_cols = [col for col in num_cols if col not in to_drop]

# Show the correlations between the dropped columns and two retained ones.
# NOTE: `colHighCorr` is the same list object as `to_drop`, so the two extra
# names end up in both (kept as in the original code).
# NOTE(review): "moycred3" and "mtvie" are presumably the columns the dropped
# ones correlate with - confirm.
colHighCorr = to_drop
colHighCorr.extend(["moycred3", "mtvie"])
oldData[to_drop].corr()

# Partitionnement en X y

In [None]:
# The identifier carries no predictive information.
Dataset = Dataset.drop(columns=['matricul'])

# Back to a numeric target. `map` yields an integer column directly instead of
# relying on `replace` to downcast an object column.
Dataset["cartevpr"] = Dataset["cartevpr"].map({"Yes": 1, "No": 0})

# Restore the numeric ages saved before the bracket labelling.
Dataset["age"] = SauvAge.copy()

# X is an independent frame (not a slice of Dataset), so the encoders applied
# later can assign into it without SettingWithCopy issues.
X = Dataset.drop(columns=["cartevpr"])
Y = Dataset[["cartevpr"]]

# Encodage des données catégorielles

In [None]:
# Integer-encode the categorical columns. The same encoder object is refitted
# on each column in turn, so after the loop it holds the classes of 'codeqlt'.
le = preprocessing.LabelEncoder()
for encoded_col in ('ptvente', 'sitfamil', 'csp', 'codeqlt'):
    X[encoded_col] = le.fit_transform(X[encoded_col])

# Normalisation

In [None]:
# 'departem' is treated as categorical and 'age' (numeric again) as numerical.
# The membership guards make the cell safe to re-run: the bare remove/append
# calls raised ValueError or duplicated entries on a second execution.
if 'departem' in num_cols:
    num_cols.remove('departem')
if 'departem' not in cat_cols:
    cat_cols.append('departem')
if 'age' in cat_cols:
    cat_cols.remove('age')
if 'age' not in num_cols:
    num_cols.append('age')

X_num = X[num_cols]

# Standardise the numerical columns (zero mean, unit variance).
scaler = StandardScaler().fit(X_num)
rescaledX_num = scaler.transform(X_num)

# Reuse X's index: with a fresh RangeIndex, concat(axis=1) aligns on labels
# and would introduce NaN rows whenever X's index is not 0..n-1.
rescaledX_num = pd.DataFrame(rescaledX_num, columns=num_cols, index=X.index)
rescaledX = pd.concat([X[cat_cols], rescaledX_num], axis=1)

# Equilibrage des classes

In [None]:
# Positions of the categorical columns inside `rescaledX`; SMOTENC needs them
# to treat these features as categories rather than interpolating them.
cat_features = [rescaledX.columns.get_loc(col) for col in cat_cols]

# Oversample the minority class. A fixed seed makes the synthetic samples (and
# every score computed from them) reproducible between runs.
smote = SMOTENC(sampling_strategy = 'minority',
                categorical_features = cat_features,
                random_state = 42)

# `fit_resample` is the current imbalanced-learn API; the former `fit_sample`
# alias has been removed from the library.
X_sm, Y_sm = smote.fit_resample(rescaledX, Y)

In [None]:
# Bar chart of the target classes after oversampling, annotated with each
# class's share (%).
#
# The counts are put in a column explicitly named "count": the name pandas
# gives to the value_counts() result changed in pandas 2.0, so selecting it
# as y='cartevpr' is not portable across versions.
df = (
    Y_sm["cartevpr"]
    .replace({1: "Yes", 0: "No"})
    .value_counts()
    .to_frame(name="count")
)

# Draw on an explicit Axes. The previous plt.figure(...) call opened an empty
# figure that pandas then ignored, leaving a blank output and wrong size.
fig, ax = plt.subplots(figsize=(7, 7))
df.plot.bar(y="count", ax=ax, rot=0, legend=False,
            color=[(0.8, 0.4, 0.2, 0.6), (0.2, 0.4, 0.6, 0.6)])
ax.set_title("Figure 33 : Nouvelle répartition des classes")
ax.set_xlabel("Possesssion carte Visa Premier")
ax.set_ylabel("count")

# Write the percentage of the resampled set at mid-height of each bar.
rects = ax.patches
for r in rects:
    height = r.get_height()
    ax.text(r.get_x() + r.get_width() / 2,
            height / 2, str(round(height / Y_sm.shape[0] * 100, 2)) + "%",
            ha='center', va='bottom')

fig.savefig(r'./Graphs/VISA/VISA_rep50.png')
plt.show()

# Application des algorithmes de classification

### Application du KNN en variant la valeur K

In [None]:
# Distance-weighted nearest-neighbour classifier (k = 1), evaluated with
# 10-fold cross-validated predictions on the balanced data.
n_neighbors = 1
clf_KNN = neighbors.KNeighborsClassifier(n_neighbors, weights="distance")

# Shuffle before splitting: without it the folds follow the row order of the
# resampled set, which is not random (the oversampler appears to append the
# synthetic rows after the original ones - verify), giving unrepresentative folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# scikit-learn expects a 1-D target; Y_sm is a single-column table.
y_true = np.ravel(Y_sm)
Y_pred_KNN = cross_val_predict(clf_KNN, X_sm, y_true, cv=kfold)

# NOTE(review): oversampling was done before the split, so synthetic points
# derived from test rows can sit in the training folds - scores are optimistic.
print("- Accuracy score \n" + str(accuracy_score(y_true, Y_pred_KNN)))
print("\n- Confusion matrix \n " + str(confusion_matrix(y_true, Y_pred_KNN)))
print("\n- Classification report \n " + str(classification_report(y_true, Y_pred_KNN)))

### Application du classifieur bayésien naïf

In [None]:
# Gaussian naive Bayes, evaluated with 10-fold cross-validated predictions on
# the balanced data.
clf_BN = GaussianNB()

# Shuffle before splitting: without it the folds follow the row order of the
# resampled set, which is not random (the oversampler appears to append the
# synthetic rows after the original ones - verify), giving unrepresentative folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# scikit-learn expects a 1-D target; Y_sm is a single-column table.
y_true = np.ravel(Y_sm)
Y_pred_BN = cross_val_predict(clf_BN, X_sm, y_true, cv=kfold)

print("- Accuracy score \n" + str(accuracy_score(y_true, Y_pred_BN)))
print("\n- Confusion matrix \n " + str(confusion_matrix(y_true, Y_pred_BN)))
print("\n- Classification report \n " + str(classification_report(y_true, Y_pred_BN)))

### Application de CART

In [None]:
# CART decision tree, evaluated with 10-fold cross-validated predictions on
# the balanced data. The tree is seeded so that results are reproducible.
clf_CART = DecisionTreeClassifier(random_state=42)

# Shuffle before splitting: without it the folds follow the row order of the
# resampled set, which is not random (the oversampler appears to append the
# synthetic rows after the original ones - verify), giving unrepresentative folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# scikit-learn expects a 1-D target; Y_sm is a single-column table.
y_true = np.ravel(Y_sm)
Y_pred_CART = cross_val_predict(clf_CART, X_sm, y_true, cv=kfold)

print("- Accuracy score \n" + str(accuracy_score(y_true, Y_pred_CART)))
print("\n- Confusion matrix \n " + str(confusion_matrix(y_true, Y_pred_CART)))
print("\n- Classification report \n " + str(classification_report(y_true, Y_pred_CART)))

### Application du RF

In [None]:
# Random forest, evaluated with 10-fold cross-validated predictions on the
# balanced data. The forest is seeded so that results are reproducible.
clf_RF = RandomForestClassifier(random_state=42)

# Shuffle before splitting: without it the folds follow the row order of the
# resampled set, which is not random (the oversampler appears to append the
# synthetic rows after the original ones - verify), giving unrepresentative folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# scikit-learn expects a 1-D target; Y_sm is a single-column table.
y_true = np.ravel(Y_sm)
Y_pred_RF = cross_val_predict(clf_RF, X_sm, y_true, cv=kfold)

print("- Accuracy score \n" + str(accuracy_score(y_true, Y_pred_RF)))
print("\n- Confusion matrix \n " + str(confusion_matrix(y_true, Y_pred_RF)))
print("\n- Classification report \n " + str(classification_report(y_true, Y_pred_RF)))

### Application du LDA

In [None]:
# Linear discriminant analysis, evaluated with 10-fold cross-validated
# predictions on the balanced data.
clf_LDA = LinearDiscriminantAnalysis()

# Shuffle before splitting: without it the folds follow the row order of the
# resampled set, which is not random (the oversampler appears to append the
# synthetic rows after the original ones - verify), giving unrepresentative folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# scikit-learn expects a 1-D target; Y_sm is a single-column table.
y_true = np.ravel(Y_sm)
Y_pred_LDA = cross_val_predict(clf_LDA, X_sm, y_true, cv=kfold)

print("- Accuracy score \n" + str(accuracy_score(y_true, Y_pred_LDA)))
print("\n- Confusion matrix \n " + str(confusion_matrix(y_true, Y_pred_LDA)))
print("\n- Classification report \n " + str(classification_report(y_true, Y_pred_LDA)))

### Comparaison des algorithmes de classification

#### Accuracy

In [None]:
# Compare the classifiers (default hyper-parameters) on 10-fold
# cross-validated accuracy.
num_folds = 10
scoring = 'accuracy'

# Tree-based models are seeded so that the comparison is reproducible.
models = [
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('RF', RandomForestClassifier(random_state=42)),
    ('LDA', LinearDiscriminantAnalysis()),
]
results = []
names = []

# scikit-learn expects a 1-D target; Y_sm is a single-column table.
y_true = np.ravel(Y_sm)

for name, model in models:
    # Shuffled folds with a fixed seed: every model is scored on the same
    # splits, and the folds no longer follow the row order of the resampled set.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    cv_results = cross_val_score(model, X_sm, y_true, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Mean score followed by its standard deviation across folds.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


In [None]:
# One column per model, one row per cross-validation fold.
results_df = pd.DataFrame(dict(zip(names, results)))
results_df

In [None]:
# Box plot of the per-fold accuracy of each model, saved to disk.
fig, ax = plt.subplots(figsize=(15, 8))
bplot = sns.boxplot(data=results_df, width=0.5, ax=ax)
bplot.set_title('Figure 34 : Accuracy Algorithm Comparison')
ax.set_ylabel("Accuracy")
fig.savefig(r'./Graphs/VISA/VISA_Accuracy.png')

#### F-measure

In [None]:
# Compare the classifiers (default hyper-parameters) on 10-fold
# cross-validated weighted F1 score.
num_folds = 10
scoring = 'f1_weighted'

# Tree-based models are seeded so that the comparison is reproducible.
models = [
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('RF', RandomForestClassifier(random_state=42)),
    ('LDA', LinearDiscriminantAnalysis()),
]
results = []
names = []

# scikit-learn expects a 1-D target; Y_sm is a single-column table.
y_true = np.ravel(Y_sm)

for name, model in models:
    # Shuffled folds with a fixed seed: every model is scored on the same
    # splits, and the folds no longer follow the row order of the resampled set.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    cv_results = cross_val_score(model, X_sm, y_true, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Mean score followed by its standard deviation across folds.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# One column per model, one row per cross-validation fold.
results_df = pd.DataFrame(dict(zip(names, results)))
results_df

In [None]:
# Box plot of the per-fold F-measure of each model, saved to disk.
fig, ax = plt.subplots(figsize=(15, 8))
bplot = sns.boxplot(data=results_df, width=0.5, ax=ax)
bplot.set_title('Figure 35 : F-measure Algorithm Comparison')
ax.set_ylabel("F-measure")
fig.savefig(r'./Graphs/VISA/VISA_Fmeasure.png')