# Imports

In [None]:
import pathlib

import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import numpy as np
import heapq
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
import seaborn as sns



from datetime import date

from anonymizedf.anonymizedf import anonymize
from yaml import load, Loader, dump
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from tensorflow.keras.models import Sequential, load_model
from keras.layers.core import Dense, Activation
from tensorflow.keras.layers import LSTM, Dropout, Embedding, Bidirectional
from tensorflow.keras.optimizers import RMSprop
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score

# Constantes

In [None]:
# Read the application settings from app.yaml. The context manager closes the
# handle as soon as parsing is finished (it used to stay open for the whole
# session).
# NOTE(review): Loader is PyYAML's unrestricted loader and can instantiate
# arbitrary Python objects - only acceptable while app.yaml is a trusted file.
with open("app.yaml", 'r') as yaml_file:
    yaml_content = load(yaml_file, Loader=Loader)

# Directory into which the notebook writes its fitted artefacts.
MODELS_DIR = yaml_content["MODELS_DIR"]


# Text file that receives one class label per line.
PATH_CLASSES = MODELS_DIR + '/categories.txt'

# Fonctions

In [None]:
def evaluation_metrics(y_pred, y_test):
    """Print accuracy, macro precision and macro recall, and plot a confusion heatmap.

    Parameters:
    -----------
    y_pred (array-like): per-class scores, one row per sample; the predicted
        class of a sample is the argmax of its row
    y_test (array-like of int): true class code of each sample

    Returns:
    --------
    tuple: (accuracy, precision, recall) as fractions; precision and recall
        are macro-averaged
    """
    y_true = np.asarray(y_test)
    y_pred = np.argmax(y_pred, axis=1)

    # scikit-learn metrics expect (y_true, y_pred). With the arguments swapped
    # accuracy is unaffected, but precision and recall trade places.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="macro")
    recall = recall_score(y_true, y_pred, average="macro")
    print(f'accuracy: {accuracy*100:.2f}%')
    print(f'precision: {precision*100:.2f}%')
    print(f'recall: {recall*100:.2f}%')

    # Class codes are mapped back to labels through the category order of the
    # notebook-level df3["Sequence_1"] column.
    labels = df3["Sequence_1"].astype('category').cat.categories
    fig, ax = plt.subplots(figsize=(15,10))
    # Rows are the true labels; normalize='index' makes every row sum to 1.
    sns.heatmap(pd.crosstab(labels[y_true],
                            labels[y_pred],
                            normalize='index'),
                cmap='vlag',
                ax=ax)
    return accuracy, precision, recall


In [None]:
def KMeans_Algorithm(vec, n):
    """Fit K-Means with ``n`` clusters on ``vec`` and summarise the fit.

    Args:
        vec (array): samples to cluster
        n (int): number of clusters

    Returns:
        tuple: (inertia, label, centroids, silhouette_score) where ``label``
        holds the cluster index of every sample and ``centroids`` the fitted
        cluster centres
    """
    # The seed is fixed so that repeated calls with the same input agree.
    model = KMeans(
        n_clusters=n,
        init='k-means++',
        max_iter=150,
        algorithm="elkan",
        random_state=234,
    ).fit(vec)

    assignments = model.labels_
    return (
        model.inertia_,
        assignments,
        model.cluster_centers_,
        metrics.silhouette_score(vec, assignments),
    )

In [None]:
def plot_history(history):
    """Plot training and validation accuracy, then loss, per epoch.

    Parameters:
    ----------
    history (keras.callbacks.History) : the model training history; its
        ``history`` dict must hold 'accuracy', 'val_accuracy', 'loss' and
        'val_loss'

    Returns:
    --------
    None
    """
    logs = history.history
    train_acc, valid_acc = logs['accuracy'], logs['val_accuracy']
    train_loss, valid_loss = logs['loss'], logs['val_loss']
    x_axis = range(len(train_acc))

    panels = [
        (train_acc, valid_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        (train_loss, valid_loss, 'Training loss', 'Validation loss',
         'Training and validation loss'),
    ]
    for position, (train, valid, train_label, valid_label, heading) in enumerate(panels):
        # The first panel draws on the current figure; later ones open a new one.
        if position:
            plt.figure()
        plt.plot(x_axis, train, 'b', label=train_label)
        plt.plot(x_axis, valid, 'r', label=valid_label)
        plt.title(heading)
        plt.legend()
    plt.show()

# Chargement base de données

In [None]:
# Load the raw export. The file is read as UTF-16 and cells that contain only
# two or three spaces are treated as missing values.
df = pd.read_csv('../../../Base.csv', 
                #  header=0,
                 encoding='UTF-16',
                 na_values=['  ', '   '])
df.head()

## Anonymisation des données

In [None]:
# Replace the real dossier and contract identifiers with generated ones.
# NOTE(review): the drop below expects fake_ids() to have added the
# "Fake_<column>" columns to df itself - confirm against the anonymizedf docs.
df_an = anonymize(df)
df["Code dossier"] = df_an.fake_ids("Code dossier")["Fake_Code dossier"]
df["Code contrat"] = df_an.fake_ids("Code contrat")["Fake_Code contrat"]
# Remove the helper columns so only the overwritten identifiers remain.
df.drop(columns=["Fake_Code dossier", "Fake_Code contrat"], inplace=True)

In [None]:
df.head(10)

In [None]:
df["Code periodicite de paiement"].unique()

# Analyse exploratoire

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.shape

In [None]:
# Convert the two date columns to datetime (the column name "Date  acte"
# really contains two spaces).
# NOTE(review): no format/dayfirst is passed, so pandas infers the layout -
# confirm it matches the source file.
df["Date de naissance"] = pd.to_datetime(df["Date de naissance"])
df["Date  acte"] = pd.to_datetime(df["Date  acte"])

In [None]:
# Age in completed years: the tuple comparison subtracts one year when this
# year's birthday has not happened yet. The reference date is read once so
# every row is measured against the same day (it used to be re-read three
# times per row).
today = date.today()
df["Age"] = df["Date de naissance"].apply(
    lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day))
)

In [None]:
# Number of distinct non-missing values in each of the first seven columns.
# nunique() ignores missing values by default and, unlike np.unique, neither
# sorts the values (which fails on mixed-type object columns) nor computes
# per-value counts that are then thrown away.
for column in df.columns[:7]:
    print(column, df[column].nunique(), '\n')

In [None]:
# Number of distinct non-missing values in every column from the eighth on.
# nunique() ignores missing values by default and, unlike np.unique, neither
# sorts the values (which fails on mixed-type object columns) nor computes
# per-value counts that are then thrown away.
for column in df.columns[7:]:
    print(column, df[column].nunique(), '\n')

In [None]:
# Split the columns into string-typed ("qualitative") and numeric ones.
# Columns that are neither (e.g. datetimes) end up in no list.
num_var, qual_var = [], []
for column in df.columns:
    values = df[column]
    if is_string_dtype(values):
        qual_var.append(column)
        continue
    if is_numeric_dtype(values):
        num_var.append(column)
print("Numerical variables: ", num_var)
print("\nQualitative variables: ", qual_var)

In [None]:
df.describe().round(2)

In [None]:
plt.figure(1,figsize=(8,5))

(df["Age"].value_counts(normalize=True)*100).plot(kind='bar')
plt.title("Age")

In [None]:
plt.figure(1,figsize=(8,5))

df["Age"].plot(kind='hist')
plt.title("Age histogram")

In [None]:
plt.figure(1,figsize=(8,5))

df['Total montant reglement en devise'].plot(kind='hist')
plt.title("Montant histogram")

In [None]:
plt.figure(1,figsize=(8,5))

df['Solde compte client'].plot(kind='hist')
plt.title("Solde histogram")

In [None]:
plt.figure(1,figsize=(8,5))
plt.boxplot(df["Age"])
plt.title("Age boxplot")

In [None]:
plt.figure(1,figsize=(8,5))
plt.boxplot(df['Total montant reglement en devise'])
plt.title("Montant boxplot")

In [None]:
plt.figure(1,figsize=(8,5))
plt.boxplot(df['Solde compte client'])
plt.title("Solde boxplot")

In [None]:
plt.figure(1,figsize=(8,5))

plt.scatter(df.Age, df['Total montant reglement en devise'])
title = "Age vs Total montant reglement"
plt.title(title)

In [None]:
plt.figure(1,figsize=(8,5))

plt.scatter(df.Age, df['Solde compte client'])
title = "Age vs Solde compte client"
plt.title(title)

In [None]:
(df["Ville de l’adhérent"].value_counts(normalize=True)*100).head(10).plot(kind='bar')
title = "Top 10 des villes"
plt.title(title)

In [None]:
(df["Code produit"].value_counts(normalize=True)*100).head(10).plot(kind='bar')
title = "Top 10 des produits"
plt.title(title)

In [None]:
(df["Code prestation"].value_counts(normalize=True)*100).head(10).plot(kind='bar')
title = "Top 10 des prestations"
plt.title(title)

In [None]:
(df["Code sexe"].value_counts(normalize=True)*100).plot(kind='bar')
title = "Sexe"
plt.title(title)


In [None]:
(df["Code periodicite de paiement"].value_counts(normalize=True)*100).head(10).plot(kind='bar')
title = "Periodicite de paiement"
plt.title(title)

In [None]:
(df["Nature du produit"].value_counts(normalize=True)*100).head(10).plot(kind='bar')
title = "Nature du produit"
plt.title(title)

In [None]:
# Average number of contracts per member
# NOTE(review): grouping on both "Code dossier" and "Code contrat" counts the
# rows of each (dossier, contract) pair, not the distinct contracts of a
# dossier - confirm this is the intended figure.
df.groupby(["Code dossier", "Code contrat"])["Code contrat"].count().mean()

In [None]:
# Minimum number of contracts per member
# NOTE(review): grouping on both "Code dossier" and "Code contrat" counts the
# rows of each (dossier, contract) pair, not the distinct contracts of a
# dossier - confirm this is the intended figure.
df.groupby(["Code dossier", "Code contrat"])["Code contrat"].count().min()

In [None]:
# Maximum number of contracts per member
# NOTE(review): grouping on both "Code dossier" and "Code contrat" counts the
# rows of each (dossier, contract) pair, not the distinct contracts of a
# dossier - confirm this is the intended figure.
df.groupby(["Code dossier", "Code contrat"])["Code contrat"].count().max()

In [None]:
# Average number of contracts per city
# NOTE(review): the grouping key is (city, dossier, contract), so this counts
# the rows of each such triple rather than the contracts of a city - confirm
# this is the intended figure.
df.groupby(["Ville de l’adhérent", "Code dossier", "Code contrat"])["Code contrat"].count().mean()

In [None]:
# Minimum number of contracts per city
# NOTE(review): the grouping key is (city, dossier, contract), so this counts
# the rows of each such triple rather than the contracts of a city - confirm
# this is the intended figure.
df.groupby(["Ville de l’adhérent", "Code dossier", "Code contrat"])["Code contrat"].count().min()

In [None]:
# Maximum number of contracts per city
# NOTE(review): the grouping key is (city, dossier, contract), so this counts
# the rows of each such triple rather than the contracts of a city - confirm
# this is the intended figure.
df.groupby(["Ville de l’adhérent", "Code dossier", "Code contrat"])["Code contrat"].count().max()

# Preprocessing

In [None]:
df2 = pd.DataFrame(columns=df.columns)

In [None]:
df1 = df.drop_duplicates(subset=["Code dossier", "Code prestation"])

In [None]:
# Keep only the dossiers that have more than four rows in df1.
rows_of_large_dossiers = df1.groupby(["Code dossier"])["Code dossier"].filter(lambda x: len(x) > 4)
# Shuffle with a fixed seed before de-duplicating, then materialise as a list.
list_dos = list(rows_of_large_dossiers.sample(frac=1, random_state=234).unique())
df2 = df1[df1["Code dossier"].isin(list_dos)]
df2

In [None]:
df2

In [None]:
df2["Code dossier"].nunique()

In [None]:
# Order the rows by dossier, then contract, then act date, and renumber them
# 0..n-1 so positional labels follow that order.
df2 = (
    df2
    .sort_values(["Code dossier", "Code contrat", "Date  acte"])
    .reset_index(drop=True)
)
df2.head()

In [None]:
# Number of acts that make up one sequence.
number_acte = 5


def _last_distinct_acts(codes, limit):
    """Return the last ``limit`` distinct values of ``codes`` as strings.

    The values are scanned from the end of ``codes``; the result is given back
    in the original row order. Fewer than ``limit`` items are returned when
    ``codes`` does not hold enough distinct values.
    """
    picked = []
    seen = set()
    for code in reversed(codes):
        if code in seen:
            continue
        seen.add(code)
        picked.append(str(code))
        if len(picked) == limit:
            break
    picked.reverse()
    return picked


# For every dossier build three strings from its last `number_acte` distinct
# "Code prestation" values (row order of df2):
#   Sequence       - all codes, each followed by one space
#   Sequence_other - the same without the final code
#   Sequence_1     - the final code alone
# A dossier with fewer than `number_acte` distinct codes gets empty strings.
seq_full = {}
seq_other = {}
seq_last = {}
number_dos_total = df2["Code dossier"].nunique()
acts_by_dossier = df2.groupby("Code dossier", sort=False)["Code prestation"]
for number_dos, (cd_dossier, acts) in enumerate(acts_by_dossier, start=1):
    print(f"--------------------------------------Dossier n° {number_dos} sur {number_dos_total} en cours--------------------------------------")
    tokens = _last_distinct_acts(acts.tolist(), number_acte)
    # Completeness is decided on the number of acts, not on the character
    # length of the joined string, so short codes are not rejected by accident.
    if len(tokens) < number_acte:
        seq_full[cd_dossier] = seq_other[cd_dossier] = seq_last[cd_dossier] = ""
        continue
    seq_full[cd_dossier] = " ".join(tokens) + " "
    # The prefix is built from the token list. Removing the last code with
    # str.replace would also strip it from inside longer codes (e.g. "12"
    # inside "123"). The two trailing spaces keep the layout the replace-based
    # version produced when no such collision occurred.
    seq_other[cd_dossier] = " ".join(tokens[:-1]) + "  "
    seq_last[cd_dossier] = tokens[-1]

# Broadcast the per-dossier strings to every row of that dossier in one pass.
dossier_keys = df2["Code dossier"]
df2["Sequence"] = dossier_keys.map(seq_full).fillna("")
df2["Sequence_other"] = dossier_keys.map(seq_other).fillna("")
df2["Sequence_1"] = dossier_keys.map(seq_last).fillna("")
        

In [None]:
df2.tail(15)

In [None]:
df2.head(15)

In [None]:
df2['Sequence'].nunique()

In [None]:
df2[['Sequence', 'Sequence_other', 'Sequence_1']].tail()

In [None]:
df2["Code dossier"].unique()

In [None]:
df2['Sequence_1'].value_counts(normalize=True)

In [None]:
df2[df2['Sequence_1'] == ""]

In [None]:
# One row per distinct sequence. The explicit copy makes df3 an independent
# frame, so columns added to it later (e.g. "Cluster") do not trigger pandas'
# SettingWithCopyWarning.
df3 = df2.drop_duplicates(subset=['Sequence']).copy()
df3

# TF-IDF

In [None]:
# Fit a TF-IDF vocabulary on the act sequences (one document per row of df3).
# NOTE(review): the vectorizer runs with its default tokenisation settings -
# confirm that lower-casing and the default token pattern keep every act code
# intact.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit(df3['Sequence'])

In [None]:
seq_vec = tfidf.transform(df3['Sequence'])

In [None]:
# Persist the fitted vectorizer. The path is joined with pathlib instead of a
# hard-coded backslash so it works on every OS, and the context manager
# flushes and closes the file.
with open(pathlib.Path(MODELS_DIR) / 'tfidf.p', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [None]:
seq_vec.shape

In [None]:
seq_vec

# PCA

In [None]:
pca = PCA(n_components=20)
pca.fit(seq_vec.toarray())

In [None]:
print(pca.explained_variance_ratio_)

In [None]:
pca.explained_variance_ratio_.sum()

In [None]:
# Persist the fitted PCA. The path is joined with pathlib instead of a
# hard-coded backslash so it works on every OS, and the context manager
# flushes and closes the file.
with open(pathlib.Path(MODELS_DIR) / 'pca.p', 'wb') as pca_file:
    pickle.dump(pca, pca_file)

In [None]:
seq_pca = pca.transform(seq_vec.toarray())

In [None]:
seq_pca.shape

# Clustering

## K-means

In [None]:
# Fit K-Means for 2 to 10 clusters and keep the inertia and silhouette score
# of every fit for the two diagnostic plots.
X_inertia_values, X_silhouette_scores = [], []
for i in range(2, 11):
    print(i)
    X_inertia, X_label, X_centroids, X_silhouette = KMeans_Algorithm(seq_pca, i)
    X_inertia_values += [X_inertia]
    X_silhouette_scores += [X_silhouette]

In [None]:
plt.figure(figsize = (10 ,6))
plt.plot(np.arange(2,11) , X_inertia_values )
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia Values")

In [None]:
plt.figure(figsize=(10,6))
plt.plot(np.arange(2,11), X_silhouette_scores)
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")

In [None]:
df3["Cluster"] = KMeans_Algorithm(seq_pca, 3)[1]

In [None]:
clustering_KMeans = KMeans(n_clusters= 3, random_state=234, init='k-means++', max_iter=150, algorithm = "elkan")
clustering_KMeans.fit(seq_pca)

In [None]:
# Persist the fitted K-Means model. The path is joined with pathlib instead
# of a hard-coded backslash so it works on every OS, and the context manager
# flushes and closes the file.
with open(pathlib.Path(MODELS_DIR) / 'kmeans.p', 'wb') as kmeans_file:
    pickle.dump(clustering_KMeans, kmeans_file)

In [None]:
df3.head()

In [None]:
df3.groupby(["Cluster"])["Age"].describe()

In [None]:
df3.groupby(["Cluster"])["Total montant reglement en devise"].describe()

In [None]:
df3.groupby(["Cluster"])["Solde compte client"].describe()

In [None]:
df3.Cluster.value_counts()

In [None]:
(df3.groupby(["Code sexe"])["Cluster"].value_counts(normalize=True)*100).unstack(level=0).plot(kind='bar')
title = "Sexe"
plt.title(title)

In [None]:
(df3.groupby(["Classe de tarif"])["Cluster"].value_counts(normalize=True)*100).to_csv('cluster_tarif.csv')

In [None]:
df3.groupby(["Cluster"])["Ville de l’adhérent"].value_counts(normalize=True)*100

# RNN

In [None]:
# One-hot encode the category code of each whole "Sequence_other" string,
# i.e. one column per distinct string.
# NOTE(review): "Sequence_1" is filled with "" rather than NaN when the
# sequences are built, so the notna() filter presumably keeps every row -
# confirm whether rows with an empty sequence should be excluded here.
X = to_categorical(df3[df3["Sequence_1"].notna()]["Sequence_other"].astype('category').cat.codes)
len(X)

In [None]:
# One-hot encode the category code of the final act ("Sequence_1"), the
# value the network is trained to predict.
# NOTE(review): "Sequence_1" is filled with "" rather than NaN when the
# sequences are built, so the notna() filter presumably keeps every row and
# "" may appear as a class of its own - confirm.
Y = to_categorical(df3[df3["Sequence_1"].notna()]["Sequence_1"].astype('category').cat.codes)
len(Y)

In [None]:
# Write the class labels, one per line, in category order - the same order as
# the category codes used for the one-hot targets.
class_names = df3[df3["Sequence_1"].notna()]["Sequence_1"].astype('category').cat.categories
# The with-block already closes the file, so no explicit close() is needed.
with open(PATH_CLASSES, 'w') as f:
    f.writelines(f'{name}\n' for name in class_names)

In [None]:
# Hold out 20 % of the samples for validation. X and Y are passed as arrays
# directly (rebuilding them row by row only made a copy), and the split is
# seeded with the same value as the rest of the notebook so runs are
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    np.asarray(X), np.asarray(Y), test_size=0.2, random_state=234
)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
# gpus = tf.config.list_physical_devices('GPU')
# if gpus:
#   # Restrict TensorFlow to only allocate 1GB of memory on the first GPU
#   try:
#     tf.config.set_logical_device_configuration(
#         gpus[0],
#         [tf.config.LogicalDeviceConfiguration(memory_limit=512)])
#     logical_gpus = tf.config.list_logical_devices('GPU')
#     print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
#   except RuntimeError as e:
#     # Virtual devices must be set before GPUs have been initialized
#     print(e)

In [None]:
# Best-effort: hide every GPU from TensorFlow so the model runs on the CPU.
try:
    tf.config.set_visible_devices([], 'GPU')
except (ValueError, RuntimeError) as err:
    # Invalid device, or the runtime is already initialised and the visible
    # devices can no longer be changed. Report it instead of hiding it.
    print(f"Could not hide the GPUs from TensorFlow: {err}")
else:
    visible_devices = tf.config.get_visible_devices()
    # Plain check instead of an assert swallowed by a bare except: the result
    # is now actually reported.
    if any(device.device_type == 'GPU' for device in visible_devices):
        print("Warning: a GPU is still visible to TensorFlow")

In [None]:
# model = Sequential()
# model.add(Embedding(37, 10, input_length=1493))
# model.add(Bidirectional(LSTM(64)))
# model.add(Dropout(0.2))
# model.add(Dense(37, activation='softmax'))

In [None]:
# model = Sequential()
# model.add(LSTM(64, input_shape=(X_train.shape[1],1), return_sequences=True))
# model.add(Dropout(0.2))
# model.add(LSTM(64))
# model.add(Dropout(0.2))
# model.add(Dense(37, activation='softmax'))

In [None]:
# Layer sizes come from the training arrays instead of hard-coded numbers, so
# the model keeps matching the data when the number of distinct sequences or
# classes changes.
n_features = X_train.shape[1]   # width of one one-hot encoded input row
n_classes = y_train.shape[1]    # width of one one-hot encoded target row

model = Sequential()
# Each input row is fed to the LSTM as n_features steps of one value.
model.add(LSTM(64, input_shape=(n_features, 1)))
model.add(Dense(n_classes, activation='softmax'))

In [None]:
# Stop once validation accuracy has not improved for 10 epochs and restore
# the best weights seen.
stop_early = EarlyStopping(
    monitor='val_accuracy',
    mode="max",
    patience=10,
    restore_best_weights=True,
    verbose=1,
)
# Divide the learning rate by 10 when validation loss stalls for 10 epochs,
# never going below 1e-5.
shrink_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=10,
    min_lr=0.00001,
    verbose=1,
)
callbacks = [stop_early, shrink_lr]

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=20,
    batch_size=128,
    callbacks=callbacks,
)

In [None]:
model.summary()

In [None]:
# Save the trained network. The path is joined with pathlib instead of a
# hard-coded backslash so it works on every OS.
model.save(pathlib.Path(MODELS_DIR) / 'rnn.h5')

In [None]:
plot_history(history)

In [None]:
y_pred = model.predict(X_valid)

In [None]:
# Class index of every one-hot validation target, computed in one vectorised
# call instead of a Python loop over the rows.
y_valid_class = np.argmax(y_valid, axis=1)

In [None]:
evaluation_metrics(y_pred, y_valid_class)

# Génération du fichier requirements.txt

In [4]:
!pip freeze | findstr "anonymizedf= imread= keras= matplotlib= numpy= pandas= pickle= pil= pyyaml= PyYAML= scikit-learn= scikit-image= seaborn= streamlit= tensorflow-gpu= yaml=" > ../requirements.txt

In [3]:
# !pip freeze > ../requirements.txt