# Identification des contrefaçons des billets en euros

In [1]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np
from pickle import load

from sklearn import cluster
from sklearn.cluster import KMeans # KMeans clustering 

from sklearn.linear_model import LogisticRegression

# Modélisation avec K-means

In [7]:

def prediction_kmeans(path) :
    """Assign each banknote of a CSV file to a cluster of the persisted K-means model.

    The CSV at ``path`` is read, every column except ``id`` is scaled with the
    pickled scaler, each row is assigned to a cluster by the pickled K-means
    model, and the cluster labels are translated through the pickled
    ``dict_predictions`` mapping. A textual report is printed along the way.

    The persisted scaler and model are applied as they were saved
    (``transform`` / ``predict``). They are deliberately NOT refitted on the
    incoming file: refitting would recompute the scaling statistics and the
    centroids from the new rows, and the resulting cluster numbering would no
    longer be the one ``dict_predictions`` was built for.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pandas.read_csv``. The file must contain an
        ``id`` column; all remaining columns are fed to the scaler.

    Returns
    -------
    pandas.DataFrame
        One row per banknote, rounded to 5 decimals, with the columns
        ``predictions``, ``id``, the remaining CSV columns, and ``cluster``.

    Raises
    ------
    KeyError
        If the CSV has no ``id`` column.
    """
    print("Chargement du CSV",path)
    print("")
    df_test = pd.read_csv(path)

    # Use the same column selection for the model input and for the column
    # names of the scaled frame, so the two can never disagree.
    X = df_test.drop("id", axis=1)
    features = X.columns

    print("Input columns {}".format(X.columns))
    print("")

    # NOTE(security): pickle can execute arbitrary code while loading; only
    # load asset files that come from a trusted source.
    with open('./Assets/scaler.pkl', 'rb') as fh:
        scaler = load(fh)

    # transform(), not fit_transform(): keep the statistics learned at training time.
    X_scaled = pd.DataFrame(scaler.transform(X), columns=features)
    print("Chargement du scaler:",scaler)
    print("")

    # Load the persisted clustering model.
    with open('./Assets/km_fit.pkl','rb') as fh:
        km_fit = load(fh)
    print("Chargement du modèle utilisé:", km_fit)
    print("")

    # predict(), not fit_transform(): assign rows to the persisted centroids
    # instead of re-clustering the new rows.
    cluster_labels = km_fit.predict(X_scaled)

    with open('./Assets/dict_predictions.pkl','rb') as fh:
        dict_predictions = load(fh)
    predictions = pd.DataFrame(cluster_labels).replace(dict_predictions)

    # Result frame: predictions first, then id and the CSV columns, then the raw cluster label.
    df_resultats = df_test.set_index("id").reset_index()
    df_resultats.insert(0, "predictions", predictions[0].to_numpy())
    df_resultats["cluster"] = cluster_labels
    df_resultats["cluster"] = df_resultats["cluster"].astype(int)

    print("")
    print("**Résultat de la modélisation**")
    print("")
    # The share is computed as "rows not in cluster 1", which presumes label 1
    # is the genuine class -- TODO confirm against dict_predictions.
    n_total = len(df_resultats)
    n_cluster_one = df_resultats["cluster"].sum()
    print('% de faux billets :',round(((n_total - n_cluster_one)*100)/n_total),'%')
    print("")

    print(df_resultats[["id","cluster","predictions"]])
    print("")

    print(df_resultats.groupby(["cluster","predictions"]).agg({"cluster":"count"}))

    return df_resultats.round(5)

In [8]:
# Run the K-means pipeline on ./Data/billets_production.csv; the DataFrame it
# returns is the cell's last expression and is therefore rendered as its output.
prediction_kmeans('./Data/billets_production.csv')

Chargement du CSV ./Data/billets_production.csv

Input columns Index(['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length'],
      dtype='object')

Chargement du scaler: StandardScaler()

Chargement du modèle utilisé: KMeans(n_clusters=2, random_state=42)


**Résultat de la modélisation**

% de faux billets : 60 %

    id  cluster predictions
0  A_1        0       False
1  A_2        0       False
2  A_3        0       False
3  A_4        1        True
4  A_5        1        True

                     cluster
cluster predictions         
0       False              3
1       True               2


Unnamed: 0,predictions,id,diagonal,height_left,height_right,margin_low,margin_up,length,cluster
0,False,A_1,171.76,104.01,103.54,5.21,3.3,111.42,0
1,False,A_2,171.87,104.17,104.13,6.0,3.31,112.09,0
2,False,A_3,172.0,104.58,104.29,4.99,3.39,111.57,0
3,True,A_4,172.49,104.55,104.34,4.44,3.03,113.2,1
4,True,A_5,171.65,103.63,103.56,3.77,3.16,113.33,1


## Nouvelle prédiction

In [18]:
# Same K-means pipeline, applied to ./Data/billets_test.csv.
prediction_kmeans('./Data/billets_test.csv')

Chargement du CSV ./Data/billets_test.csv

Input columns Index(['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length'],
      dtype='object')

Chargement du scaler: StandardScaler()

Chargement du modèle utilisé: KMeans(n_clusters=2, random_state=42)


**Résultat de la modélisation**

% de faux billets : 60 %

    id  cluster predictions
0  B_1        1        True
1  B_2        0       False
2  B_3        1        True
3  B_4        0       False
4  B_5        0       False

                     cluster
cluster predictions         
0       False              3
1       True               2


Unnamed: 0,predictions,id,diagonal,height_left,height_right,margin_low,margin_up,length,cluster
0,True,B_1,172.09,103.95,103.73,4.39,3.09,113.19,1
1,False,B_2,171.52,104.17,104.03,5.27,3.16,111.82,0
2,True,B_3,171.78,103.8,103.75,3.81,3.24,113.39,1
3,False,B_4,172.02,104.08,103.99,5.57,3.3,111.1,0
4,False,B_5,171.79,104.34,104.37,5.0,3.07,111.87,0


# Modélisation avec la Régression logistique

In [9]:
class LogisticRegressionWithThreshold(LogisticRegression):
    """Logistic regression whose ``predict`` accepts an optional decision threshold.

    Without a threshold the class behaves exactly like
    ``sklearn.linear_model.LogisticRegression``. With a threshold, a sample is
    labelled 1 when the probability of the second class (``predict_proba``
    column 1) reaches the threshold, and 0 otherwise.

    NOTE(review): the model printed after loading ``model_P10.pkl`` is an
    instance of this class, so the class presumably has to stay defined in the
    notebook for that pickle to load -- confirm before moving or renaming it.
    """

    def predict(self, X, threshold=None):
        """Predict class labels, optionally with a custom probability threshold.

        Parameters
        ----------
        X : array-like
            Samples, passed unchanged to the base-class methods.
        threshold : float, optional
            Minimum probability of the second class for a sample to be
            labelled 1. When omitted, the base-class ``predict`` is used.

        Returns
        -------
        numpy.ndarray
            Base-class labels when ``threshold`` is None; otherwise an integer
            array of 0/1 flags.
        """
        # Identity check: `== None` can be hijacked by array-like thresholds.
        if threshold is None:
            return super().predict(X)

        positive_proba = super().predict_proba(X)[:, 1]
        # `>=` (the original used `<=`): a higher probability of the positive
        # class must yield label 1, so that threshold=0.5 agrees with the
        # default prediction instead of returning its inverse.
        return (positive_proba >= threshold).astype(int)


In [10]:

def prediction_reg_log(path) :
    """Classify the banknotes of a CSV file with the persisted logistic regression.

    The CSV at ``path`` is read, the feature columns listed in the pickled
    ``model-columns.pkl`` are selected in that order, scaled with the pickled
    scaler and passed to the pickled model. The numeric predictions are also
    translated through the pickled ``dict_predictions`` mapping. A textual
    report is printed along the way.

    Selecting the features through the persisted column list (rather than
    "everything except id") keeps the input aligned with the loaded scaler and
    model even when the CSV has its columns in another order or carries extra
    columns.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pandas.read_csv``. The file must contain an
        ``id`` column and every column named in ``model-columns.pkl``.

    Returns
    -------
    pandas.DataFrame
        One row per banknote with the columns ``pred``, ``id``,
        ``prediction`` followed by the feature columns.

    Raises
    ------
    KeyError
        If ``id`` or one of the persisted feature columns is missing from the CSV.
    """
    print("Chargement du CSV",path)
    print("")
    df_test = pd.read_csv(path)

    # NOTE(security): pickle can execute arbitrary code while loading; only
    # load asset files that come from a trusted source.
    with open('./Assets/model-columns.pkl','rb') as fh:
        columns = load(fh)
    print(columns)

    # Select exactly the persisted feature columns, in their persisted order.
    X = df_test[list(columns)]
    features = X.columns

    print("Input columns {}".format(X.columns))
    print("")

    # Scaler: applied as persisted (transform only, no refit).
    with open('./Assets/scaler.pkl', 'rb') as fh:
        scaler = load(fh)
    X_scaled = pd.DataFrame(scaler.transform(X), columns=features)
    print("Chargement du scaler:",scaler)
    print("")

    # New predictions from the persisted model.
    with open('./Assets/model_P10.pkl', 'rb') as fh:
        model = load(fh)
    Y = model.predict(X_scaled)
    print("Chargement du modèle utilisé:", model)

    with open('./Assets/dict_predictions.pkl','rb') as fh:
        dict_predictions = load(fh)
    predictions = pd.DataFrame(Y).replace(dict_predictions)

    df_resultats = pd.DataFrame({"pred" : Y, "id": df_test["id"]})
    df_resultats["prediction"] = predictions[0].to_numpy()
    resultats = df_resultats.merge(X, left_index=True, right_index=True)

    print("")
    print("")
    print("**Résultat de la modélisation**")
    print("")

    # The share is computed as "rows not predicted 1", which presumes label 1
    # is the genuine class -- TODO confirm against dict_predictions.
    n_total = len(df_resultats)
    n_predicted_one = df_resultats["pred"].sum()
    print('% de faux billets :',round(((n_total - n_predicted_one)*100)/n_total),'%')

    print("")
    print(df_resultats)
    print("")
    print(df_resultats.groupby(["pred","prediction"]).agg({"id":"count"}))
    print("")
    return resultats

In [11]:
# Run the logistic-regression pipeline on ./Data/billets_production.csv; the
# DataFrame it returns is the cell's last expression and is rendered as its output.
prediction_reg_log('./Data/billets_production.csv')

Chargement du CSV ./Data/billets_production.csv

Index(['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length'],
      dtype='object')
Input columns Index(['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length'],
      dtype='object')

Chargement du scaler: StandardScaler()

Chargement du modèle utilisé: LogisticRegressionWithThreshold(penalty='l1', solver='liblinear')


**Résultat de la modélisation**

% de faux billets : 60 %

   pred   id prediction
0     0  A_1      False
1     0  A_2      False
2     0  A_3      False
3     1  A_4       True
4     1  A_5       True

                 id
pred prediction    
0    False        3
1    True         2



Unnamed: 0,pred,id,prediction,diagonal,height_left,height_right,margin_low,margin_up,length
0,0,A_1,False,171.76,104.01,103.54,5.21,3.3,111.42
1,0,A_2,False,171.87,104.17,104.13,6.0,3.31,112.09
2,0,A_3,False,172.0,104.58,104.29,4.99,3.39,111.57
3,1,A_4,True,172.49,104.55,104.34,4.44,3.03,113.2
4,1,A_5,True,171.65,103.63,103.56,3.77,3.16,113.33


## Nouvelle prédiction

In [19]:
# Same logistic-regression pipeline, applied to ./Data/billets_test.csv.
prediction_reg_log('./Data/billets_test.csv')

Chargement du CSV ./Data/billets_test.csv

Index(['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length'],
      dtype='object')
Input columns Index(['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length'],
      dtype='object')

Chargement du scaler: StandardScaler()

Chargement du modèle utilisé: LogisticRegressionWithThreshold(penalty='l1', solver='liblinear')


**Résultat de la modélisation**

% de faux billets : 60 %

   pred   id prediction
0     1  B_1       True
1     0  B_2      False
2     1  B_3       True
3     0  B_4      False
4     0  B_5      False

                 id
pred prediction    
0    False        3
1    True         2



Unnamed: 0,pred,id,prediction,diagonal,height_left,height_right,margin_low,margin_up,length
0,1,B_1,True,172.09,103.95,103.73,4.39,3.09,113.19
1,0,B_2,False,171.52,104.17,104.03,5.27,3.16,111.82
2,1,B_3,True,171.78,103.8,103.75,3.81,3.24,113.39
3,0,B_4,False,172.02,104.08,103.99,5.57,3.3,111.1
4,0,B_5,False,171.79,104.34,104.37,5.0,3.07,111.87
