In [32]:
# librairies

# data
import numpy as np
import pandas as pd

# modelisation
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

### <font color="lightgreen">1 - Import et vérification des données</font>

In [33]:
# Load the labelled reference table and the banknotes to classify.
# Both paths are relative to the notebook's working directory.
data_prod = pd.read_csv("df_final.csv")
data_test = pd.read_csv("banknote_test.csv")  # TODO: confirm the file name before running

# The K-means variant used all six dimensions (diagonal, height_left,
# height_right, margin_low, margin_up, length); it is disabled in this notebook.

# Features kept for the logistic regression. Per the original author's note,
# these are the variables that were statistically significant (p-value < 5%).
# Declared once so the train / prod / test matrices cannot drift apart.
logreg_features = ['height_right', 'margin_low', 'margin_up', 'length']

x_train_logreg = data_prod[logreg_features].values
y_train = data_prod['is_genuine']
x_prod_logreg = data_prod[logreg_features].values  # same rows and columns as x_train_logreg
x_test_logreg = data_test[logreg_features].values

# Every column of the reference table except the target column.
dimensions = data_prod.columns.drop('is_genuine')

print("Table test: \n")

# True when at least one row of the test table contains a missing value.
# DataFrame.isnull is an alias of DataFrame.isna, so a single mask serves both
# the "manquantes" and the "nulles" reports below (they can never disagree).
has_missing_rows = bool(data_test.isna().any(axis=1).any())

if has_missing_rows:
    print("!!! Valeurs manquantes !!!")
else:
    print("Pas de valeurs manquantes.")

if has_missing_rows:
    print("!!! Valeurs nulles !!! ")
else:
    print("Pas de valeurs nulles.")

# A row counts as a duplicate when it repeats an earlier row on every column.
if data_test.duplicated().any():
    print("!!! Doublons dans la table !!!")
else:
    print("Pas de doublons.")

# Expected (rows, columns) of the input file.
# NOTE(review): the row count is hard-coded, so a file with any other number of
# banknotes is reported as badly formatted -- confirm this is intended.
expected_test_shape = (5, 7)
if data_test.shape != expected_test_shape:
    print("Mauvais format de donnees.")
else:
    print("Format de donnees ok. \n")

# Report the size of the training matrix and of its target vector.
print(f"Nous avons x_train_logreg de forme {x_train_logreg.shape} et y_train de forme {y_train.shape}")

# Count the observations in each class of the target and show them as a
# two-column table: one row per class, [class value, count].
unique_yt, counts_yt = np.unique(y_train, return_counts=True)
class_mix = np.asarray((unique_yt, counts_yt)).T
print("True/False mix dans le training set pour la regression logistique:\n", class_mix)

# Report the size of the matrix that will be scored.
print(f"Nous avons x_test_logreg de forme {x_test_logreg.shape}.")

# Centre and scale the logistic-regression features.
# The scaler learns its statistics from the training matrix only, so nothing
# about the test set leaks into the preprocessing.
std_scaler_logreg = StandardScaler()
std_scaler_logreg.fit(x_train_logreg)

# Apply the training-set statistics to both matrices.
x_train_logreg_scaled = std_scaler_logreg.transform(x_train_logreg)
x_test_logreg_scaled = std_scaler_logreg.transform(x_test_logreg)

Table test: 

Pas de valeurs manquantes.
Pas de valeurs nulles.
Pas de doublons.
Format de donnees ok. 

Nous avons x_train_logreg de forme (1500, 4) et y_train de forme (1500,)
True/False mix dans le training set pour la regression logistique:
 [[   0  500]
 [   1 1000]]
Nous avons x_test_logreg de forme (5, 4).


### <font color="lightgreen">2 - Détecteur par Régression Logistique</font>

#### <font color='lightgreen'> 2.1 - Entrainement du modèle</font>

In [34]:
# Fit the final classifier on the standardised training set using the tuned
# hyper-parameters.
# penalty=None disables regularisation, and scikit-learn ignores C in that case
# (it emits a UserWarning when a non-default C is supplied). The tuned value
# C=0.001 was therefore inert and is left out: the fitted model is the same and
# the warning disappears.
best_params_saga = {'class_weight': None, 'max_iter': 500, 'penalty': None, 'random_state': 42,
                    'solver': 'saga'}
estimator_final = LogisticRegression(**best_params_saga)
# y_train is a Series, hence already one-dimensional: no ravel() needed.
estimator_final.fit(x_train_logreg_scaled, y_train.to_numpy())
# In-sample predictions on the training set.
y_pred_final = estimator_final.predict(x_train_logreg_scaled)



#### <font color='lightgreen'> 2.2 - Application du modèle aux données de production</font>

In [35]:
# Score the test banknotes; work on a copy so data_test itself is left untouched.
results = data_test.copy()

# Column 1 of predict_proba is the probability of estimator_final.classes_[1].
# NOTE(review): this assumes classes_[1] is the "genuine" class -- confirm.
results['proba'] = estimator_final.predict_proba(x_test_logreg_scaled)[:, 1]

# A single vectorised comparison yields a proper boolean column. The previous
# pair of masked .loc assignments created the column as object dtype.
results['labels_pred_reglog'] = results['proba'] > 0.5
results

Unnamed: 0,diagonal,height_left,height_right,margin_low,margin_up,length,id,proba,labels_pred_reglog
0,171.76,104.01,103.54,5.21,3.3,111.42,A_1,4.2e-05,False
1,171.87,104.17,104.13,6.0,3.31,112.09,A_2,4e-06,False
2,172.0,104.58,104.29,4.99,3.39,111.57,A_3,1.9e-05,False
3,172.49,104.55,104.34,4.44,3.03,113.2,A_4,0.996669,True
4,171.65,103.63,103.56,3.77,3.16,113.33,A_5,0.999988,True


#### <font color='lightgreen'> 2.3 - Impression des résultats</font>

In [36]:
# Print one verdict line per banknote, keyed by its reference id.
print("Resultats:\n")
for banknote_id, predicted_label in zip(results["id"], results["labels_pred_reglog"]):
    verdict = "vrai" if predicted_label == 1 else "faux"
    print(f"Le billet ref. {banknote_id} est un {verdict} billet.")

Resultats:

Le billet ref. A_1 est un faux billet.
Le billet ref. A_2 est un faux billet.
Le billet ref. A_3 est un faux billet.
Le billet ref. A_4 est un vrai billet.
Le billet ref. A_5 est un vrai billet.
