### Détectection des faux billets avec Python

## 1. Preliminary

     *** Contexte ***

Identification des contrefaçons des billets en euros

Les billets d'euro ont des valeurs nominales de 5, 10, 20, 50, 100, 200 et 500 euros. 

In [23]:
!python -V


Python 3.9.13


## 1.1 Install

In [24]:
#

# 1.2 Importation

## 1.2.1 Importation des librairies

In [25]:
#builtin
import os

In [26]:

#data
import pandas as pd
import numpy as np

In [44]:

#visualisation 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno
%matplotlib inline

In [43]:
#machine learning
from sklearn.linear_model import LinearRegression

## 2.1 chargement des fichiers

In [28]:
input_folder = r"C:\Users\Nada\Desktop\OpenClassRooms\Projet 10\Data\0 Source"
df_orig= pd.read_csv(os.path.join(input_folder, "billets.csv"), dtype=float, sep=';')

## 2. Prétraitement des données

## 2.2 Exploration

In [29]:
#Affichage des 5 premieres lignes
df_orig.head()

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
0,1.0,171.81,104.86,104.95,4.52,2.89,112.83
1,1.0,171.46,103.36,103.66,3.77,2.99,113.09
2,1.0,172.69,104.48,103.5,4.4,2.94,113.16
3,1.0,171.36,103.91,103.94,3.62,3.01,113.51
4,1.0,171.73,104.28,103.46,4.04,3.48,112.54


In [30]:
#Affichage des 5 dernieres lignes
df_orig.tail()

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
1495,0.0,171.75,104.38,104.17,4.42,3.09,111.28
1496,0.0,172.19,104.63,104.44,5.27,3.37,110.97
1497,0.0,171.8,104.01,104.12,5.51,3.36,111.95
1498,0.0,172.06,104.28,104.06,5.17,3.46,112.25
1499,0.0,171.47,104.15,103.82,4.63,3.37,112.07


In [31]:
#Affichage de 5 lignes arbitrairement
df_orig.sample(5)

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
590,1.0,171.85,103.58,104.17,4.32,3.07,113.59
1057,0.0,172.02,104.43,104.37,5.59,3.06,110.35
208,1.0,171.59,103.8,104.08,4.18,3.17,113.04
1125,0.0,172.04,104.26,103.87,5.21,3.49,112.09
114,1.0,172.24,103.97,103.69,4.32,2.98,113.26


In [32]:
#Dimensions du DataFrame
df_orig.shape

(1500, 7)

In [33]:
#Information sur les colonnes
df_orig.dtypes

is_genuine      float64
diagonal        float64
height_left     float64
height_right    float64
margin_low      float64
margin_up       float64
length          float64
dtype: object

In [34]:
#Nombre des valeurs manquantes dans chaque colonne
df_orig.isnull().sum()

is_genuine       0
diagonal         0
height_left      0
height_right     0
margin_low      37
margin_up        0
length           0
dtype: int64

On a 37 valeurs manquantes dans la colonne margin_low

In [35]:
#Statistiques descriptives
df_orig.describe().round(2)

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
count,1500.0,1500.0,1500.0,1500.0,1463.0,1500.0,1500.0
mean,0.67,171.96,104.03,103.92,4.49,3.15,112.68
std,0.47,0.31,0.3,0.33,0.66,0.23,0.87
min,0.0,171.04,103.14,102.82,2.98,2.27,109.49
25%,0.0,171.75,103.82,103.71,4.01,2.99,112.03
50%,1.0,171.96,104.04,103.92,4.31,3.14,112.96
75%,1.0,172.17,104.23,104.15,4.87,3.31,113.34
max,1.0,173.01,104.88,104.95,6.9,3.91,114.44


In [36]:
#Nombre des doublons dans le DataFrame
df_orig.duplicated().sum()

0

In [37]:
#Nombre des doublons sans le target
df_orig.drop(columns="is_genuine").duplicated().sum()

0

In [38]:
#Information sur les valeurs unique dans chaque colonne
df_orig.nunique()

is_genuine        2
diagonal        159
height_left     155
height_right    170
margin_low      285
margin_up       123
length          336
dtype: int64

In [39]:
#Nombre des valeurs dans la target
df_orig['is_genuine'].value_counts()

is_genuine
1.0    1000
0.0     500
Name: count, dtype: int64

In [45]:
#creation de la colonne target
df_orig.rename(columns={'is_genuine':'target'})

Unnamed: 0,target,diagonal,height_left,height_right,margin_low,margin_up,length
0,1.0,171.81,104.86,104.95,4.52,2.89,112.83
1,1.0,171.46,103.36,103.66,3.77,2.99,113.09
2,1.0,172.69,104.48,103.50,4.40,2.94,113.16
3,1.0,171.36,103.91,103.94,3.62,3.01,113.51
4,1.0,171.73,104.28,103.46,4.04,3.48,112.54
...,...,...,...,...,...,...,...
1495,0.0,171.75,104.38,104.17,4.42,3.09,111.28
1496,0.0,172.19,104.63,104.44,5.27,3.37,110.97
1497,0.0,171.80,104.01,104.12,5.51,3.36,111.95
1498,0.0,172.06,104.28,104.06,5.17,3.46,112.25


## 2.3 Nettoyage des données

### 2.3.1 Imputation des valeurs manquantes

In [40]:
df=df_orig.copy()

In [None]:
## 

In [41]:
1/0

ZeroDivisionError: division by zero

In [None]:
# Créer un masque booléen pour identifier les valeurs manquantes
mask_missing_values = df.isna().any(axis=1)

# Utiliser le masque pour indexer le DataFrame et afficher les lignes avec des valeurs manquantes
rows_with_missing_values = df.loc[mask_missing_values]

rows_with_missing_values.head()


Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
72,1.0,171.94,103.89,103.45,,3.25,112.79
99,1.0,171.93,104.07,104.18,,3.14,113.08
151,1.0,172.07,103.8,104.38,,3.02,112.93
197,1.0,171.45,103.66,103.8,,3.62,113.27
241,1.0,171.83,104.14,104.06,,3.02,112.36


In [None]:
rows_with_missing_values.nunique()

is_genuine       2
diagonal        32
height_left     30
height_right    30
margin_low       0
margin_up       29
length          33
dtype: int64

les valeurs manquantes appartiennent aux deux types des billets authentique et faux.

les valeurs manquantes sont de type continue

In [None]:

df.drop(['is_genuine'])

KeyError: "['is_genuine'] not found in axis"

In [None]:
#Choix de la methode regression lineaire pour l'imputation des valeurs manquantes

from sklearn.linear_model import LinearRegression

In [None]:
# Séparer les données en deux ensembles : avec et sans valeurs manquantes

test_data = df[df['margin_low'].isna()] #DataFrame qui contient que les valeurs manquantes

train_data = df[~df['margin_low'].isna()] #dataframe sans valeurs manquantes

In [None]:
test_data.shape

(37, 7)

In [None]:
train_data.shape

(1463, 7)

In [None]:
# Diviser les données en variables explicatives (X) et la variable cible (y)
#les variables explicative (X)
X_train = train_data.drop(columns=['margin_low'])
X_train

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_up,length
0,1.0,171.81,104.86,104.95,2.89,112.83
1,1.0,171.46,103.36,103.66,2.99,113.09
2,1.0,172.69,104.48,103.50,2.94,113.16
3,1.0,171.36,103.91,103.94,3.01,113.51
4,1.0,171.73,104.28,103.46,3.48,112.54
...,...,...,...,...,...,...
1495,0.0,171.75,104.38,104.17,3.09,111.28
1496,0.0,172.19,104.63,104.44,3.37,110.97
1497,0.0,171.80,104.01,104.12,3.36,111.95
1498,0.0,172.06,104.28,104.06,3.46,112.25


In [None]:
# Diviser les données en variables explicatives (X) et la variable cible (y)
#la variable cible (y) (target)
y_train= train_data['margin_low']
y_train.shape

(1463,)

In [None]:
## Model
# Créer et entraîner le modèle de régression linéaire
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
#creation de la variable X_test
X_test = test_data.drop(columns=['margin_low'])

In [None]:
# Prédire les valeurs manquantes
y_predicted = model.predict(X_test)

In [None]:
y_predicted.shape

(37,)

In [None]:
# Remplacer les valeurs manquantes par les valeurs prédites
df.loc[df['margin_low'].isna(), 'margin_low'] = y_predicted

In [None]:
y_predicted

array([4.06495361, 4.11199026, 4.13400328, 3.99357074, 4.1403993 ,
       4.09428392, 4.07412432, 4.12538999, 4.0807278 , 4.07363322,
       4.11897255, 4.18037978, 4.13648423, 4.05106842, 4.17837685,
       4.22555104, 4.11586845, 4.10284101, 4.08184346, 4.09276238,
       4.11250192, 4.15717623, 4.16028787, 4.12193808, 4.12353555,
       4.19842271, 4.10962313, 4.09696025, 4.13384101, 5.25968515,
       5.264817  , 5.28251853, 5.30206887, 5.20035843, 5.1754678 ,
       5.17345045, 5.24675055])

In [None]:
y_predicted.shape

(37,)

In [None]:
df.isnull().sum()

is_genuine      0
diagonal        0
height_left     0
height_right    0
margin_low      0
margin_up       0
length          0
dtype: int64

## 2.3.2 Detection des outliers

## 2.3.1.1 Data mining

## 2.4 Preparation des données