In [1]:
# Cette cellule sert à importer des libraires (ou packages) Python
# qui contiennent des fonctions dont nous avons besoin

import numpy as np, pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

# Importation du jeu de données


In [2]:

from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
rt_iot2022 = fetch_ucirepo(id=942) 
  
# data (as pandas dataframes) 
X = rt_iot2022.data.features 
y = rt_iot2022.data.targets 
  
# metadata 
print(rt_iot2022.metadata) 
  
# variable information 
print(rt_iot2022.variables) 


{'uci_id': 942, 'name': 'RT-IoT2022 ', 'repository_url': 'https://archive.ics.uci.edu/dataset/942/rt-iot2022', 'data_url': 'https://archive.ics.uci.edu/static/public/942/data.csv', 'abstract': 'The RT-IoT2022, a proprietary dataset derived from a real-time IoT infrastructure, is introduced as a comprehensive resource integrating a diverse range of IoT devices and sophisticated network attack methodologies. This dataset encompasses both normal and adversarial network behaviours, providing a general representation of real-world scenarios.\nIncorporating data from IoT devices such as ThingSpeak-LED, Wipro-Bulb, and MQTT-Temp, as well as simulated attack scenarios involving Brute-Force SSH attacks, DDoS attacks using Hping and Slowloris, and Nmap patterns, RT-IoT2022 offers a detailed perspective on the complex nature of network traffic. The bidirectional attributes of network traffic are meticulously captured using the Zeek network monitoring tool and the Flowmeter plugin. Researchers can

In [3]:
y.value_counts(dropna=True, ascending=True)

Attack_type               
NMAP_FIN_SCAN                    28
Metasploit_Brute_Force_SSH       37
Wipro_bulb                      253
DDOS_Slowloris                  534
NMAP_TCP_scan                  1002
NMAP_OS_DETECTION              2000
NMAP_XMAS_TREE_SCAN            2010
NMAP_UDP_SCAN                  2590
MQTT_Publish                   4146
ARP_poisioning                 7750
Thing_Speak                    8108
DOS_SYN_Hping                 94659
Name: count, dtype: int64

## Binarisation en True et False

In [4]:
normal =['MQTT_publish','Thing_Speak','Wipro_bulb','Amazon-Alexa']

is_normal = y.isin(normal)
is_normal

Unnamed: 0,Attack_type
0,False
1,False
2,False
3,False
4,False
...,...
123112,False
123113,False
123114,False
123115,False


In [5]:
is_normal_binary = is_normal.value_counts()
is_normal_binary

Attack_type
False          114756
True             8361
Name: count, dtype: int64

In [6]:
is_normal_true = is_normal[is_normal ].value_counts()
is_normal_true

Attack_type
True           8361
Name: count, dtype: int64

In [7]:
is_normal_false = is_normal[~is_normal ].value_counts()
is_normal_false

Attack_type
False          114756
Name: count, dtype: int64

## Nettoyage des features


In [8]:
X_numeric = X.select_dtypes(include=[np.number])
X_numeric

Unnamed: 0,id.orig_p,id.resp_p,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,bwd_pkts_per_sec,flow_pkts_per_sec,...,active.avg,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size
0,38667,1883,32.011598,9,5,3,3,0.281148,0.156193,0.437341,...,2.282415e+06,0.0,29729182.96,29729182.96,29729182.96,29729182.96,0.0,64240,26847,502
1,51143,1883,31.883584,9,5,3,3,0.282277,0.156821,0.439097,...,2.028307e+06,0.0,29855277.06,29855277.06,29855277.06,29855277.06,0.0,64240,26847,502
2,44761,1883,32.124053,9,5,3,3,0.280164,0.155647,0.435811,...,2.281904e+06,0.0,29842149.02,29842149.02,29842149.02,29842149.02,0.0,64240,26847,502
3,60893,1883,31.961063,9,5,3,3,0.281593,0.156440,0.438033,...,2.047288e+06,0.0,29913774.97,29913774.97,29913774.97,29913774.97,0.0,64240,26847,502
4,51087,1883,31.902362,9,5,3,3,0.282111,0.156728,0.438839,...,2.087657e+06,0.0,29814704.90,29814704.90,29814704.90,29814704.90,0.0,64240,26847,502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123112,59247,63331,0.000006,1,1,0,0,167772.160000,167772.160000,335544.320000,...,5.960464e+00,0.0,0.00,0.00,0.00,0.00,0.0,1024,0,1024
123113,59247,64623,0.000007,1,1,0,0,144631.172400,144631.172400,289262.344800,...,6.914139e+00,0.0,0.00,0.00,0.00,0.00,0.0,1024,0,1024
123114,59247,64680,0.000006,1,1,0,0,167772.160000,167772.160000,335544.320000,...,5.960464e+00,0.0,0.00,0.00,0.00,0.00,0.0,1024,0,1024
123115,59247,65000,0.000006,1,1,0,0,167772.160000,167772.160000,335544.320000,...,5.960464e+00,0.0,0.00,0.00,0.00,0.00,0.0,1024,0,1024


# Séparation du jeu de données


In [9]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X_numeric, is_normal,test_size=0.7, random_state = 0)

# Affichage des résultats
print("train_X:", train_X)
print("val_X:", val_X)
print("train_y:", train_y)
print("val_y:", val_y)

train_X:         id.orig_p  id.resp_p  flow_duration  fwd_pkts_tot  bwd_pkts_tot  \
37854       19906         21       0.000004             1             1   
36364       18416         21       0.000004             1             1   
106235      57305         21       0.000004             1             1   
59366       41490         21       0.000004             1             1   
8063        35626         80       0.871233             7             5   
...           ...        ...            ...           ...           ...   
45891       27943         21       0.000005             1             1   
117952      55904       2717       0.000003             1             1   
42613       24665         21       0.000001             1             1   
43567       25619         21       0.000001             1             1   
68268       50394         21       0.000002             1             1   

        fwd_data_pkts_tot  bwd_data_pkts_tot  fwd_pkts_per_sec  \
37854                   

## Decision Tree Classifier


In [10]:
#Définition du modèle
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error

model = DecisionTreeClassifier()

#Entrainement du modèle
model.fit(train_X, train_y)

#Prédiction
predictions = model.predict(val_X)

# Évaluation du modèle
score = model.score(val_X, val_y)

#Evaluation de la précision
accuracy = accuracy_score(val_y, predictions)

#Evaluation de la précision équilibrée
balanced_accuracy = balanced_accuracy_score(val_y, predictions)

#Evaluation de l'aire sous la courbe ROC
roc_auc = roc_auc_score(val_y, predictions)




print(predictions)
print("Le score du modèle est de : ", score)
print("La précision du modèle est de : ", accuracy)
print("La précision équilibrée du modèle est de : ", balanced_accuracy)
print("L'aire sous la courbe ROC du modèle est de : ", roc_auc)



[False False False ... False False False]
Le score du modèle est de :  0.9979926202687336
La précision du modèle est de :  0.9979926202687336
La précision équilibrée du modèle est de :  0.9919763429207364
L'aire sous la courbe ROC du modèle est de :  0.9919763429207366


# Random Forest Classifier

In [11]:
#Définition du modèle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score


model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

#Entrainement du modèle
model.fit(train_X, train_y)

#Prédiction
predictions = model.predict(val_X)

# Évaluation du modèle
score = model.score(val_X, val_y)

#Evaluation de la précision
accuracy = accuracy_score(val_y, predictions)

#Evaluation de la précision équilibrée
balanced_accuracy = balanced_accuracy_score(val_y, predictions)

#Evaluation de l'aire sous la courbe ROC
roc_auc = roc_auc_score(val_y, predictions)




print(predictions)
print("Le score du modèle est de : ", score)
print("La précision du modèle est de : ", accuracy)
print("La précision équilibrée du modèle est de : ", balanced_accuracy)
print("L'aire sous la courbe ROC du modèle est de : ", roc_auc)



  return fit_method(estimator, *args, **kwargs)


[False False False ... False False False]
Le score du modèle est de :  0.9902183750667193
La précision du modèle est de :  0.9902183750667193
La précision équilibrée du modèle est de :  0.9649918120533474
L'aire sous la courbe ROC du modèle est de :  0.9649918120533474


In [14]:
is_normal.shape


(123117, 1)

## SVM Linéaire


In [12]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score

model = LinearSVC(dual=False,max_iter=2000,)

#Entrainement du modèle
model.fit(train_X, train_y)

#Prédiction
predictions = model.predict(val_X)

# Évaluation du modèle
score = model.score(val_X, val_y)

#Evaluation de la précision
accuracy = accuracy_score(val_y, predictions)

#Evaluation de la précision équilibrée
balanced_accuracy = balanced_accuracy_score(val_y, predictions)

#Evaluation de l'aire sous la courbe ROC
roc_auc = roc_auc_score(val_y, predictions)




display(predictions)
print("Le score du modèle est de : ", score)
print("La précision du modèle est de : ", accuracy)
print("La précision équilibrée du modèle est de : ", balanced_accuracy)
print("L'aire sous la courbe ROC du modèle est de : ", roc_auc)

  y = column_or_1d(y, warn=True)


array([False, False, False, ..., False, False, False])

Le score du modèle est de :  0.9537142326703952
La précision du modèle est de :  0.9537142326703952
La précision équilibrée du modèle est de :  0.7336922092388494
L'aire sous la courbe ROC du modèle est de :  0.7336922092388494


- L'accuracy permet de voir si notre modèle est précis, c'est à dire fait peu d'erreurs.Cependant, cette fonction est a évité lorsque les classes sont déséquilibré comme actuellement.
- La "balanced accuracy" permet de contrer le problème précédent lorsque les classes sont déséquilibrés.
- L'auc permet de savoir si le modèle arrive bel et bien à reperer les différentes.elle doit au minimum être supérieur à 0.5,sinon cela voudrait dire que le modèle est moins précis qu'un tirage aléatoire.

# Avancé : classification multi-classe


In [18]:
from sklearn.model_selection import train_test_split

train_Xmc, val_Xmc, train_ymc, val_ymc = train_test_split(X_numeric, y,test_size=0.7, random_state = 0)

# Affichage des résultats
display("train_X:", train_Xmc)
display("val_X:", val_Xmc)
display("train_y:", train_ymc)
display("val_y:", val_ymc)

'train_X:'

Unnamed: 0,id.orig_p,id.resp_p,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,bwd_pkts_per_sec,flow_pkts_per_sec,...,active.avg,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size
37854,19906,21,0.000004,1,1,1,0,2.467238e+05,2.467238e+05,4.934475e+05,...,4.053116,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
36364,18416,21,0.000004,1,1,1,0,2.467238e+05,2.467238e+05,4.934475e+05,...,4.053116,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
106235,57305,21,0.000004,1,1,1,0,2.467238e+05,2.467238e+05,4.934475e+05,...,4.053116,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
59366,41490,21,0.000004,1,1,1,0,2.467238e+05,2.467238e+05,4.934475e+05,...,4.053116,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
8063,35626,80,0.871233,7,5,2,1,8.034590e+00,5.738993e+00,1.377358e+01,...,873072.147400,0.0,0.0,0.0,0.0,0.0,0.0,64240,26847,501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45891,27943,21,0.000005,1,1,1,0,1.997288e+05,1.997288e+05,3.994575e+05,...,5.006790,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
117952,55904,2717,0.000003,1,1,0,0,3.226388e+05,3.226388e+05,6.452775e+05,...,3.099442,0.0,0.0,0.0,0.0,0.0,0.0,64240,0,64240
42613,24665,21,0.000001,1,1,1,0,8.388608e+05,8.388608e+05,1.677722e+06,...,1.192093,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
43567,25619,21,0.000001,1,1,1,0,1.048576e+06,1.048576e+06,2.097152e+06,...,0.953674,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64


'val_X:'

Unnamed: 0,id.orig_p,id.resp_p,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,bwd_pkts_per_sec,flow_pkts_per_sec,...,active.avg,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size
60766,42890,21,0.000000,1,0,1,0,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
49383,31503,21,0.000002,1,1,1,0,5.242880e+05,5.242880e+05,1.048576e+06,...,1.907349,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
49424,31544,21,0.000000,1,0,1,0,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
55121,37244,21,0.000001,1,1,1,0,1.048576e+06,1.048576e+06,2.097152e+06,...,0.953674,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
75683,4043,21,0.000001,1,1,1,0,1.048576e+06,1.048576e+06,2.097152e+06,...,0.953674,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17401,47652,137,0.000000,1,0,1,0,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
53932,36055,21,0.000000,1,0,1,0,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
107933,1958,21,0.000002,1,1,1,0,4.660338e+05,4.660338e+05,9.320676e+05,...,2.145767,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
24972,7023,21,0.000004,1,1,1,0,2.467238e+05,2.467238e+05,4.934475e+05,...,4.053116,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64


'train_y:'

Unnamed: 0,Attack_type
37854,DOS_SYN_Hping
36364,DOS_SYN_Hping
106235,DOS_SYN_Hping
59366,DOS_SYN_Hping
8063,Thing_Speak
...,...
45891,DOS_SYN_Hping
117952,NMAP_TCP_scan
42613,DOS_SYN_Hping
43567,DOS_SYN_Hping


'val_y:'

Unnamed: 0,Attack_type
60766,DOS_SYN_Hping
49383,DOS_SYN_Hping
49424,DOS_SYN_Hping
55121,DOS_SYN_Hping
75683,DOS_SYN_Hping
...,...
17401,ARP_poisioning
53932,DOS_SYN_Hping
107933,DOS_SYN_Hping
24972,DOS_SYN_Hping


## Decision Tree Classifier (MultiClasse)


In [29]:
#Définition du modèle
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error

model = DecisionTreeClassifier()

#Entrainement du modèle
model.fit(train_Xmc, train_ymc)

#Prédiction
predictions = model.predict(val_Xmc)

# Évaluation du modèle
score = model.score(val_Xmc, val_ymc)

#Evaluation de la précision
accuracy = accuracy_score(val_ymc, predictions)

#Evaluation de la précision équilibrée
balanced_accuracy = balanced_accuracy_score(val_ymc, predictions)

#Evaluation de l'aire sous la courbe ROC
#roc_auc = roc_auc_score(val_ymc, predictions)

#Evaluation de la classification
classification = classification_report(val_ymc, predictions)



print(predictions)
print("Le score du modèle est de : ", score)
print("La précision du modèle est de : ", accuracy)
print("La précision équilibrée du modèle est de : ", balanced_accuracy)
#print("L'aire sous la courbe ROC du modèle est de : ", roc_auc)
print("La classification du modèle est de : ", classification)



['DOS_SYN_Hping' 'DOS_SYN_Hping' 'DOS_SYN_Hping' ... 'DOS_SYN_Hping'
 'DOS_SYN_Hping' 'DOS_SYN_Hping']
Le score du modèle est de :  0.996635028196143
La précision du modèle est de :  0.996635028196143
La précision équilibrée du modèle est de :  0.9513169772330968
La classification du modèle est de :                              precision    recall  f1-score   support

            ARP_poisioning       0.98      0.98      0.98      5401
            DDOS_Slowloris       0.97      0.98      0.98       352
             DOS_SYN_Hping       1.00      1.00      1.00     66264
              MQTT_Publish       1.00      1.00      1.00      2944
Metasploit_Brute_Force_SSH       0.58      0.84      0.69        25
             NMAP_FIN_SCAN       1.00      0.81      0.89        21
         NMAP_OS_DETECTION       1.00      1.00      1.00      1393
             NMAP_TCP_scan       0.99      1.00      1.00       684
             NMAP_UDP_SCAN       1.00      0.98      0.99      1820
       NMAP_XMAS_

## Random Tree Classifier (Multiclasse)

In [30]:
#Définition du modèle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report


model = RandomForestClassifier(n_estimators=500, max_depth=7, random_state=0,max_features=82)

#Entrainement du modèle
model.fit(train_Xmc, train_ymc)

#Prédiction
predictions = model.predict(val_Xmc)

# Évaluation du modèle
score = model.score(val_Xmc, val_ymc)

#Evaluation de la précision
accuracy = accuracy_score(val_ymc, predictions)

#Evaluation de la précision équilibrée
balanced_accuracy = balanced_accuracy_score(val_ymc, predictions)

#Evaluation de l'aire sous la courbe ROC
#roc_auc = roc_auc_score(val_ymc, predictions)

#Evaluation de la classification
classification = classification_report(val_ymc, predictions)




print(predictions)
print("Le score du modèle est de : ", score)
print("La précision du modèle est de : ", accuracy)
print("La précision équilibrée du modèle est de : ", balanced_accuracy)
#print("L'aire sous la courbe ROC du modèle est de : ", roc_auc)
print("La classification du modèle est de : ", classification)



  return fit_method(estimator, *args, **kwargs)


['DOS_SYN_Hping' 'DOS_SYN_Hping' 'DOS_SYN_Hping' ... 'DOS_SYN_Hping'
 'DOS_SYN_Hping' 'DOS_SYN_Hping']
Le score du modèle est de :  0.9942795479334432
La précision du modèle est de :  0.9942795479334432
La précision équilibrée du modèle est de :  0.9141645423163073
La classification du modèle est de :                              precision    recall  f1-score   support

            ARP_poisioning       0.97      0.96      0.96      5401
            DDOS_Slowloris       0.69      0.86      0.77       352
             DOS_SYN_Hping       1.00      1.00      1.00     66264
              MQTT_Publish       1.00      1.00      1.00      2944
Metasploit_Brute_Force_SSH       1.00      0.64      0.78        25
             NMAP_FIN_SCAN       0.89      0.81      0.85        21
         NMAP_OS_DETECTION       1.00      1.00      1.00      1393
             NMAP_TCP_scan       1.00      0.99      0.99       684
             NMAP_UDP_SCAN       0.98      0.97      0.97      1820
       NMAP_XMA

## SVM Linéaire (Multiclasse)

In [26]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score

model = LinearSVC(dual=False,max_iter=2000,)

#Entrainement du modèle
model.fit(train_Xmc, train_ymc)

#Prédiction
predictions = model.predict(val_Xmc)

# Évaluation du modèle
score = model.score(val_Xmc, val_ymc)

#Evaluation de la précision
accuracy = accuracy_score(val_ymc, predictions)

#Evaluation de la précision équilibrée
balanced_accuracy = balanced_accuracy_score(val_ymc, predictions)

#Evaluation de l'aire sous la courbe ROC
#roc_auc = roc_auc_score(val_ymc, predictions)




display(predictions)
print("Le score du modèle est de : ", score)
print("La précision du modèle est de : ", accuracy)
print("La précision équilibrée du modèle est de : ", balanced_accuracy)
#print("L'aire sous la courbe ROC du modèle est de : ", roc_auc)

  y = column_or_1d(y, warn=True)


array(['Thing_Speak', 'DOS_SYN_Hping', 'Thing_Speak', ...,
       'DOS_SYN_Hping', 'DOS_SYN_Hping', 'DOS_SYN_Hping'], dtype=object)

Le score du modèle est de :  0.8285720916200598
La précision du modèle est de :  0.8285720916200598
La précision équilibrée du modèle est de :  0.5519953203369098


- Precision: Percentage of correct positive predictions relative to total positive predictions.
- Recall: Percentage of correct positive predictions relative to total actual positives.
-  F1 Score: A weighted harmonic mean of precision and recall. The closer to 1, the better the model.
- Support : Nombre appartenant à la classe

https://www.statology.org/sklearn-classification-report/

# Avancé : Mesures de performances

In [1]:
def calculate_accuracy(labels, predictions):
    """
    Calcule l'accuracy à partir des labels et des prédictions.

    :param labels: Liste des labels réels (0 ou 1)
    :param predictions: Liste des prédictions (0 ou 1)
    :return: Accuracy (précision)
    """
    # Vérifier que les listes ont la même longueur
    if len(labels) != len(predictions):
        raise ValueError("Les listes de labels et de prédictions doivent avoir la même longueur.")

    # Calculer le nombre de prédictions correctes
    correct_predictions = sum(1 for label, prediction in zip(labels, predictions) if label == prediction)

    # Calculer l'accuracy
    accuracy = correct_predictions / len(labels)
    return accuracy

# Exemple d'utilisation
labels = [0, 1, 0, 1, 1, 0, 1, 0]
predictions = [0, 1, 0, 0, 1, 0, 1, 1]

accuracy = calculate_accuracy(labels, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.75


In [2]:
def calculate_confusion_matrix_elements(labels, predictions):
    """
    Calcule le nombre de vrais positifs, faux positifs, vrais négatifs et faux négatifs.

    :param labels: Liste des labels réels (0 ou 1)
    :param predictions: Liste des prédictions (0 ou 1)
    :return: Tuple (TP, FP, TN, FN)
    """
    # Vérifier que les listes ont la même longueur
    if len(labels) != len(predictions):
        raise ValueError("Les listes de labels et de prédictions doivent avoir la même longueur.")

    # Initialiser les compteurs
    TP = FP = TN = FN = 0

    # Parcourir les listes et mettre à jour les compteurs
    for label, prediction in zip(labels, predictions):
        if label == 1 and prediction == 1:
            TP += 1
        elif label == 0 and prediction == 1:
            FP += 1
        elif label == 0 and prediction == 0:
            TN += 1
        elif label == 1 and prediction == 0:
            FN += 1

    return TP, FP, TN, FN

# Exemple d'utilisation
labels = [0, 1, 0, 1, 1, 0, 1, 0]
predictions = [0, 1, 0, 0, 1, 0, 1, 1]

TP, FP, TN, FN = calculate_confusion_matrix_elements(labels, predictions)
print("Vrais positifs (TP):", TP)
print("Faux positifs (FP):", FP)
print("Vrais négatifs (TN):", TN)
print("Faux négatifs (FN):", FN)

Vrais positifs (TP): 3
Faux positifs (FP): 1
Vrais négatifs (TN): 3
Faux négatifs (FN): 1


In [3]:
def calculate_sensitivity_specificity(labels, predictions):
    """
    Calcule la sensitivité et la spécificité à partir des labels et des prédictions.

    :param labels: Liste des labels réels (0 ou 1)
    :param predictions: Liste des prédictions (0 ou 1)
    :return: Tuple (sensitivité, spécificité)
    """
    TP, FP, TN, FN = calculate_confusion_matrix_elements(labels, predictions)

    # Calculer la sensitivité
    sensitivity = TP / (TP + FN) if (TP + FN) != 0 else 0

    # Calculer la spécificité
    specificity = TN / (TN + FP) if (TN + FP) != 0 else 0

    return sensitivity, specificity

# Exemple d'utilisation
labels = [0, 1, 0, 1, 1, 0, 1, 0]
predictions = [0, 1, 0, 0, 1, 0, 1, 1]

sensitivity, specificity = calculate_sensitivity_specificity(labels, predictions)
print("Sensitivité:", sensitivity)
print("Spécificité:", specificity)

Sensitivité: 0.75
Spécificité: 0.75


In [4]:
def calculate_balanced_accuracy(labels, predictions):
    """
    Calcule la balanced accuracy à partir des labels et des prédictions.

    :param labels: Liste des labels réels (0 ou 1)
    :param predictions: Liste des prédictions (0 ou 1)
    :return: Balanced accuracy
    """
    sensitivity, specificity = calculate_sensitivity_specificity(labels, predictions)
    balanced_accuracy = (sensitivity + specificity) / 2
    return balanced_accuracy

# Exemple d'utilisation
labels = [0, 1, 0, 1, 1, 0, 1, 0]
predictions = [0, 1, 0, 0, 1, 0, 1, 1]

balanced_accuracy = calculate_balanced_accuracy(labels, predictions)
print("Balanced Accuracy:", balanced_accuracy)

Balanced Accuracy: 0.75


In [6]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score

def calculate_balanced_accuracy(labels, predictions):
    """
    Calcule la balanced accuracy à partir des labels et des prédictions.

    :param labels: Liste des labels réels (0 ou 1)
    :param predictions: Liste des prédictions (0 ou 1)
    :return: Balanced accuracy
    """
    sensitivity, specificity = calculate_sensitivity_specificity(labels, predictions)
    balanced_accuracy = (sensitivity + specificity) / 2
    return balanced_accuracy

# Exemple d'utilisation
labels = [0, 1, 0, 1, 1, 0, 1, 0]
predictions = [0, 1, 0, 0, 1, 0, 1, 1]

# Calcul manuel de l'accuracy
manual_accuracy = sum(1 for label, prediction in zip(labels, predictions) if label == prediction) / len(labels)
print("Manual Accuracy:", manual_accuracy)

# Calcul manuel de la balanced accuracy
manual_balanced_accuracy = calculate_balanced_accuracy(labels, predictions)
print("Manual Balanced Accuracy:", manual_balanced_accuracy)

# Utilisation de sklearn pour valider les résultats
sklearn_accuracy = accuracy_score(labels, predictions)
print("Sklearn Accuracy:", sklearn_accuracy)

sklearn_balanced_accuracy = balanced_accuracy_score(labels, predictions)
print("Sklearn Balanced Accuracy:", sklearn_balanced_accuracy)

# Comparaison des résultats
assert manual_accuracy == sklearn_accuracy, "Les valeurs de l'accuracy ne correspondent pas."
assert manual_balanced_accuracy == sklearn_balanced_accuracy, "Les valeurs de la balanced accuracy ne correspondent pas."

print("Les valeurs calculées manuellement et avec sklearn correspondent.")

Manual Accuracy: 0.75
Manual Balanced Accuracy: 0.75
Sklearn Accuracy: 0.75
Sklearn Balanced Accuracy: 0.75
Les valeurs calculées manuellement et avec sklearn correspondent.


- L'accuracy est le ratio du nombre de prédictions correctes (vrais positifs + vrais négatifs) sur le nombre total de prédictions.
\frac{TP + TN}{TP + TN + FP + FN} 
- La balanced accuracy est la moyenne de la sensitivité (rappel) et de la spécificité. Elle est conçue pour prendre en compte les déséquilibres de classe.