### Training QSVM model on NSL-KDD dataset with Qiskit

In [2]:
# Importing the necessary modules
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split


Load NSL-KDD data

In [3]:
# Chemins vers les fichiers de dataset NSL-KDD
train_path = 'data/nsl-kdd/KDDTrain+.txt'
test_path = 'data/nsl-kdd/KDDTest+.txt'

# Colonne cibles et caractéristiques
columns = (
    ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
     'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
     'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
     'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
     'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
     'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
     'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
     'label']
)

# Charger les données
train_data = pd.read_csv(train_path, names=columns, sep=',', header=None, index_col=False)
test_data = pd.read_csv(test_path, names=columns , sep=',', header=None, index_col=False)

# Autoriser l'affichage de toutes les colonnes et lignes
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Afficher les 5 premières lignes du jeu de données
train_data.head()

  train_data = pd.read_csv(train_path, names=columns, sep=',', header=None, index_col=False)
  test_data = pd.read_csv(test_path, names=columns , sep=',', header=None, index_col=False)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


Preprocess NSL-KDD data

In [4]:
# Affiche la distribution des labels
# print(train_data['label'].value_counts())
# print(test_data['label'].value_counts())

# Prétraitement des données
def preprocess_data(data):
    # Convertir les caractéristiques catégorielles en numériques
    le = LabelEncoder()
    for column in ['protocol_type', 'service', 'flag']:
        data[column] = le.fit_transform(data[column])

    # Modifier les int64 en float64
    for column in data.select_dtypes(include=['int64']).columns:
        data[column] = data[column].astype('float64')
    
    # Afficher les types de données
    # print(data.dtypes)

    # Normaliser les caractéristiques numériques
    # scaler = MinMaxScaler()
    # num_cols = data.select_dtypes(include=['float64', 'int64']).columns
    # print(num_cols)
    # data[num_cols] = scaler.fit_transform(data[num_cols])

    # Séparer les caractéristiques et les labels
    X = data.drop('label', axis=1)
    # Label normal alors 0, autre (attaque) alors 1
    y = data['label'].apply(lambda x: 0 if x == 'normal' else 1)
    y = y.astype('float64') # transformer y en float64
    print(pd.Series(y).value_counts()) # montrer la distribution des labels

    return X, y

X_train, y_train = preprocess_data(train_data)
X_test, y_test = preprocess_data(test_data)

# À partir d'ici, vous pouvez utiliser X_train, y_train, X_test, et y_test avec Qiskit pour entraîner votre modèle QML.


label
0.0    67343
1.0    58630
Name: count, dtype: int64
label
1.0    12833
0.0     9711
Name: count, dtype: int64


In [5]:
# Training the model
## Logistic Regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Test the model
accuracy = model.score(X_test, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Recall
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall * 100:.2f}%')

# Precision
from sklearn.metrics import precision_score
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision * 100:.2f}%')

# F1 Score
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1 * 100:.2f}%')

Accuracy: 68.99%
Recall: 52.54%
Precision: 88.22%
F1 Score: 65.86%


In [6]:
## Linear Regression
# from sklearn.linear_model import LinearRegression
# model = LinearRegression()
# model.fit(X_train, y_train)

# # Test the model
# accuracy = model.score(X_test, y_test)
# print(f'Accuracy: {accuracy * 100:.2f}%')

# # Recall
# y_pred = model.predict(X_test)
# y_pred = (y_pred > 0.5)
# recall = recall_score(y_test, y_pred)
# print(f'Recall: {recall * 100:.2f}%')

# # Precision
# precision = precision_score(y_test, y_pred)
# print(f'Precision: {precision * 100:.2f}%')

# # F1 Score
# f1 = f1_score(y_test, y_pred)
# print(f'F1 Score: {f1 * 100:.2f}%')

Accuracy: 19.40%
Recall: 60.77%
Precision: 95.20%
F1 Score: 74.18%


In [7]:
## Random Forest
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Test the model
accuracy = model.score(X_test, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Recall
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall * 100:.2f}%')

# Precision
from sklearn.metrics import precision_score
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision * 100:.2f}%')

# F1 Score
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1 * 100:.2f}%')

Accuracy: 77.76%
Recall: 62.99%
Precision: 96.83%
F1 Score: 76.33%


In [30]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=4)
model.fit(X_train, y_train)

# Test the model
accuracy = model.score(X_test, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Recall
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall * 100:.2f}%')

# Precision
from sklearn.metrics import precision_score
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision * 100:.2f}%')

# F1 Score
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1 * 100:.2f}%')

# Enregistrement du modèle
import joblib
joblib.dump(model, 'data/model/dt_model.pkl') # enregistrement du modèle

Accuracy: 82.45%
Recall: 71.67%
Precision: 96.63%
F1 Score: 82.30%


['data/model/dt_model.pkl']

In [9]:
# ## Support Vector Machine
# from sklearn.svm import SVC
# model = SVC()
# model.fit(X_train, y_train)

# # Test the model
# accuracy = model.score(X_test, y_test)
# print(f'Accuracy: {accuracy * 100:.2f}%')

# # Recall
# from sklearn.metrics import recall_score
# y_pred = model.predict(X_test)
# recall = recall_score(y_test, y_pred)
# print(f'Recall: {recall * 100:.2f}%')

# # Precision
# from sklearn.metrics import precision_score
# precision = precision_score(y_test, y_pred)
# print(f'Precision: {precision * 100:.2f}%')

# # F1 Score
# from sklearn.metrics import f1_score
# f1 = f1_score(y_test, y_pred)
# print(f'F1 Score: {f1 * 100:.2f}%')

In [10]:
## K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)

# Test the model
accuracy = model.score(X_test, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Recall
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall * 100:.2f}%')

# Precision
from sklearn.metrics import precision_score
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision * 100:.2f}%')

# F1 Score
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1 * 100:.2f}%')

Accuracy: 77.20%
Recall: 62.28%
Precision: 96.39%
F1 Score: 75.67%


In [11]:
## Naive Bayes
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, y_train)

# Test the model
accuracy = model.score(X_test, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Recall
from sklearn.metrics import recall_score
y_pred = model.predict(X_test)
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall * 100:.2f}%')

# Precision
from sklearn.metrics import precision_score
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision * 100:.2f}%')

# F1 Score
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1 * 100:.2f}%')

Accuracy: 45.03%
Recall: 3.69%
Precision: 93.66%
F1 Score: 7.09%


In [34]:
# Load the model
model = joblib.load('data/model/dt_model.pkl') # chargement du modèle

# Utilisation de shap pour expliquer les prédictions
import shap
explainer = shap.Explainer(model, X_train, feature_names=X_train.columns)
shap_values = explainer(X_test)

# Afficher les valeurs shap summary_plot
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)

ExplainerError: Additivity check failed in TreeExplainer! Please ensure the data matrix you passed to the explainer is the same shape that the model was trained on. If your data shape is correct then please report this on GitHub. This check failed because for one of the samples the sum of the SHAP values was 0.926637, while the model output was 0.101911. If this difference is acceptable you can set check_additivity=False to disable this check.

Creation of the quantum circuit

In [12]:
# Création du circuit quantique avec ZZFeatureMap
feature_map = ZZFeatureMap(feature_dimension=feature_dim, reps=2)


NameError: name 'ZZFeatureMap' is not defined

Configuring the Quantum Instance

In [None]:
backend = Aer.get_backend('statevector_simulator')
quantum_instance = QuantumInstance(backend, shots=1024, seed_simulator=10598, seed_transpiler=10598)


Creating the QSVM model

In [None]:
qsvm = QSVM(feature_map, training_input, test_input, total_array, multiclass_extension=AllPairs())


Model training

In [None]:
result = qsvm.run(quantum_instance)


Model evaluation

In [None]:
print(f'Testing success ratio: {result["testing_accuracy"]}')
