# Models

In [26]:
import ipaddress
import os
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
# NOTE(review): LabelEncoder's canonical module is sklearn.preprocessing; this path works only via re-export.
from sklearn.calibration import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

### Chargement des données

On charge les données et on encode les variables catégorielles.

In [27]:
# One CSV per file under ../data/final; the order below fixes each dataset's
# position in listeDatasets (and therefore which frame df0..df8 refers to).
csv_paths = [
    "../data/final/Credential_Access.csv",
    "../data/final/Defense_Evasion.csv",
    "../data/final/Discovery.csv",
    "../data/final/Exfiltration.csv",
    "../data/final/Initial_Access.csv",
    "../data/final/none.csv",
    "../data/final/Persistence.csv",
    "../data/final/Privilege_Escalation.csv",
    "../data/final/Reconnaissance.csv",
]
listeDatasets = [pd.read_csv(csv_path) for csv_path in csv_paths]

# Keep the individual names available; they alias the frames stored in the list.
df0, df1, df2, df3, df4, df5, df6, df7, df8 = listeDatasets

# Columns that get label-encoded; 'label_tactic' is the prediction target.
categorical_columns = ['conn_state', 'protocol', 'service', 'history', 'label_tactic']

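# 2. IP address processing — TODO: not implemented; no per-octet columns (src_ip1…, dest_ip1…) are derived from src_ip/dest_ip in this cell.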





# Date handling: expand the epoch-seconds 'ts' column of every dataset into
# calendar parts, cyclic encodings and the gap since the previous record.
for idx, frame in enumerate(listeDatasets):
    # 'ts' holds seconds since the Unix epoch; the parsed column is written
    # back onto the loaded frame itself.
    frame['ts'] = pd.to_datetime(frame['ts'], unit='s')
    stamp = frame['ts'].dt

    frame['year'] = stamp.year
    frame['month'] = stamp.month
    frame['day'] = stamp.day
    frame['hour'] = stamp.hour
    frame['minute'] = stamp.minute
    frame['seconde'] = stamp.second
    frame['dayofweek'] = stamp.dayofweek  # 0 = Monday, 6 = Sunday

    # Cyclic encoding of the hour so that 23h and 0h end up close together.
    hour_angle = 2 * np.pi * frame['hour']/24
    frame['hour_sin'] = np.sin(hour_angle)
    frame['hour_cos'] = np.cos(hour_angle)

    # Same cyclic encoding for the day of the week.
    dow_angle = 2 * np.pi * frame['dayofweek']/7
    frame['dow_sin'] = np.sin(dow_angle)
    frame['dow_cos'] = np.cos(dow_angle)

    # Chronological order is required for the inter-record time difference.
    ordered = frame.sort_values('ts')

    # The earliest row has no predecessor (NaN after diff), so it gets 0.
    ordered['time_since_last'] = ordered['ts'].diff().dt.total_seconds().fillna(0)

    # The raw timestamp is no longer needed once its features are extracted.
    listeDatasets[idx] = ordered.drop(columns=['ts'])


# Build one label encoder per categorical column and keep it for later decoding.
encoders = {}

for column in categorical_columns:
    # Fit on the values pooled from ALL datasets so every dataset shares the
    # same value -> integer mapping.
    pooled_values = pd.concat([dataset[column] for dataset in listeDatasets]).unique()
    encoder = LabelEncoder().fit(pooled_values)
    encoders[column] = encoder

    # Overwrite the column in each dataset with its integer codes.
    for dataset in listeDatasets:
        dataset[column] = encoder.transform(dataset[column])


Affichage de la structure d'un dataset encodé

In [None]:
# First five rows of the dataset at index 7 (loaded from Privilege_Escalation.csv).
listeDatasets[7].head(n=5)

Unnamed: 0,conn_state,duration,local_orig,local_resp,protocol,service,history,src_ip,src_port,orig_bytes,...,day,hour,minute,seconde,dayofweek,hour_sin,hour_cos,dow_sin,dow_cos,time_since_last
291,2,0.227828,0,0,1,7,27,143.88.1.18,46377,9421.0,...,28,4,35,51,2,0.866025,0.5,0.974928,-0.222521,0.0
318,4,0.001557,0,0,1,19,10,143.88.11.10,35427,0.0,...,28,5,12,33,2,0.965926,0.258819,0.974928,-0.222521,2202.265337
279,4,0.00064,0,0,1,19,10,143.88.5.14,42603,0.0,...,28,6,17,32,2,1.0,6.123234000000001e-17,0.974928,-0.222521,3899.31078
268,2,0.203343,0,0,1,13,27,143.88.12.12,40721,7577.0,...,28,6,41,47,2,1.0,6.123234000000001e-17,0.974928,-0.222521,1454.400482
253,4,0.001853,0,0,1,19,10,143.88.11.10,43815,0.0,...,28,9,24,58,2,0.707107,-0.7071068,0.974928,-0.222521,9791.222171


### Séparation des données

In [28]:
# Split every dataset 80/20 on its own, so each source file contributes to
# both the training and the test set; the fixed seed keeps the split repeatable.
split_pairs = [
    train_test_split(dataset, test_size=0.2, random_state=42)
    for dataset in listeDatasets
]
train_dfs = [train_part for train_part, _ in split_pairs]
test_dfs = [test_part for _, test_part in split_pairs]

# Stack the per-dataset parts into one training frame and one test frame.
combined_train = pd.concat(train_dfs)
combined_test = pd.concat(test_dfs)

print(combined_train.shape)
print(combined_test.shape)

(1712479, 31)
(428126, 31)


In [None]:
# 6. Define the model features
# Raw numeric columns used as-is.
numerical_features = [
    'duration', 'local_orig', 'local_resp', 'src_port', 'orig_bytes',
    'orig_pkts', 'orig_ip_bytes', 'dest_port', 'resp_bytes', 'resp_pkts',
    'resp_ip_bytes', 'missed_bytes'
]

# Optional features, added only when the column exists.
# The calendar column is named 'dayofweek' (the previous 'day_of_week' never
# matched any column, so the feature was silently dropped).
time_features = ['hour', 'dayofweek', 'day', 'month']
ip_features = ['src_ip1', 'src_ip2', 'src_ip3', 'src_ip4',
               'dest_ip1', 'dest_ip2', 'dest_ip3', 'dest_ip4']

skipped_features = []
for feat in time_features + ip_features:
    if feat in combined_train.columns:
        numerical_features.append(feat)
    else:
        skipped_features.append(feat)

# Report what was skipped so a name mismatch cannot go unnoticed again.
if skipped_features:
    print(f"Features absentes ignorées: {skipped_features}")

# The categorical columns are label-encoded in place, so they keep their
# original names; no '<col>_encoded' columns exist in the frames.
target_column = 'label_tactic'
encoded_categorical = [col for col in categorical_columns if col != target_column]

# Full feature list: numeric columns first, then the encoded categoricals.
features = numerical_features + encoded_categorical

# 7. Build X and y
X_train = combined_train[features]
y_train = combined_train[target_column]
X_test = combined_test[features]
y_test = combined_test[target_column]

# 8. Build the LightGBM datasets
# Positions of the label-encoded columns inside `features`, so LightGBM treats
# them as categories rather than ordered integers.
categorical_indices = [features.index(col) for col in encoded_categorical]
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_indices)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# 9. LightGBM parameters
# One output class per distinct target label known to the encoder.
num_classes = len(encoders['label_tactic'].classes_)
params = {
    'objective': 'multiclass',
    'num_class': num_classes,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'verbose': -1
}

# 10. Train the model
# Early stopping is passed as a callback: the `early_stopping_rounds` keyword
# of lgb.train() was removed in LightGBM 4.0, while the callback form works on
# both older and newer releases.
# NOTE(review): early stopping is driven by the test set, so the metrics later
# reported on that same set are optimistic — consider a separate validation split.
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# 11. Evaluate the model
# predict() yields one probability per class; the predicted class is the argmax.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
y_pred_class = np.argmax(y_pred, axis=1)

# Map the integer codes back to the original tactic names for the report.
label_encoder = encoders['label_tactic']
y_pred_labels = label_encoder.inverse_transform(y_pred_class)
y_test_labels = label_encoder.inverse_transform(y_test)

# Show the results
# Passing every encoded class id as `labels` keeps the matrix square and its
# row/column order aligned with the encoder, even when a class is absent from
# both the test labels and the predictions.
all_class_ids = np.arange(len(label_encoder.classes_))
print("\nMatrice de confusion:")
print(confusion_matrix(y_test, y_pred_class, labels=all_class_ids))

print("\nRapport de classification:")
print(classification_report(y_test_labels, y_pred_labels))

# Save the model together with what is needed to reuse it: the fitted
# encoders (to encode inputs / decode predictions) and the feature order.
model_package = {
    'model': gbm,
    'encoders': encoders,
    'features': features
}

# NOTE: only unpickle this file from a trusted location — loading a pickle
# can execute arbitrary code.
with open('lightgbm_model_package.pkl', 'wb') as f:
    pickle.dump(model_package, f)

KeyboardInterrupt: 