In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import os

In [2]:
df = pd.read_csv('incidents_reseau.csv')

df

Unnamed: 0,ID d'incident,Timestamp,Type d'incident,Description,Severité,Statut,Responsable
0,1,2023-09-24,Intrusion,échec mise jour logiciel,Élevée,Ouvert,Equipe Réseau
1,2,2023-07-12,Panne matérielle,mise jour firmware a échoué,Élevée,En cours,Equipe Sécurité
2,3,2023-05-26,Intrusion,carte réseau ne répond plus,Faible,En cours,Equipe Réseau
3,4,2023-04-16,Intrusion,échec mise jour logiciel,Moyenne,Ouvert,Equipe Support
4,5,2023-10-22,Intrusion,échec mise jour logiciel,Faible,Ouvert,Equipe Réseau
...,...,...,...,...,...,...,...
2995,2996,2023-10-09,Mise à jour échouée,connexion au vpn a échoué,Faible,Ouvert,Equipe Support
2996,2997,2023-11-14,Mise à jour échouée,malware a été trouvé sur serveur,Faible,Résolu,Equipe Support
2997,2998,2023-12-13,Problème de connexion,capacité bande passante dépassée,Critique,Résolu,Equipe Sécurité
2998,2999,2023-03-20,Mise à jour échouée,échec mise jour logiciel,Critique,Résolu,Equipe Support


## Data Preprocessing

In [3]:
# making time data into time format

df['Timestamp'] = pd.to_datetime(df['Timestamp'])

df['Timestamp']

0      2023-09-24
1      2023-07-12
2      2023-05-26
3      2023-04-16
4      2023-10-22
          ...    
2995   2023-10-09
2996   2023-11-14
2997   2023-12-13
2998   2023-03-20
2999   2023-05-03
Name: Timestamp, Length: 3000, dtype: datetime64[ns]

In [4]:
# normalizing text data


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Define the stop words
stop_words = set(stopwords.words('french'))

# Initialize the stemmer and lemmatizer
# stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()



def preprocess(text): 
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize text
    filtered_tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    # stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]  # Stem tokens
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]  # Lemmatize tokens
    return ' '.join(lemmatized_tokens)  # Join tokens back into a single string

   
    
df['Description'] = df['Description'].apply(preprocess)

df['Description']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melbey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\melbey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\melbey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0               échec mise jour logiciel
1            mise jour firmware a échoué
2               carte réseau répond plus
3               échec mise jour logiciel
4               échec mise jour logiciel
                      ...               
2995              connexion vpn a échoué
2996            malware a trouvé serveur
2997    capacité bande passante dépassée
2998            échec mise jour logiciel
2999        impossible daccéder internet
Name: Description, Length: 3000, dtype: object

# TF-IDF 

In [5]:
tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(df['Description'])

# Convert the TF-IDF matrix to a DataFrame 
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

tfidf_df


Unnamed: 0,accès,autorisée,bande,base,capacité,carte,cessé,charge,connexion,daccéder,...,surchargé,suspect,sécurité,tentative,tombé,trouvé,vpn,web,échec,échoué
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.580949,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.501414
2,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.580949,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.580949,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.477067,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.684117,0.0,0.000000,0.551716
2996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.631362,0.000000,0.0,0.000000,0.000000
2997,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000
2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.580949,0.000000


In [6]:
s = set(df['Type d\'incident'])
s

{'Intrusion',
 'Mise à jour échouée',
 'Panne matérielle',
 'Problème de connexion',
 'Surcharge'}

# Train Test split

In [7]:
X = tfidf_matrix

y = df['Type d\'incident']

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Models

In [10]:


# Define the models and their respective parameter grids for Grid Search
models = {
    "Support Vector Classifier (SVC)": (SVC(), {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}),
    "AdaBoost": (AdaBoostClassifier(), {'n_estimators': [50, 100, 200]}),
    "Multinomial Naive Bayes": (MultinomialNB(), {'alpha': [0.1, 0.5, 1.0, 2.0]}),
    "k-Nearest Neighbors": (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']}),
    "Random Forest": (RandomForestClassifier(), {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]}),
    "Decision Tree": (DecisionTreeClassifier(), {'max_depth': [None, 10, 20, 30], 'criterion': ['gini', 'entropy']}),
    "Logistic Regression": (LogisticRegression(max_iter=1000), {'C': [0.1, 1, 10, 100]}),
    "MLP Classifier (Neural Network)": (MLPClassifier(max_iter=1000), {'hidden_layer_sizes': [(50,), (100,), (50, 50)], 'activation': ['relu', 'tanh'], 'solver': ['adam', 'sgd']}),
    "Linear SVC": (LinearSVC(max_iter=1000), {'C': [0.1, 1, 10, 100, 1000]})
}

# To store the results
results = []

# Train and evaluate each model
for name, (model, param_grid) in models.items():
    print(f"Training {name} with Grid Search...")
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    results.append({
        'Model': name,
        'Best Parameters': grid_search.best_params_,
        'Accuracy': accuracy,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score']
    })
    print(f"Best Parameters for {name}: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy}\n")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}\n")
    print("-" * 80)

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
print(results_df)



Training Support Vector Classifier (SVC) with Grid Search...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Parameters for Support Vector Classifier (SVC): {'C': 0.1, 'kernel': 'rbf'}
Accuracy: 0.17656765676567657

Classification Report:
                       precision    recall  f1-score   support

            Intrusion       0.24      0.09      0.13       126
  Mise à jour échouée       0.00      0.00      0.00       129
     Panne matérielle       0.17      0.75      0.27       106
Problème de connexion       0.00      0.00      0.00       133
            Surcharge       0.20      0.14      0.17       112

             accuracy                           0.18       606
            macro avg       0.12      0.20      0.11       606
         weighted avg       0.12      0.18      0.11       606


--------------------------------------------------------------------------------
Training AdaBoost with Grid Search...




Best Parameters for AdaBoost: {'n_estimators': 50}
Accuracy: 0.21947194719471946

Classification Report:
                       precision    recall  f1-score   support

            Intrusion       0.22      0.14      0.17       126
  Mise à jour échouée       0.24      0.28      0.26       129
     Panne matérielle       0.17      0.29      0.22       106
Problème de connexion       0.25      0.24      0.25       133
            Surcharge       0.23      0.14      0.17       112

             accuracy                           0.22       606
            macro avg       0.22      0.22      0.21       606
         weighted avg       0.23      0.22      0.22       606


--------------------------------------------------------------------------------
Training Multinomial Naive Bayes with Grid Search...
Best Parameters for Multinomial Naive Bayes: {'alpha': 0.1}
Accuracy: 0.21782178217821782

Classification Report:
                       precision    recall  f1-score   support

            

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Parameters for k-Nearest Neighbors: {'n_neighbors': 3, 'weights': 'uniform'}
Accuracy: 0.20132013201320131

Classification Report:
                       precision    recall  f1-score   support

            Intrusion       0.21      0.34      0.26       126
  Mise à jour échouée       0.20      0.20      0.20       129
     Panne matérielle       0.17      0.33      0.23       106
Problème de connexion       0.25      0.14      0.18       133
            Surcharge       0.00      0.00      0.00       112

             accuracy                           0.20       606
            macro avg       0.17      0.20      0.17       606
         weighted avg       0.17      0.20      0.18       606


--------------------------------------------------------------------------------
Training Random Forest with Grid Search...
Best Parameters for Random Forest: {'max_depth': 30, 'n_estimators': 100}
Accuracy: 0.21782178217821782

Classification Report:
                       precision    recal

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Parameters for MLP Classifier (Neural Network): {'activation': 'relu', 'hidden_layer_sizes': (100,), 'solver': 'sgd'}
Accuracy: 0.17986798679867988

Classification Report:
                       precision    recall  f1-score   support

            Intrusion       0.27      0.18      0.22       126
  Mise à jour échouée       0.14      0.09      0.10       129
     Panne matérielle       0.17      0.42      0.24       106
Problème de connexion       0.00      0.00      0.00       133
            Surcharge       0.17      0.28      0.21       112

             accuracy                           0.18       606
            macro avg       0.15      0.19      0.16       606
         weighted avg       0.15      0.18      0.15       606


--------------------------------------------------------------------------------
Training Linear SVC with Grid Search...
Best Parameters for Linear SVC: {'C': 1}
Accuracy: 0.21782178217821782

Classification Report:
                       precision    

In [11]:
print(results_df)

                             Model  \
0  Support Vector Classifier (SVC)   
1                         AdaBoost   
2          Multinomial Naive Bayes   
3              k-Nearest Neighbors   
4                    Random Forest   
5                    Decision Tree   
6              Logistic Regression   
7  MLP Classifier (Neural Network)   
8                       Linear SVC   

                                     Best Parameters  Accuracy  Precision  \
0                        {'C': 0.1, 'kernel': 'rbf'}  0.176568   0.115837   
1                               {'n_estimators': 50}  0.219472   0.225145   
2                                     {'alpha': 0.1}  0.217822   0.220459   
3           {'n_neighbors': 3, 'weights': 'uniform'}  0.201320   0.172634   
4             {'max_depth': 30, 'n_estimators': 100}  0.217822   0.224397   
5          {'criterion': 'entropy', 'max_depth': 10}  0.219472   0.225145   
6                                         {'C': 100}  0.217822   0.224397   
7  