In [4]:
import pandas as pd
import mlflow
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
# from mlfow.models import infer_signature

In [5]:
df = pd.read_csv('incidents_reseau.csv')

df

Unnamed: 0,ID d'incident,Timestamp,Type d'incident,Description,Severité,Statut,Responsable
0,1,2023-09-24,Intrusion,échec mise jour logiciel,Élevée,Ouvert,Equipe Réseau
1,2,2023-07-12,Panne matérielle,mise jour firmware a échoué,Élevée,En cours,Equipe Sécurité
2,3,2023-05-26,Intrusion,carte réseau ne répond plus,Faible,En cours,Equipe Réseau
3,4,2023-04-16,Intrusion,échec mise jour logiciel,Moyenne,Ouvert,Equipe Support
4,5,2023-10-22,Intrusion,échec mise jour logiciel,Faible,Ouvert,Equipe Réseau
...,...,...,...,...,...,...,...
2995,2996,2023-10-09,Mise à jour échouée,connexion au vpn a échoué,Faible,Ouvert,Equipe Support
2996,2997,2023-11-14,Mise à jour échouée,malware a été trouvé sur serveur,Faible,Résolu,Equipe Support
2997,2998,2023-12-13,Problème de connexion,capacité bande passante dépassée,Critique,Résolu,Equipe Sécurité
2998,2999,2023-03-20,Mise à jour échouée,échec mise jour logiciel,Critique,Résolu,Equipe Support


## Data Preprocessing

In [6]:
# making time data into time format

df['Timestamp'] = pd.to_datetime(df['Timestamp'])

df['Timestamp']

0      2023-09-24
1      2023-07-12
2      2023-05-26
3      2023-04-16
4      2023-10-22
          ...    
2995   2023-10-09
2996   2023-11-14
2997   2023-12-13
2998   2023-03-20
2999   2023-05-03
Name: Timestamp, Length: 3000, dtype: datetime64[ns]

In [7]:
# normalizing text data


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Define the stop words
stop_words = set(stopwords.words('french'))

# Initialize the stemmer and lemmatizer
# stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()



def preprocess(text): 
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize text
    filtered_tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    # stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]  # Stem tokens
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]  # Lemmatize tokens
    return ' '.join(lemmatized_tokens)  # Join tokens back into a single string

   
    
df['Description'] = df['Description'].apply(preprocess)

df['Description']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melbey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\melbey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\melbey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0               échec mise jour logiciel
1            mise jour firmware a échoué
2               carte réseau répond plus
3               échec mise jour logiciel
4               échec mise jour logiciel
                      ...               
2995              connexion vpn a échoué
2996            malware a trouvé serveur
2997    capacité bande passante dépassée
2998            échec mise jour logiciel
2999        impossible daccéder internet
Name: Description, Length: 3000, dtype: object

# TF-IDF 

In [8]:
tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(df['Description'])

# Convert the TF-IDF matrix to a DataFrame 
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print(tfidf_matrix)
tfidf_df


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 11404 stored elements and shape (3000, 46)>
  Coords	Values
  (0, 44)	0.5809488806457799
  (0, 26)	0.4031109004683641
  (0, 23)	0.4031109004683641
  (0, 24)	0.5809488806457799
  (1, 26)	0.43384892607498365
  (1, 23)	0.43384892607498365
  (1, 17)	0.6100283207955893
  (1, 45)	0.5014136680571044
  (2, 5)	0.5
  (2, 33)	0.5
  (2, 32)	0.5
  (2, 30)	0.5
  (3, 44)	0.5809488806457799
  (3, 26)	0.4031109004683641
  (3, 23)	0.4031109004683641
  (3, 24)	0.5809488806457799
  (4, 44)	0.5809488806457799
  (4, 26)	0.4031109004683641
  (4, 23)	0.4031109004683641
  (4, 24)	0.5809488806457799
  (5, 34)	0.376975036955205
  (5, 31)	0.5347553401673897
  (5, 6)	0.5347553401673897
  (5, 18)	0.5347553401673897
  (6, 0)	0.5773502691896257
  :	:
  (2992, 27)	0.490821352467345
  (2992, 1)	0.490821352467345
  (2993, 34)	0.4502938559510357
  (2993, 25)	0.6313617993245821
  (2993, 41)	0.6313617993245821
  (2994, 34)	0.4384151864032215
  (2994, 43)	0.63552

Unnamed: 0,accès,autorisée,bande,base,capacité,carte,cessé,charge,connexion,daccéder,...,surchargé,suspect,sécurité,tentative,tombé,trouvé,vpn,web,échec,échoué
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.580949,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.501414
2,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.580949,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.580949,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.477067,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.684117,0.0,0.000000,0.551716
2996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.631362,0.000000,0.0,0.000000,0.000000
2997,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000
2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.580949,0.000000


In [9]:
s = set(df['Type d\'incident'])
s

{'Intrusion',
 'Mise à jour échouée',
 'Panne matérielle',
 'Problème de connexion',
 'Surcharge'}

# Train Test split

In [10]:
X = tfidf_matrix

y = df['Type d\'incident']

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


# Models

In [11]:


# Define the models and their respective parameter grids for Grid Search
models = {
    "Multinomial Naive Bayes": (MultinomialNB(), {'alpha': [0.1, 0.5, 1.0, 2.0]}),
    "k-Nearest Neighbors": (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']}),
    "Random Forest": (RandomForestClassifier(), {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]}),
    "Decision Tree": (DecisionTreeClassifier(), {'max_depth': [None, 10, 20, 30], 'criterion': ['gini', 'entropy']}),
    "MLP Classifier (Neural Network)": (MLPClassifier(max_iter=1000), {'hidden_layer_sizes': [(50,), (100,), (50, 50)], 'activation': ['relu', 'tanh'], 'solver': ['adam', 'sgd']})
}

# To store the results
results = []
models_best_grid = []


# Train and evaluate each model
for name, (model, param_grid) in models.items():


    print(f"Training {name} with Grid Search...")
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    models_best_grid.append(best_model)
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    results.append({
        'Model': name,
        'Best Parameters': grid_search.best_params_,
        'Accuracy': accuracy,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score']
    })
    print(f"Best Parameters for {name}: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy}\n")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}\n")
    print("-" * 80)

    # saving as pickle files
    
    # model_filename = f'{name.replace(" ", "_").lower()}_model.pkl'
    # with open(model_filename, 'wb') as file:
    #     pickle.dump(best_model, file)


naive_model = models_best_grid[0]
knn_model = models_best_grid[1]
forest_model = models_best_grid[2]
tree_model = models_best_grid[3]
mlp_model = models_best_grid[4]


Training Multinomial Naive Bayes with Grid Search...
Best Parameters for Multinomial Naive Bayes: {'alpha': 0.1}
Accuracy: 0.21782178217821782

Classification Report:
                       precision    recall  f1-score   support

            Intrusion       0.22      0.21      0.21       126
  Mise à jour échouée       0.24      0.28      0.26       129
     Panne matérielle       0.17      0.29      0.22       106
Problème de connexion       0.25      0.24      0.25       133
            Surcharge       0.20      0.06      0.10       112

             accuracy                           0.22       606
            macro avg       0.22      0.22      0.21       606
         weighted avg       0.22      0.22      0.21       606


--------------------------------------------------------------------------------
Training k-Nearest Neighbors with Grid Search...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Parameters for k-Nearest Neighbors: {'n_neighbors': 3, 'weights': 'uniform'}
Accuracy: 0.23267326732673269

Classification Report:
                       precision    recall  f1-score   support

            Intrusion       0.22      0.44      0.29       126
  Mise à jour échouée       0.26      0.46      0.33       129
     Panne matérielle       0.21      0.15      0.17       106
Problème de connexion       0.22      0.08      0.11       133
            Surcharge       0.00      0.00      0.00       112

             accuracy                           0.23       606
            macro avg       0.18      0.23      0.18       606
         weighted avg       0.18      0.23      0.19       606


--------------------------------------------------------------------------------
Training Random Forest with Grid Search...
Best Parameters for Random Forest: {'max_depth': 10, 'n_estimators': 50}
Accuracy: 0.21782178217821782

Classification Report:
                       precision    recall

In [12]:
# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
print(results_df)
results_df.to_csv("resultat.csv")

                             Model  \
0          Multinomial Naive Bayes   
1              k-Nearest Neighbors   
2                    Random Forest   
3                    Decision Tree   
4  MLP Classifier (Neural Network)   

                                     Best Parameters  Accuracy  Precision  \
0                                     {'alpha': 0.1}  0.217822   0.220459   
1           {'n_neighbors': 3, 'weights': 'uniform'}  0.232673   0.184868   
2              {'max_depth': 10, 'n_estimators': 50}  0.217822   0.224397   
3          {'criterion': 'entropy', 'max_depth': 10}  0.219472   0.225145   
4  {'activation': 'relu', 'hidden_layer_sizes': (...  0.178218   0.185795   

     Recall  F1-Score  
0  0.217822  0.209448  
1  0.232673  0.186628  
2  0.217822  0.213619  
3  0.219472  0.215831  
4  0.178218  0.167744  


# Pipeline

In [13]:
# cette partie est juste pour le test 
# m = GridSearchCV(LinearSVC(max_iter=1000), cv=5, scoring='accuracy')

pipe = Pipeline([('tfidf', tfidf_vectorizer), ('smth', naive_model)])

X = df['Description']
y = df["Type d\'incident"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


pipe.fit(X_train, y_train)
print(type(X_test))



# with open('pipeline_incident_report.pkl', 'wb') as file:
#     pickle.dump(pipe, file)

<class 'pandas.core.series.Series'>


# mlflow

In [16]:
# Extract the best parameters for the MLP model
best_mlp_params = grid_search.best_params_


mlflow.set_experiment("first attempt")
# Start an MLflow run
with mlflow.start_run():
    # Log the model name
    mlflow.log_param("Model", "MLP Classifier")

    # Log each hyperparameter for the MLP model
    mlflow.log_param("hidden_layer_sizes", best_mlp_params['hidden_layer_sizes'])
    mlflow.log_param("activation", best_mlp_params['activation'])
    mlflow.log_param("solver", best_mlp_params['solver'])
    
    # Log the model's performance metrics
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("Precision", report['weighted avg']['precision'])
    mlflow.log_metric("Recall", report['weighted avg']['recall'])
    mlflow.log_metric("F1-Score", report['weighted avg']['f1-score'])


