## GUERRIER OULOTO ET MOULOUD HAMENNI

In [None]:
# Importations et téléchargement des ressources
import pandas as pd
import numpy as np
import re
import os
import pickle
import hashlib
import nltk

# Téléchargement des ressources NLTK (à exécuter une seule fois)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/guerrierouloto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/guerrierouloto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/guerrierouloto/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Cellule 2 : Chargement du jeu de données SMS
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# On mappe 'ham' à 0 et 'spam' à 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Fonction de prétraitement
def preprocess_text(text):
    # Conversion en minuscules
    text = text.lower()
    # Suppression des caractères non alphabétiques
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenisation simple (séparation par espace)
    tokens = text.split()
    # Suppression des stop words en anglais (adapté au dataset)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    # Lemmatisation des tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

# Application du prétraitement
df['processed_message'] = df['message'].apply(preprocess_text)
df.head()

Unnamed: 0,label,message,processed_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


In [5]:
# Séparation des données
X = df['processed_message']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Vectorisation TF-IDF
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [7]:
# Entraînement et évaluation des modèles

# Modèle 1 : Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)
lr_pred = lr.predict(X_test_tfidf)
lr_f1 = f1_score(y_test, lr_pred)

# Modèle 2 : Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
nb_pred = nb.predict(X_test_tfidf)
nb_f1 = f1_score(y_test, nb_pred)

# Modèle 3 : Linear SVC
svm = LinearSVC(max_iter=1000)
svm.fit(X_train_tfidf, y_train)
svm_pred = svm.predict(X_test_tfidf)
svm_f1 = f1_score(y_test, svm_pred)

print("F1 Score Logistic Regression:", lr_f1)
print("F1 Score MultinomialNB:", nb_f1)
print("F1 Score Linear SVC:", svm_f1)


F1 Score Logistic Regression: 0.8593155893536122
F1 Score MultinomialNB: 0.8754716981132076
F1 Score Linear SVC: 0.9440559440559441


In [8]:
# Sélection du meilleur modèle selon le F1 score
scores = {'LogisticRegression': lr_f1, 'MultinomialNB': nb_f1, 'LinearSVC': svm_f1}
best_model_name = max(scores, key=scores.get)
print("Meilleur modèle :", best_model_name, "avec F1 score :", scores[best_model_name])

# Sauvegarde du meilleur modèle dans la variable best_model
if best_model_name == 'LogisticRegression':
    best_model = lr
elif best_model_name == 'MultinomialNB':
    best_model = nb
else:
    best_model = svm

Meilleur modèle : LinearSVC avec F1 score : 0.9440559440559441


In [9]:
# Sauvegarde des fichiers et calcul du hash du modèle
# Sauvegarde du meilleur modèle
with open("model.pkl", "wb") as f:
    pickle.dump(best_model, f)

# Sauvegarde du vectorizer TF-IDF
with open("tfidf.pkl", "wb") as f:
    pickle.dump(tfidf, f)

# Calcul du hash SHA256 du fichier model.pkl
h = hashlib.sha256()
with open("model.pkl", "rb") as f:
    while chunk := f.read(8192):
        h.update(chunk)
model_hash = h.hexdigest()

# Sauvegarde du hash dans model_hash.txt
with open("model_hash.txt", "w") as f:
    f.write(model_hash)

print("Meilleur modèle et TF-IDF vectorizer sauvegardés avec succès.")
print("Hash du modèle :", model_hash)


Meilleur modèle et TF-IDF vectorizer sauvegardés avec succès.
Hash du modèle : 4a8899512777fa8b103bfe54503240750532a5c41327a6be1b1d6eb8f1f2d80d
