**Importation des bibliothèques nécessaires**

In [6]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import spacy
import string
import pickle
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

**Exploration des données**

In [9]:
dataset = pd.read_csv('/content/IMDB Dataset.csv')

In [10]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [15]:
print(f'Rows: {dataset.shape[1]}\nColumns: {dataset.shape[0]}')

Rows: 2
Columns: 68314


In [16]:
print(f'Columns Names: {list(dataset.columns)}')

Columns Names: ['review', 'sentiment']


In [17]:
nlp = English()
stopwords = list(STOP_WORDS)
punctuations = string.punctuation

In [18]:
def tokenizer(sentence):
    mytokens = nlp(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

**Transformation and Vectorization**

In [19]:
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    return text.strip().lower()

In [20]:
vectorizer = CountVectorizer(tokenizer = tokenizer, ngram_range=(1,1))
tfvectorizer = TfidfVectorizer(tokenizer = tokenizer)

In [21]:
X = dataset['review']
y = dataset['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77)

In [22]:
print(X_train.head())


54858    I love this show. My girlfriend was gonna get ...
38064    You'll probably never see it, but the uncut ve...
31112    This was quite possibly the worst movie I have...
13725    In a movie that follows a struggling actor, pl...
34159    Without a doubt, 12 MONKEYS is one of the best...
Name: review, dtype: object


In [23]:
# Vérifie si certains documents sont vides après nettoyage
print(X_train.isnull().sum())  # Vérifie les valeurs nulles
print(X_train.str.len().describe())  # Vérifie la longueur des textes


0
count    54651.000000
mean      1307.500595
std        984.739663
min         32.000000
25%        699.000000
50%        971.000000
75%       1591.000000
max      13704.000000
Name: review, dtype: float64


In [24]:
def predictors():
    def clean_text(text):
        # Nettoyage de base, convertir en minuscules et enlever la ponctuation
        text = text.lower()
        text = ''.join([char for char in text if char.isalnum() or char.isspace()])
        return text
    return clean_text


In [25]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # No fitting needed for text cleaning, so just return self
        return self

    def transform(self, X):
        # Apply your cleaning function to each text entry
        return [clean_text(text) for text in X]  # Assuming clean_text is your cleaning function


In [26]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pickle
import re

# Custom text cleaner transformer
class TextCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [clean_text(text) for text in X]

# Define the text cleaning function
def clean_text(text):
    # Example cleaning steps
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.strip()  # Remove leading/trailing whitespace
    return text

# Define your vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Set up the classifier and pipeline
classifier = LogisticRegression()

LRmodel = Pipeline([
    ("cleaner", TextCleaner()),  # Use the custom text cleaner
    ('vectorizer', vectorizer),
    ('classifier', classifier)
])

# Check if there are any empty documents in the training or test sets
print(f'Number of empty documents in X_train: {sum([len(doc) == 0 for doc in X_train])}')
print(f'Number of empty documents in X_test: {sum([len(doc) == 0 for doc in X_test])}')

# Remove empty documents if there are any
X_train = [doc for doc in X_train if len(doc) > 0]
X_test = [doc for doc in X_test if len(doc) > 0]

# Train the Model
LRmodel.fit(X_train, y_train)
LRpred = LRmodel.predict(X_test)

# Print evaluation results
print(f'Confusion Matrix:\n{confusion_matrix(y_test, LRpred)}')
print(f'\nClassification Report:\n{classification_report(y_test, LRpred)}')
print(f'Accuracy: {accuracy_score(y_test, LRpred) * 100}%')

# Save the model



Number of empty documents in X_train: 0
Number of empty documents in X_test: 0
Confusion Matrix:
[[6202  666]
 [ 520 6275]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.92      0.90      0.91      6868
    positive       0.90      0.92      0.91      6795

    accuracy                           0.91     13663
   macro avg       0.91      0.91      0.91     13663
weighted avg       0.91      0.91      0.91     13663

Accuracy: 91.31962233770035%


In [27]:
import os

# Create the directory if it doesn't exist
os.makedirs('/saved_model', exist_ok=True)

# Now save the model
pickle.dump(LRmodel, open('/saved_model/LinearRegression_model.sav', 'wb'))
print('Logistic Regression trained Model Saved')


Logistic Regression trained Model Saved


LSTM

In [32]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.datasets import imdb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np


# Padding des séquences pour avoir une longueur uniforme
x_train = pad_sequences(x_train, maxlen=200)
x_test = pad_sequences(x_test, maxlen=200)

# Création du modèle LSTM
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=200))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compilation du modèle
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Entraînement du modèle
model.fit(x_train, y_train, epochs=5, batch_size=64, validation_data=(x_test, y_test))

# Prédiction sur les données de test
predictions = model.predict(x_test)

# Conversion des prédictions en classes (0 ou 1)
y_pred = (predictions > 0.5).astype(int)

# Calcul des métriques
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Epoch 1/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m343s[0m 845ms/step - accuracy: 0.7023 - loss: 0.5508 - val_accuracy: 0.8206 - val_loss: 0.4053
Epoch 2/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 811ms/step - accuracy: 0.8641 - loss: 0.3301 - val_accuracy: 0.8470 - val_loss: 0.3613
Epoch 3/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 814ms/step - accuracy: 0.8932 - loss: 0.2715 - val_accuracy: 0.8493 - val_loss: 0.4116
Epoch 4/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 816ms/step - accuracy: 0.9010 - loss: 0.2536 - val_accuracy: 0.8499 - val_loss: 0.3711
Epoch 5/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 812ms/step - accuracy: 0.9050 - loss: 0.2423 - val_accuracy: 0.8377 - val_loss: 0.4136
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 80ms/step
Accuracy: 0.8377
Precision: 0.8731
Recall: 0.7903
F1-Score: 0.8296


**LinearSVC**

In [37]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import pickle

# Préparation des données
X = dataset['review']
y = dataset['sentiment']

# Division train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77)

# Vérification des dimensions
print("Dimensions initiales:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)

# Création d'une classe transformateur personnalisée
class TextCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        cleaned_text = []
        for text in X:
            # Si vous aviez une fonction de nettoyage précédente, insérez-la ici
            cleaned_text.append(text)
        return cleaned_text

# Création du pipeline
SVCclassifier = LinearSVC(random_state=77)
SVCmodel = Pipeline([
    ("cleaner", TextCleaner()),
    ('vectorizer', vectorizer),
    ('classifier', SVCclassifier)
])

# Entraînement du modèle
SVCmodel.fit(X_train, y_train)

# Prédictions
SVCpred = SVCmodel.predict(X_test)

# Affichage des métriques
print("\nRésultats:")
print(f'Confusion Matrix:\n{confusion_matrix(y_test,SVCpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,SVCpred)}')
print(f'Accuracy: {accuracy_score(y_test,SVCpred)*100:.2f}%')



Dimensions initiales:
X_train: (54651,)
y_train: (54651,)

Résultats:
Confusion Matrix:
[[6397  471]
 [ 418 6377]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.94      0.93      0.94      6868
    positive       0.93      0.94      0.93      6795

    accuracy                           0.93     13663
   macro avg       0.93      0.93      0.93     13663
weighted avg       0.93      0.93      0.93     13663

Accuracy: 93.49%
