**Pre-processing des données textuelles :**

In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('stopwords')
nltk.download('punkt')

df = pd.read_csv('movie_review.csv')

stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.lower() not in punctuation]
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text


df['text'] = df['text'].apply(preprocess_text)

print(df[['text']])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                    text
0      films adapted comic books plenty success wheth...
1      starters created alan moore eddie campbell bro...
2      say moore campbell thoroughly researched subje...
3      book `` graphic novel `` 500 pages long includ...
4                          words n't dismiss film source
...                                                  ...
64715    lack inspiration traced back insipid characters
64716  like many skits current incarnation _saturday_...
64717  watching one `` roxbury `` skits snl come away...
64718                         bump unsuspecting women 's
64719  watching _a_night_at_the_roxbury_ 'll left exa...

[64720 rows x 1 columns]


**Entraînement du modèle Word2Vec**

In [8]:
from gensim.models import Word2Vec


# texts_tokens = [text.split() for text in df['text']]
# model = Word2Vec([texts_tokens], vector_size=100, window=5, min_count=1, sg=1)
# print("exemple d'un vecteur : " , model.wv('inspiration'))

text_tokens = [text.split() for text in df['text']]


model = Word2Vec(sentences=text_tokens,vector_size=100, window=5, min_count=1, sg=1)


print(" exemple d'un vecteur d'un mot : ", model.wv['inspiration'])


 exemple d'un vecteur d'un mot :  [-0.23559007  0.40418655  0.11580639  0.21065067  0.02162356 -0.5034731
  0.13404317  0.52630407 -0.21356523 -0.2636159  -0.11000752 -0.35239437
 -0.04663207  0.16588633 -0.09982069 -0.10590639  0.24496631 -0.18828876
 -0.04561688 -0.5620554   0.1135729   0.07947256  0.466287   -0.14504506
  0.09519174  0.18905284 -0.14234002 -0.07610427 -0.3000506  -0.011778
  0.29179734  0.0595244   0.04057891  0.06723082 -0.19593403  0.2853755
  0.22905973  0.03153455 -0.09895346 -0.28737545 -0.06044609 -0.4079276
 -0.17111874 -0.09441631 -0.08158323  0.00140173 -0.11039234 -0.13458629
  0.24698277  0.30997425  0.1099103  -0.35118636 -0.30065376 -0.09958108
 -0.03511606  0.03536397  0.1815625   0.00672357 -0.33256793 -0.0748883
  0.10352629 -0.03633954  0.05016884 -0.17300726 -0.29831767  0.18979692
  0.17587441  0.1454907  -0.28186393  0.4250123  -0.17378578  0.09468421
  0.26998743 -0.08534052  0.22991167  0.11889467  0.09849129  0.15451247
 -0.2955756   0.0991365

**Vectorisation des reviews de movies**

In [11]:
import numpy as np


def get_review_vector(review, model, vector_size):

    words = [word for word in review.split() if word in model.wv.key_to_index]
    if not words:
        return np.zeros(vector_size)
    word_vectors = [model.wv[word] for word in words]
    review_vector = np.mean(word_vectors, axis=0)
    return review_vector

df['vector_review'] = df['text'].apply(lambda x: get_review_vector(x, model, 100))

print(df[['text', 'vector_review']])

                                                    text  \
0      films adapted comic books plenty success wheth...   
1      starters created alan moore eddie campbell bro...   
2      say moore campbell thoroughly researched subje...   
3      book `` graphic novel `` 500 pages long includ...   
4                          words n't dismiss film source   
...                                                  ...   
64715    lack inspiration traced back insipid characters   
64716  like many skits current incarnation _saturday_...   
64717  watching one `` roxbury `` skits snl come away...   
64718                         bump unsuspecting women 's   
64719  watching _a_night_at_the_roxbury_ 'll left exa...   

                                           vector_review  
0      [-0.2619613, 0.44805953, 0.07471598, 0.1266571...  
1      [-0.19811782, 0.31752104, 0.06400895, 0.121749...  
2      [-0.10082244, 0.50200546, 0.19527158, 0.084221...  
3      [-0.31283447, 0.3323612, 0.13528155,

**Division des données**

In [21]:
from sklearn.model_selection import train_test_split


train_data, test_data = train_test_split(df, test_size=0.2)
X_train = np.vstack(train_data['vector_review'].values)
y_train = train_data['tag']
X_test = np.vstack(test_data['vector_review'].values)
y_test = test_data['tag']

**Construction d'un classificateur**

In [25]:
from sklearn.linear_model import LogisticRegression


logistic_model = LogisticRegression(max_iter=1000, random_state=42)


logistic_model.fit(X_train, y_train)


y_predictions = logistic_model.predict(X_test)

print(y_predictions)

['pos' 'pos' 'pos' ... 'pos' 'pos' 'neg']


**Évaluation du modèle**

In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

accuracy = accuracy_score(y_test, y_predictions)
precision = precision_score(y_test, y_predictions, pos_label='pos')
recall = recall_score(y_test, y_predictions, pos_label='pos')
f1 = f1_score(y_test, y_predictions, pos_label='pos')

print("Accuracy :", accuracy)
print("Precision :", precision)
print("Recall :", recall)
print("F1-score :", f1)

Accuracy : 0.5801143386897404
Precision : 0.5819773006973882
Recall : 0.6415435634609586
F1-score : 0.610310461031046
