# **Import**

In [10]:
import re

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# **Load Data**

In [11]:
# Load the movie-review dataset from the Kaggle input directory into a DataFrame
data = pd.read_csv(
    "/kaggle/input/movie-review/movie_review.csv",
)

# **Pre-processing des données textuelles**

In [12]:
# Text pre-processing: build the set of English stop words.
# Use the locally installed NLTK corpus when it is available and only fall
# back to a download when the lookup fails. An unconditional download is
# wasted work when the corpus is present and just emits a network error when
# the kernel has no internet access.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # Corpus not installed yet: fetch it once, then retry. If the download
    # itself fails, the retry raises LookupError with NLTK's own guidance.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Turn a raw review into a list of lowercase tokens without stop words.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, the result is split on whitespace, and tokens
    found in the module-level ``stop_words`` set are dropped.

    Args:
        text: The raw review. Non-string values (for example the NaN that
            pandas uses for an empty CSV cell, or None) are treated as an
            empty document.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    # A missing value is not a str and has no .lower(); return no tokens
    # instead of raising AttributeError in the middle of a DataFrame.apply.
    if not isinstance(text, str):
        return []
    text = re.sub(r'[^\w\s]', '', text.lower())
    return [word for word in text.split() if word not in stop_words]

# Tokenize every review; each cell of the new column holds a list of tokens
data['preprocessed_text'] = data['text'].map(preprocess_text)

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


# **Modèle Word2Vec**

In [13]:
# Train the Word2Vec model (gensim.models.Word2Vec) on the tokenized reviews.
# The sentences are passed as a plain list of token lists, and the seed is
# spelled out so the run configuration is explicit.
# NOTE: with workers > 1 training is still not bit-for-bit reproducible
# because of thread scheduling; use workers=1 if exact reproducibility matters.
word2vec_model = Word2Vec(
    sentences=data['preprocessed_text'].tolist(),
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, including those seen only once
    workers=4,
    seed=42,
)

# **Vectorisation**

In [14]:
# Vectorization of the movie reviews
def vectorize_text(text):
    """Represent a tokenized review as the mean of its word embeddings.

    Args:
        text: Iterable of tokens for one review.

    Returns:
        numpy.ndarray: Element-wise mean of the embeddings of the tokens that
        are present in ``word2vec_model.wv``, or a zero vector of length
        ``word2vec_model.vector_size`` when none of the tokens is present.
    """
    embeddings = word2vec_model.wv
    known_vectors = [embeddings[token] for token in text if token in embeddings]
    if not known_vectors:
        # No usable token: fall back to an all-zero vector.
        return np.zeros(word2vec_model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every tokenized review; each cell of the new column holds one vector
data['vectorized_text'] = data['preprocessed_text'].map(vectorize_text)

# **Division des données**

In [15]:
# Build the feature matrix and the labels, then hold out 20% of the rows for testing
X = np.asarray(data['vectorized_text'].tolist())
y = data['tag'].eq('pos').astype(int)  # encode labels as integers: 1 for 'pos', 0 otherwise

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# **Construction d'un classificateur**

In [16]:
# Build and train the classifier on the training split.
# Alternative estimators that can be swapped in: RandomForestClassifier(), SVC().
model = LogisticRegression(
    max_iter=1000,  # upper bound on solver iterations
)

model.fit(X_train, y_train)

# **Évaluation du modèle**

In [17]:
# Evaluate the model on the held-out test split
y_pred = model.predict(X_test)

# Compute the four scores in a fixed order and keep them as named variables
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each score rounded to two decimals
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {value:.2f}")

Accuracy: 0.57
Precision: 0.57
Recall: 0.68
F1 Score: 0.62
