In [14]:
import pandas as pd

In [43]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

In [41]:
import re
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential

In [16]:
data = pd.read_csv(r"C:/Users/Adminetu/Downloads/amazon_review_polarity_csv/train.csv")

In [17]:
data.head()

Unnamed: 0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...


In [30]:
new_columns = ['note', 'RaisonNote','Comment']

In [31]:
data = pd.read_csv(r"C:/Users/Adminetu/Downloads/amazon_review_polarity_csv/train.csv" , names=new_columns)

In [32]:
data.head()

Unnamed: 0,note,RaisonNote,Comment
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [33]:
data['note'].unique()

array([2, 1], dtype=int64)

either 1 (positive) or 2 (negative) rating.

https://www.kaggle.com/datasets/bhavikardeshna/amazon-customerreviews-polarity

In [34]:
data=data.dropna(subset='note')

In [35]:
data.shape[0]

3600000

In [36]:
data = data.drop_duplicates()

In [37]:
data = data[['note','Comment']]

In [38]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])
    return text

In [39]:
data['Comment_clean'] = data['Comment'].apply(clean_text)

In [42]:
# Encoder les étiquettes
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['note']) 
labels = to_categorical(data['label'])

# Réseaux de neurones - LSTM

In [44]:
# Préparer les données pour le modèle
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['Comment_clean'])
sequences = tokenizer.texts_to_sequences(data['Comment_clean'])
X = pad_sequences(sequences, maxlen=max_len)
y = labels

In [45]:
# Séparer les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
# Construire le modèle
model = Sequential()
model.add(Embedding(max_words, 100))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(y.shape[1], activation='softmax'))

In [48]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [49]:
# Entraîner le modèle
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m72000/72000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9061s[0m 126ms/step - accuracy: 0.8960 - loss: 0.2554 - val_accuracy: 0.9291 - val_loss: 0.1820
Epoch 2/10
[1m72000/72000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9483s[0m 132ms/step - accuracy: 0.9323 - loss: 0.1769 - val_accuracy: 0.9324 - val_loss: 0.1786
Epoch 3/10
[1m72000/72000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10521s[0m 146ms/step - accuracy: 0.9387 - loss: 0.1622 - val_accuracy: 0.9336 - val_loss: 0.1765
Epoch 4/10
[1m72000/72000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9719s[0m 135ms/step - accuracy: 0.9426 - loss: 0.1532 - val_accuracy: 0.9340 - val_loss: 0.1762
Epoch 5/10
[1m72000/72000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29987s[0m 416ms/step - accuracy: 0.9452 - loss: 0.1478 - val_accuracy: 0.9327 - val_loss: 0.1770
Epoch 6/10
[1m72000/72000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32015s[0m 445ms/step - accuracy: 0.9466 - loss: 0.1441 - val_accurac

In [50]:
# Évaluer le modèle
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

[1m22500/22500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1404s[0m 62ms/step - accuracy: 0.9315 - loss: 0.1875
Test Accuracy: 0.9314361214637756


## Support Vector Machine

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Comment_clean'])

MemoryError: 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, data['label'], test_size=0.2, random_state=42)

In [51]:
# Entrainement du modèle SVM
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

NameError: name 'SVC' is not defined

In [None]:
# Evaluation
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))