### Chargement du dataset

In [1]:
import pandas as pd

df = pd.read_csv("MovieReview.csv")
display(df.head())
print(df.shape)

df = df.drop('sentiment', axis=1)

Unnamed: 0,sentiment,review
0,Positive,With all this stuff going down at the moment w...
1,Positive,'The Classic War of the Worlds' by Timothy Hin...
2,Negative,The film starts with a manager (Nicholas Bell)...
3,Negative,It must be assumed that those who praised this...
4,Positive,Superbly trashy and wondrously unpretentious 8...


(25000, 2)


### Nettoyage

In [3]:
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download()
stop_words = stopwords.words('english')

# Converts the unicode file to ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!]+", " ", w)
    w = re.sub(r'\b\w{0,2}\b', '', w)

    # remove stopword
    mots = word_tokenize(w.strip())
    mots = [mot for mot in mots if mot not in stop_words]
    return ' '.join(mots).strip()

df.review = df.review.apply(lambda x :preprocess_sentence(x))
df.head()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


Unnamed: 0,review
0,stuff going moment started listening music wat...
1,classic war worlds timothy hines entertaining ...
2,film starts manager nicholas bell giving welco...
3,must assumed praised film greatest filmed oper...
4,superbly trashy wondrously unpretentious explo...


### Tokenization

In [5]:
import tensorflow as tf
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df.review)

In [6]:
word2idx = tokenizer.word_index
idx2word = tokenizer.index_word
vocab_size = tokenizer.num_words

In [10]:
print (word2idx)
print (idx2word)
print (vocab_size)

10000


### Modélisation

In [9]:
import numpy as np


def sentenceToData(tokens, WINDOW_SIZE):
    window = np.concatenate((np.arange(-WINDOW_SIZE,0),np.arange(1,WINDOW_SIZE+1)))
    X,Y=([],[])
    for word_index, word in enumerate(tokens) :
        if ((word_index - WINDOW_SIZE >= 0) and (word_index + WINDOW_SIZE <= len(tokens) - 1)) :
            X.append(word)
            Y.append([tokens[word_index-i] for i in window])
    return X, Y


WINDOW_SIZE = 5

X, Y = ([], [])
for review in df.review:
    for sentence in review.split("."):
        word_list = tokenizer.texts_to_sequences([sentence])[0]
        if len(word_list) >= WINDOW_SIZE:
            Y1, X1 = sentenceToData(word_list, WINDOW_SIZE//2)
            X.extend(X1)
            Y.extend(Y1)
    
X = np.array(X).astype(int)
y = np.array(Y).astype(int).reshape([-1,1])

In [23]:
print(X[0])
print (y[0])
print (df['review'])
print ([idx2word[tok] for tok in X[0]])
print (idx2word[414])

[2424  499   69  398]
[414]
0        stuff going moment started listening music wat...
1        classic war worlds timothy hines entertaining ...
2        film starts manager nicholas bell giving welco...
3        must assumed praised film greatest filmed oper...
4        superbly trashy wondrously unpretentious explo...
                               ...                        
24995    seems like consideration gone imdb reviews fil...
24996    believe made film . completely unnecessary . f...
24997    guy loser . get girls needs build picked stron...
24998    minute documentary bunuel made early one spain...
24999    saw movie child broke heart ! story unfinished...
Name: review, Length: 25000, dtype: object
['listening', 'started', 'going', 'stuff']
moment


In [24]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D

embedding_dim = 300
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(GlobalAveragePooling1D())
model.add(Dense(vocab_size, activation='softmax'))

In [27]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, batch_size = 128, epochs=5)

Epoch 1/5
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m586s[0m 48ms/step - accuracy: 0.0354 - loss: 7.5599
Epoch 2/5
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m590s[0m 49ms/step - accuracy: 0.0604 - loss: 6.9041
Epoch 3/5
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m604s[0m 50ms/step - accuracy: 0.0768 - loss: 6.4861
Epoch 4/5
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m633s[0m 52ms/step - accuracy: 0.0899 - loss: 6.1628
Epoch 5/5
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m649s[0m 53ms/step - accuracy: 0.1014 - loss: 5.8980


<keras.src.callbacks.history.History at 0x1ade951aa20>

In [28]:
model.summary()

### Sauvegarde

In [29]:
model.save("word2vec.h5")

