In [None]:
import pandas as pd
import os

In [None]:
# Root directory of the extracted aclImdb dataset; the loading cell reads
# <folder>/<test|train>/<pos|neg>/ from here.
folder = 'aclImdb'

## Run this section if you don't have movie_data.csv yet

In [None]:
# Build the labelled review table from the raw dataset folders
# (<folder>/<test|train>/<pos|neg>/<file>), mapping 'pos' -> 1 and 'neg' -> 0.
labels = {'pos': 1, 'neg': 0}

# Collect the rows in a plain list and build the DataFrame once at the end:
# growing a frame with DataFrame.append inside the loop copies the whole frame
# on every iteration (quadratic cost), and that method was removed in
# pandas 2.0.
records = []
for f in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(folder, f, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                records.append([infile.read(), labels[l]])

# Columns keep the default integer names (0, 1) and the index is 0..n-1,
# exactly as with append(..., ignore_index=True).
df = pd.DataFrame(records)

In [None]:
# Name the two columns: the review text and its 0/1 sentiment label.
df.columns = ['review', 'sentiment']

In [None]:
# Cache the assembled table as CSV (without the index column) so it can be
# reloaded with read_csv instead of re-reading every review file.
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [None]:
# Preview the first rows of the assembled table.
df.head()

## Start here if you already have movie_data.csv

In [None]:
import pandas as pd
import os

In [None]:
# Load the cached review table (columns: review, sentiment) from disk.
df = pd.read_csv('movie_data.csv')

In [None]:
# Display the loaded table.
df

In [None]:
import nltk
from nltk import word_tokenize,sent_tokenize

# Download the NLTK resources used further down: 'punkt' for tokenization
# and 'stopwords' for the English stop-word list.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

In [None]:
# Join every review into one long string, separated by single spaces.
reviews = df.review.str.cat(sep=' ')

In [None]:
# Split the concatenated review text into word tokens.
tokens = word_tokenize(reviews)

In [None]:
# Distinct tokens across all reviews, and how many there are.
vocabulary = set(tokens)
print(len(vocabulary))

In [None]:
# Count how often each token occurs across all reviews.
frequency_dist = nltk.FreqDist(tokens)

# sort the words by frequency, most frequent first
# sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)

In [None]:
from nltk.corpus import stopwords

In [None]:
# Drop every token that appears in NLTK's English stop-word list
# (exact, case-sensitive match against the token).
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Count words from the current `tokens` list so that the stop-word filtering
# applied to it is reflected in the cloud. `frequency_dist` was computed
# before that filtering and therefore still contains the stop words.
filtered_frequency_dist = nltk.FreqDist(tokens)

# Render the word cloud as an image with the axes hidden.
wordcloud = WordCloud().generate_from_frequencies(filtered_frequency_dist)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# Data Preprocessing

In [None]:
# Rows with index labels 0..24999 form the training split and every row from
# label 25000 onwards the test split (`.loc` label slices include the end).
train_rows = df.loc[:24999]
test_rows = df.loc[25000:]

X_train, y_train = train_rows['review'].values, train_rows['sentiment'].values
X_test, y_test = test_rows['review'].values, test_rows['sentiment'].values

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# Fit the TF-IDF vectorizer on the training reviews only, then reuse it
# to transform the test reviews.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

# Show the shapes of both feature matrices.
print(train_vectors.shape, test_vectors.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier trained on the TF-IDF features.
clf = MultinomialNB()
clf.fit(train_vectors, y_train);

In [None]:
from sklearn.metrics import accuracy_score

# Score the Naive Bayes classifier on the held-out reviews.
predicted = clf.predict(test_vectors)
test_accuracy = accuracy_score(y_test, predicted)
print(test_accuracy)

# PART 2 : With tensorflow

In [None]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [None]:
# Keras text tokenizer created with its default settings; it is fitted on the
# review texts in a later cell.
tokenizer_obj = Tokenizer()

In [None]:
# X_train and X_test are NumPy arrays (taken with `.values`), so `+` on them
# operates element-wise: it would glue the i-th train review to the i-th test
# review instead of joining the two collections. Converting to lists first
# makes `+` concatenate them into one sequence containing every review.
total_reviews = list(X_train) + list(X_test)

In [None]:
# Fit the tokenizer on all review texts; its word_index is read afterwards to
# size the vocabulary.
tokenizer_obj.fit_on_texts(total_reviews)

In [None]:
# Length of the longest review, counted in whitespace-separated words;
# used below as the target length when padding the sequences.
max_length = max(len(review.split()) for review in total_reviews)

In [None]:
# Vocabulary size: number of entries in the tokenizer's word index, plus one
# (presumably because index 0 is reserved for padding - confirm against the
# Keras Tokenizer documentation).
vocab_size = len(tokenizer_obj.word_index) + 1

# Transform data to integer

In [None]:
# Encode each review as the list of integer ids assigned by the fitted tokenizer.
X_train_tokens, X_test_tokens = (
    tokenizer_obj.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Bring every encoded review to the same length (max_length), padding at the
# end of the sequence.
pad_options = dict(maxlen=max_length, padding='post')
X_train_pad = pad_sequences(X_train_tokens, **pad_options)
X_test_pad = pad_sequences(X_test_tokens, **pad_options)

## Build model

In [None]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense,Dropout,Dropout,LSTM, Reshape, GRU
from tensorflow.python.keras.layers.convolutional import Conv2D
from tensorflow.python.keras.layers.embeddings import Embedding
from tensorflow.python.keras.utils import np_utils
from tensorflow.python.keras.callbacks import TensorBoard

In [None]:
# Dimensionality of the word vectors; used by the Keras Embedding layer and
# by the Word2Vec model trained later.
EMBEDDING_DIM = 100

In [None]:
# Sentiment classifier: embedding layer -> single GRU layer -> one sigmoid
# output unit.
model = Sequential([
    Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length),
    GRU(units=32, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy loss for the single sigmoid output, optimised with SGD;
# accuracy is tracked as the metric.
model.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])

In [None]:
# Print the model's layer summary.
model.summary()

In [None]:
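# NOTE: training is disabled here; uncomment the call below to train the model
# before using it for the predictions further down.
# model.fit(X_train_pad,y_train, batch_size=128, epochs=25, validation_data=(X_test_pad,y_test),verbose=2)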

In [None]:
# Hand-written reviews used to sanity-check the model's predictions.
test_sample_1 = "This movie is fantastic! I really like it because it is so good!"
test_sample_2 = "Good movie!"
test_sample_3 = "Maybe I like this movie."
test_sample_4 = "Not not my taste, will skip and watch another movie"
test_sample_5 = "if you like action, then this movie might be good for you."
test_sample_6 = "Bad movie!"
test_sample_7 = "Not a good movie!"
test_sample_8 = "This movie really sucks! Can I get my money back please ?"
test_samples = [
    test_sample_1, test_sample_2, test_sample_3, test_sample_4,
    test_sample_5, test_sample_6, test_sample_7, test_sample_8,
]

# Encode the samples with the same fitted tokenizer as the training data.
test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)
# Pad at the end (padding='post'), exactly like X_train_pad / X_test_pad.
# The default is padding='pre', which would feed the recurrent layer inputs
# laid out differently from the data it is trained and evaluated on.
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length, padding='post')

In [None]:
# Predict a sentiment score for each hand-written sample.
# NOTE(review): the model.fit call above is commented out, so unless it is
# re-enabled these predictions come from a model that has not been trained.
model.predict(x=test_samples_tokens_pad)

# Train word2vec Embedding

In [None]:
import gensim
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Accumulator for the cleaned token lists (one per review), and the raw review
# texts as a plain Python list.
review_lines = []
lines = df['review'].tolist()

In [None]:
# The punctuation-stripping table and the stop-word set are the same for every
# review, so build them once instead of on each loop iteration.
table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))

for line in lines:
    # Tokenize the review and convert every token to lower case.
    tokens = [w.lower() for w in word_tokenize(line)]

    # Remove punctuation characters from each token.
    stripped = [w.translate(table) for w in tokens]

    # Keep only purely alphabetic tokens that are not stop words.
    words = [w for w in stripped if w.isalpha() and w not in stop_words]
    review_lines.append(words)

In [None]:
# Number of cleaned reviews collected (one token list per review).
len(review_lines)

In [None]:
# Train Word2Vec on the cleaned reviews: EMBEDDING_DIM-dimensional vectors,
# context window of 5, 4 worker threads, min_count=1.
# NOTE(review): this rebinds `model`, replacing the Keras model defined earlier
# in the notebook.
model = gensim.models.Word2Vec(sentences=review_lines, size=EMBEDDING_DIM, window=5, workers=4, min_count=1)

In [None]:
# Vocabulary: the list of words held by the trained Word2Vec model.
words = list(model.wv.vocab)

In [None]:
# Report the number of words in the Word2Vec vocabulary.
print('vocabulary size : %d' % len(words))

In [None]:
# Words the model ranks as most similar to 'horrible'.
model.wv.most_similar('horrible')

In [None]:
# Semantic result of combining word vectors (king - man + woman).
model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

In [None]:
# Odd one out: ask the model which of these words does not belong with the rest.
candidates = "woman king queen movie".split()
print(model.wv.doesnt_match(candidates))