In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the review table from the working directory. The encoding is passed
# explicitly (presumably the file is not valid UTF-8 -- confirm if it changes).
raw_csv = "imdb_master.csv"
df = pd.read_csv(raw_csv, encoding="ISO-8859-1")

In [3]:
# Map the sentiment strings to integers: "neg" -> 0, "pos" -> 1.
# Note that DataFrame.replace is applied to every column of the frame, so any
# cell whose whole value equals "neg"/"pos" is rewritten, not only `label`.
df = (
    df
    .replace("neg", 0)
    .replace("pos", 1)
)
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,0,0_2.txt
1,1,test,This is an example of why the majority of acti...,0,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",0,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,0,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,0,10003_3.txt


In [4]:
# Split the table on its `type` column and keep only the text and the label.
# NOTE(review): the recorded output shows 75000 train rows vs 25000 test rows;
# presumably the train split also holds rows without a 0/1 label -- verify the
# distinct `label` values before fitting a classifier on it.
kept_columns = ["review", "label"]
traindf = df.loc[df["type"] == "train", kept_columns]
testdf = df.loc[df["type"] == "test", kept_columns]
for split in (traindf, testdf):
    print(len(split))

75000
25000


In [5]:
# Features are stored as token lists, not raw strings.
#
# Every later consumer walks each element of X_train / X_test item by item
# (`for w in s` when building the vocabulary and when encoding sentences, and
# Word2Vec.build_vocab / train, which expect an iterable of token lists).
# A raw review string would therefore be consumed one *character* at a time,
# producing a character vocabulary and character embeddings. Tokenising once
# here (lower-case, split on whitespace) makes all of those cells operate on
# words without any further change.
#
# `.values` is kept so the results are still NumPy arrays (object dtype, one
# list of tokens per review); labels are unchanged.
X_train = traindf["review"].str.lower().str.split().values
y_train = traindf["label"].values
X_test = testdf["review"].str.lower().str.split().values
y_test = testdf["label"].values

In [6]:
# Collect the lower-cased vocabulary of the training data.
# NOTE(review): each `s` is iterated directly, so this yields words only if the
# elements of X_train are token sequences; with raw strings it yields single
# characters -- confirm what X_train holds.
words = {w.lower() for s in X_train for w in s}

# Ids start at 2 because 0 and 1 are reserved below. The vocabulary is sorted
# before numbering: iteration order of a set of strings changes between
# interpreter sessions (string hash randomisation), which would otherwise give
# every token a different id after each kernel restart and silently invalidate
# any saved embedding matrix or model.
word2index = {w: i for i, w in enumerate(sorted(words), start=2)}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs

In [7]:
import tensorflow as tf
import keras
from keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
import multiprocessing
import os

# Combined number of rows in the train and test frames.
total_reviews = sum(map(len, (traindf, testdf)))

Using TensorFlow backend.


In [11]:
# Word2Vec hyper-parameters

# Values for the training-algorithm switch
skipgram = 1
cbow = 0

# Embedding width; more dimensions are more computationally expensive to train
EMB_DIM = 150
context_size = 7
min_word_count = 1
downsampling = 1e-3

# Start and end learning rate are the same value, i.e. the rate is held fixed
learning_rate = 0.025
min_learning_rate = 0.025

num_epoch = 15

# One worker per CPU reported by the machine
workers = multiprocessing.cpu_count()

In [12]:
# Gather the constructor arguments first, then build the (still untrained)
# model. No corpus is passed here, so vocabulary building and training are
# left to explicit later calls.
# NOTE(review): `size` is the keyword used by older gensim releases; newer ones
# appear to call it `vector_size` -- confirm against the installed version.
w2v_kwargs = {
    "sg": skipgram,
    "hs": 1,  # hierarchical softmax
    "size": EMB_DIM,
    "min_count": min_word_count,
    "workers": workers,
    "window": context_size,
    "sample": downsampling,
    "alpha": learning_rate,
    "min_alpha": min_learning_rate,
}
w2v = Word2Vec(**w2v_kwargs)

In [13]:
# Fit the embeddings on the training reviews.
# NOTE(review): gensim expects an iterable of token lists here -- confirm the
# elements of X_train are token lists rather than raw strings.
# NOTE(review): `num_epoch` is declared with the other hyper-parameters but the
# literal 5 is what is used below -- confirm which one is intended.
w2v.build_vocab(X_train)
w2v.train(X_train, epochs=5, total_examples=w2v.corpus_count)
words = list(w2v.wv.vocab)
print('Vocabulary size: %d' % len(words))

# save model in ASCII (word2vec) format
filename = 'embedding_word2vec.txt'
w2v.wv.save_word2vec_format(filename, binary=False)

# Read the vectors back into a {token: float32 vector} lookup.
# * The same `filename` is reused for reading, so the path opened is exactly
#   the path written (no stray whitespace in the name).
# * `with` closes the file even if parsing raises.
embeddings_index = {}
with open(filename, encoding="utf-8") as f:
    # First line of the text format is a header "<vocab size> <vector size>",
    # not a token; consume it and use it to know how many numbers to expect.
    header = f.readline().split()
    vector_size = int(header[1])
    for line in f:
        # Split from the right so a token that itself contains whitespace
        # stays intact in fields[0].
        fields = line.rstrip('\n').rsplit(' ', vector_size)
        if len(fields) != vector_size + 1:
            continue  # malformed / truncated line
        # Explicit dtype: without it the array would hold strings.
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')


KeyboardInterrupt: 

In [None]:
def _encode_sentences(sentences, vocab, oov_key='-OOV-'):
    """Turn each sentence into a list of integer ids.

    Every item of a sentence is lower-cased and looked up in `vocab`;
    items that are missing map to the id stored under `oov_key`.

    NOTE(review): a sentence is iterated directly, so items are words only if
    the sentences are token sequences -- raw strings would be encoded
    character by character.

    Args:
        sentences: iterable of sentences, each an iterable of strings.
        vocab: dict mapping token -> integer id; must contain `oov_key`.
        oov_key: key in `vocab` whose id stands for unknown tokens.

    Returns:
        A list with one list of ints per input sentence.

    Raises:
        KeyError: if `oov_key` is not present in `vocab`.
    """
    oov_index = vocab[oov_key]
    return [[vocab.get(w.lower(), oov_index) for w in s] for s in sentences]


# --- Embedding matrix: row i holds the vector of the token with id i ---------
# Size is kept at len(word2index) + 1 so code that relies on `num_words` sees
# the same value as before (assuming ids run 0..len(word2index)-1, the last row
# simply stays zero).
num_words = len(word2index) + 1
embedding_matrix = np.zeros((num_words, EMB_DIM))
# Report only the size; printing the whole mapping floods the cell output.
print('word2index size: %d' % len(word2index))
for word, i in word2index.items():
    if i >= num_words:
        # `>=`, not `>`: an id equal to num_words is already out of range.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue  # no trained vector -> row stays all-zero
    # Coerce to the matrix dtype (the lookup may hold numeric text) and ignore
    # entries whose length does not match the embedding width.
    embedding_vector = np.asarray(embedding_vector, dtype=embedding_matrix.dtype)
    if embedding_vector.shape == (EMB_DIM,):
        embedding_matrix[i] = embedding_vector

# --- Integer-encode both splits with the same vocabulary ---------------------
train_sentences_X = _encode_sentences(X_train, word2index)
test_sentences_X = _encode_sentences(X_test, word2index)

