<img src='https://training.dwit.edu.np/frontend/images/computer-training-institute.png'>
<h1>Data Science and Machine learning in Python</h1>
<h3>Instructor: <a href='https://www.kaggle.com/atishadhikari'> Atish Adhikari</a></h3>
<hr>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
import gensim
import nltk

In [6]:
data = pd.read_csv("datasets/imdb_labelled.txt", sep="\t", header=None, names=["sentence", "sentiment"])

In [7]:
data.head()

Unnamed: 0,sentence,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [23]:
X = []

stop_words = set(nltk.corpus.stopwords.words("english"))
punctuations = ["!", "?", ",", "'", '"', ";", ":", "."]


for comment in data["sentence"].values:
    comment = comment.lower()
    sentence = nltk.tokenize.sent_tokenize(comment)
    for sen in sentence:
        words = nltk.tokenize.word_tokenize(sen)
        #remove punctuations
        filtered_words = [w for w in words if w not in punctuations]
            
    X.append(filtered_words)    

In [32]:
word2vec = gensim.models.Word2Vec(sentences=X, size=100, window=5)

In [28]:
len(word2vec.wv.vocab)

320

In [123]:
word2vec.most_similar("bad")
#not well formed because of less data

  """Entry point for launching an IPython kernel.


[('that', 0.9997064471244812),
 ('this', 0.9996724128723145),
 ('of', 0.9996682405471802),
 ('is', 0.9996535778045654),
 ('in', 0.9996533393859863),
 ('the', 0.9996521472930908),
 ('for', 0.9996495842933655),
 ('i', 0.9996490478515625),
 ('and', 0.9996442198753357),
 ('to', 0.999636709690094)]

In [33]:
#convert to wordtovec as gensim only has word2vec-modal
gensim.scripts.glove2word2vec.glove2word2vec(glove_input_file="large_datasets/glove.6B.200d.txt", word2vec_output_file="large_datasets/word2vec.6B.200d.txt")

(400000, 200)

In [34]:
#load word2vev modal
word2vec_modal = gensim.models.KeyedVectors.load_word2vec_format("large_datasets/word2vec.6B.200d.txt")

In [127]:
word2vec_modal.most_similar("nepal")

[('bhutan', 0.7069970965385437),
 ('kathmandu', 0.702731728553772),
 ('nepali', 0.7026580572128296),
 ('nepalese', 0.6685097813606262),
 ('bangladesh', 0.6385263204574585),
 ('maoist', 0.6186525225639343),
 ('cambodia', 0.6165409088134766),
 ('india', 0.6137661933898926),
 ('laos', 0.6054975390434265),
 ('myanmar', 0.5960168838500977)]

In [36]:
word2vec_modal.most_similar(positive=["king", "woman"], negative=["man"])

[('queen', 0.6978678703308105),
 ('princess', 0.6081745028495789),
 ('monarch', 0.5889754891395569),
 ('throne', 0.5775108933448792),
 ('prince', 0.5750998854637146),
 ('elizabeth', 0.546359658241272),
 ('daughter', 0.5399125814437866),
 ('kingdom', 0.5318052768707275),
 ('mother', 0.5168544054031372),
 ('crown', 0.5164472460746765)]

#### Data Preparation

In [48]:
X = data["sentence"].values

In [49]:
X[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [76]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#### Represent every word by unique numerical token

In [99]:
tokenizer = Tokenizer()

In [100]:
tokenizer.fit_on_texts(X)

In [128]:
#Preserve mapping of word -> token in a variable for later
word_index = tokenizer.word_index

In [102]:
X_tokenized = tokenizer.texts_to_sequences(X)

In [131]:
#orginal and tokenized form
print(X[0])
print(X_tokenized[0])

A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  
[3, 28, 28, 28, 287, 407, 1216, 12, 37, 3, 1217, 1218, 408, 143]


In [103]:
#retrieving token for a word
word_index["very"]

28

In [70]:
comment_length = [len(x) for x in X_tokenized]

In [72]:
#checking length of each comments in imdb
for i in comment_length:
    print(i, end=",")

14,18,29,8,21,20,3,15,3,10,6,15,11,4,16,20,21,25,17,872,12,16,8,12,5,5,11,5,17,4,5,3,8,16,17,19,12,24,34,7,23,11,9,3,2,16,13,11,11,26,9,25,12,5,12,12,24,9,4,7,26,9,8,1,2,7,17,18,16,37,19,10,6,11,17,17,23,11,20,20,21,22,11,13,23,19,5,9,14,4,12,4,3,5,1,11,17,5,6,6,20,1,14,27,16,35,23,20,16,21,19,11,7,6,3,13,5,6,4,6,11,9,10,7,2,9,4,5,5,34,20,11,6,18,9,200,1400,45,28,8,5,11,16,6,19,10,7,7,6,302,3,15,6,11,3,12,24,20,9,29,22,14,18,13,7,4,18,8,29,15,12,17,7,12,23,21,13,43,13,25,31,19,21,10,16,15,10,13,15,16,17,29,55,7,44,16,33,15,20,16,13,12,12,16,21,17,11,4,3,5,7,9,12,5,9,8,12,36,11,4,4,14,3,69,43,15,20,21,8,15,57,8,36,31,22,24,12,34,5,33,15,35,8,20,15,12,8,14,20,14,9,25,5,8,13,6,7,16,16,12,12,20,22,11,24,16,7,10,9,9,18,45,5,12,19,20,15,10,44,6,11,6,19,25,34,24,28,15,13,23,24,21,24,12,7,2,26,33,12,8,15,11,10,20,10,8,12,11,11,14,8,20,19,4,6,15,13,6,21,25,16,8,7,14,24,22,31,9,14,28,8,9,11,4,16,18,9,14,24,18,5,8,11,27,33,20,18,19,21,25,9,26,11,10,6,19,19,5,15,16,7,32,21,7,4,6,12,13,5,19,13,23,9

#### Pad/Threshold all comments to a fixed size

In [75]:
#thresholding max comment length to 35
max_len = 35

In [78]:
#Padding the comments with less words, thresholding comments with many words
X_padded = pad_sequences(X_tokenized, maxlen=max_len)

In [133]:
print(X[0])
print()
print(X_padded[0])

A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    3   28   28   28  287  407 1216
   12   37    3 1217 1218  408  143]


#### Get word -> vector mapping from file

In [82]:
# Convert Glove file to embedding matrix 
#Get word -> vector mapping from file
embeddings_index = {}
f = open('large_datasets/glove.6B.200d.txt', encoding='UTF-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [134]:
#Get word -> vector mapping from file for sample 'very'
embeddings_index["very"]

array([ 5.1827e-01,  1.9440e-02,  8.9215e-03, -1.7083e-01, -1.5074e-01,
        4.4691e-02, -8.4464e-01,  3.3799e-01,  7.0777e-01, -2.4093e-01,
        4.0375e-01,  5.4616e-01, -1.8541e-01, -6.6753e-02,  3.8632e-01,
        5.3156e-01, -1.2843e-01,  3.7425e-01,  3.3350e-01, -5.2774e-01,
        2.4610e-01,  2.5388e+00, -8.1737e-02, -2.1850e-02,  2.8038e-01,
       -2.2604e-01,  7.2170e-02,  5.3669e-01,  9.0519e-02, -3.3273e-01,
        4.3203e-02,  3.1065e-02, -5.1937e-02,  3.0823e-01,  2.5642e-01,
       -3.4948e-01, -1.0253e+00, -5.8867e-01, -4.9620e-01,  3.8668e-01,
       -5.1096e-01, -7.8659e-02, -3.3842e-01,  5.1494e-01, -3.3330e-01,
       -5.6195e-01,  7.7807e-01,  2.1362e-01, -1.5577e-02, -1.7699e-01,
       -2.4518e-01, -2.1210e-01, -3.8734e-01,  4.8660e-01,  4.6362e-01,
       -3.1572e-01,  1.6519e-01, -2.6461e-01, -4.1045e-01, -3.9097e-01,
        3.2260e-01,  1.5388e-01, -2.8839e-01,  3.3369e-01, -2.6836e-01,
        2.4708e-01,  1.8982e-01,  4.2192e-01, -6.7222e-02,  3.26

#### Create a matrix with embedding vector at corresponding index

In [84]:
# word_index from earlier is used here....

#Uses 0 for unknown words and padded 0s, so size is vocab_size + 1
embedding_matrix = np.zeros((len(word_index) + 1, 200))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [85]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [87]:
rnn = Sequential()

#pass embedding matrix for embedding layer, make it non-trainable
rnn.add(Embedding(input_dim=(len(word_index) + 1), output_dim=200, input_length=max_len, 
                   weights=[embedding_matrix], trainable=False ))
        
rnn.add(LSTM(units=100))
rnn.add(Dense(units="50", activation="relu"))
rnn.add(Dense(units="1", activation="sigmoid"))

rnn.compile(loss="binary_crossentropy", metrics=["acc"], optimizer="adam")

In [89]:
y = data["sentiment"].values

In [91]:
rnn.fit(X_padded, y, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x150f4d2d978>

#### Test on random sample

In [118]:
sentences = [
    "very good movie",
    "worst movie ever"
]

#### Pre-process

In [119]:
sent_tokens = tokenizer.texts_to_sequences(sentences)

In [120]:
sentences_paded = pad_sequences(sent_tokens, maxlen=max_len)

In [121]:
sentences_paded

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,  28,  32,  12],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0, 176,  12,  65]])

#### Predict sentiment ( 1 => Good comment)

In [122]:
rnn.predict(sentences_paded)

array([[0.98274153],
       [0.02111399]], dtype=float32)