In [1]:
# Third-party and standard-library dependencies for the whole notebook.
import gensim
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import itertools
import csv
from gensim.models import KeyedVectors
from sklearn.metrics import mean_squared_error
# Fetch the NLTK resources named below. NOTE(review): only word_tokenize is
# called in the visible cells; stopwords / WordNetLemmatizer are imported but
# not used there -- confirm whether the matching downloads are still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin.gz
/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin
/kaggle/input/d/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv
/kaggle/input/imdb-dataset-of-50k-movie-reviews/__results__.html
/kaggle/input/imdb-dataset-of-50k-movie-reviews/__resultx__.html
/kaggle/input/imdb-dataset-of-50k-movie-reviews/__notebook__.ipynb
/kaggle/input/imdb-dataset-of-50k-movie-reviews/__output__.json
/kaggle/input/imdb-dataset-of-50k-movie-reviews/custom.css
/kaggle/input/glove840b300dtxt/glove.840B.300d.txt
/kaggle/input/product-reviews-5000/product_reviews_5000.txt


In [43]:
# Load the IMDB review dataset that supplies both the training and testing examples.
file_path = "/kaggle/input/d/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
df = pd.read_csv(file_path)
reviews = df['review'].tolist()    # raw review texts
labels = df['sentiment'].tolist()  # sentiment strings, aligned with `reviews`

num_train = 3000 # number of training examples
num_test = 3000 # number of testing examples

# The first num_train rows are used for training; the next num_test rows for testing.
train_reviews = reviews[:num_train]
test_reviews = reviews[num_train:num_train+num_test]

# Encode each sentiment as 1 for 'positive' and 0 for anything else, then
# shape every label set as a column vector with one row per example.
train_labels = np.array(
    [1 if sentiment == 'positive' else 0 for sentiment in labels[:num_train]]
).reshape((num_train,1))

test_labels = np.array(
    [1 if sentiment == 'positive' else 0 for sentiment in labels[num_train:num_train+num_test]]
).reshape((num_test,1))

In [44]:
# Preprocess the training reviews for the neural net: lowercase each review,
# delete every character listed in string.punctuation, then tokenize it.
strip_punctuation = str.maketrans('', '', string.punctuation)  # deletion table, built once
lower_train = [review.lower() for review in train_reviews]
punc_train = [review.translate(strip_punctuation) for review in lower_train]
tokenized_train = [word_tokenize(review) for review in punc_train]

In [45]:
# Preprocess the testing reviews with the same steps as the training data:
# lowercase, delete every character listed in string.punctuation, tokenize.
strip_punctuation = str.maketrans('', '', string.punctuation)  # deletion table, built once
lower_test = [review.lower() for review in test_reviews]
punc_test = [review.translate(strip_punctuation) for review in lower_test]
tokenized_test = [word_tokenize(review) for review in punc_test]

In [46]:
# function for padding the reviews
def pad_sentences(tokenized_text, max_len, padding_val='<pad>'):
    """Force every token list to exactly ``max_len`` entries.

    Args:
        tokenized_text: iterable of token lists, one per review.
        max_len: target length of each returned list.
        padding_val: filler appended to lists shorter than ``max_len``.

    Returns:
        A new list of new lists; longer inputs are cut to their first
        ``max_len`` tokens, shorter ones are right-padded with ``padding_val``.
    """
    def fit_to_length(tokens):
        # Too long: keep only the leading max_len tokens.
        if len(tokens) > max_len:
            return tokens[:max_len]
        # Otherwise append as many fillers as are missing (possibly none).
        missing = max_len - len(tokens)
        return tokens + [padding_val] * missing

    return [fit_to_length(tokens) for tokens in tokenized_text]

In [48]:
# Bring every tokenized review to exactly max_len tokens (truncate or pad).
max_len = 200
padded_train, padded_test = [
    pad_sentences(token_lists, max_len, padding_val='<pad>')
    for token_lists in (tokenized_train, tokenized_test)
]

In [15]:
# Load the pretrained word2vec vectors (binary file) used to encode the reviews for the neural net.
word2vec_path = "/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin"
word_vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [16]:
def sentence_to_embedding(sentence,word_vectors,embedding_dim=300):
    """Represent a token sequence by the mean of its known word vectors.

    Args:
        sentence: iterable of tokens.
        word_vectors: mapping that supports ``token in word_vectors`` and
            ``word_vectors[token]``.
        embedding_dim: length of the zero vector returned when no token of
            ``sentence`` is found in ``word_vectors``.

    Returns:
        The element-wise mean of the vectors of all tokens present in
        ``word_vectors`` (tokens that are absent are skipped), or
        ``np.zeros(embedding_dim)`` if none is present.
    """
    known_vectors = []
    for token in sentence:
        if token in word_vectors:
            known_vectors.append(word_vectors[token])
    if len(known_vectors) == 0:
        return np.zeros(embedding_dim)
    return np.mean(known_vectors, axis=0)

In [58]:
# Encode the training data: one averaged word vector per padded review,
# stacked into an array and transposed so the reviews run along the columns.
embedded_train = np.array([
    sentence_to_embedding(sentence,word_vectors,embedding_dim=300)
    for sentence in padded_train
]).T

In [59]:
# Encode the testing data: one averaged word vector per padded review,
# stacked into an array and transposed so the reviews run along the columns.
embedded_test = np.array([
    sentence_to_embedding(sentence,word_vectors,embedding_dim=300)
    for sentence in padded_test
]).T

In [112]:
def init_params():
    """Create the initial weights and biases of the two-layer network.

    Returns:
        ``(w1, b1, w2, b2)`` where ``w1`` is (100, 300) and ``w2`` is (1, 100),
        both drawn from a standard normal and scaled by 0.1, and ``b1``
        (100, 1) and ``b2`` (1, 1) are all zeros.
    """
    hidden_units, input_dim = 100, 300
    # w1 is drawn before w2 so the sequence of random draws is unchanged.
    w1 = 0.1 * np.random.randn(hidden_units, input_dim)
    b1 = np.zeros((hidden_units, 1))
    w2 = 0.1 * np.random.randn(1, hidden_units)
    b2 = np.zeros((1, 1))
    return w1, b1, w2, b2

def ReLU(Z):
    """Rectified linear unit: element-wise maximum of 0 and ``Z``."""
    rectified = np.maximum(0, Z)
    return rectified

def sigmoid(Z):
    """Logistic sigmoid ``1 / (1 + exp(-Z))``, evaluated without overflow.

    The direct formula computes ``exp(-Z)``, which overflows (and emits a
    RuntimeWarning) once ``-Z`` exceeds roughly 709. The identity
    ``sigmoid(Z) == exp(-log(1 + exp(-Z)))`` lets ``np.logaddexp`` evaluate
    the logarithm safely for inputs of any magnitude.

    Args:
        Z: NumPy scalar or array of pre-activations.

    Returns:
        Element-wise sigmoid of ``Z``, with the same shape as ``Z``.
    """
    # logaddexp(0, -Z) == log(exp(0) + exp(-Z)) == log(1 + exp(-Z))
    return np.exp(-np.logaddexp(0, -Z))

def forward_prop(w1,b1,w2,b2,X):
    """Run one forward pass through the two-layer network.

    Args:
        w1, b1: weights and bias of the hidden layer.
        w2, b2: weights and bias of the output layer.
        X: input matrix that ``w1`` is multiplied with.

    Returns:
        ``(z1, a1, z2, a2)``: hidden pre-activation, hidden ReLU activation,
        output pre-activation and output sigmoid activation.
    """
    hidden_pre = np.dot(w1, X) + b1
    hidden_act = ReLU(hidden_pre)
    output_pre = np.dot(w2, hidden_act) + b2
    output_act = sigmoid(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act

def deriv_ReLU(Z):
    """Derivative of ReLU as a boolean mask: True where ``Z`` is strictly positive."""
    is_active = Z > 0
    return is_active

def back_prop(z1,a1,z2,a2,w1,b1,w2,b2,X,Y,num_train):
    """Compute parameter gradients for the squared-error loss.

    The bias gradients are summed over the example axis only (``axis=1``), so
    each hidden unit receives its own bias gradient. Summing over every entry
    of ``dz1`` would collapse them into a single scalar and apply the same
    update to every hidden bias.

    Args:
        z1, a1: hidden-layer pre-activation and activation from the forward pass.
        z2, a2: output-layer pre-activation and activation from the forward pass.
        w1, b1, w2, b2: current parameters (only ``w2`` is read here).
        X: input matrix used in the forward pass; examples are assumed to run
            along its columns, matching ``w1.dot(X)`` in the forward pass.
        Y: labels; transposed here to line up with ``a2``.
        num_train: number of examples the gradients are averaged over.

    Returns:
        ``(dw1, db1, dw2, db2)``. ``dw1`` / ``dw2`` have the shapes of
        ``w1`` / ``w2``; ``db1`` / ``db2`` are column vectors with one row per
        unit of their layer.
    """
    scale = 1 / num_train
    # Output layer: squared-error derivative chained through the sigmoid.
    dz2 = (a2 - Y.T) * a2 * (1 - a2)
    dw2 = scale * dz2.dot(a1.T)
    db2 = scale * np.sum(dz2, axis=1, keepdims=True)
    # Hidden layer: the ReLU derivative is 1 where z1 > 0 and 0 elsewhere.
    dz1 = w2.T.dot(dz2) * (z1 > 0)
    dw1 = scale * dz1.dot(X.T)
    db1 = scale * np.sum(dz1, axis=1, keepdims=True)
    return dw1,db1,dw2,db2
    
def update_params(w1,b1,w2,b2,dw1,db1,dw2,db2,alpha):
    """Take one gradient-descent step on every parameter.

    Args:
        w1, b1, w2, b2: current parameters.
        dw1, db1, dw2, db2: gradients matching those parameters.
        alpha: learning rate that scales each gradient.

    Returns:
        ``(w1, b1, w2, b2)`` after subtracting ``alpha`` times the gradient.
    """
    def step(param, grad):
        return param - alpha * grad

    return step(w1, dw1), step(b1, db1), step(w2, dw2), step(b2, db2)

In [111]:
def gradient_descent(x_train,y_train,iterations,alpha):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        x_train: input matrix with one example per column (the forward pass
            computes ``w1.dot(x_train)``).
        y_train: labels, passed unchanged to ``back_prop`` and transposed for
            the error report.
        iterations: number of update steps to run.
        alpha: learning rate.

    Returns:
        The trained parameters ``(w1, b1, w2, b2)``.
    """
    # Average gradients over the examples actually passed in, instead of a
    # notebook-level global that may not match x_train.
    num_examples = x_train.shape[1]
    w1,b1,w2,b2 = init_params()
    for i in range(iterations):
        z1,a1,z2,a2 = forward_prop(w1,b1,w2,b2,x_train)
        dw1,db1,dw2,db2 = back_prop(z1,a1,z2,a2,w1,b1,w2,b2,x_train,y_train,num_examples)
        w1,b1,w2,b2 = update_params(w1,b1,w2,b2,dw1,db1,dw2,db2,alpha)
        if (i % 1000 == 0):
            # a2 comes from the forward pass made before this step's update.
            mse = mean_squared_error(y_train.T,a2)
            print("iterations: ",i)
            print("error: ",mse)
    return w1, b1, w2, b2

In [62]:
def make_prediction(X,w1,b1,w2,b2):
    """Classify one embedded example as 'positive' or 'negative'.

    Args:
        X: embedded input handed to ``forward_prop``. The comparison below is
            used as a truth value, so the network output must hold a single
            element.
        w1, b1, w2, b2: network parameters.

    Returns:
        'positive' when the output activation exceeds 0.5, else 'negative'.
    """
    output_activation = forward_prop(w1,b1,w2,b2,X)[-1]
    return 'positive' if output_activation > 0.5 else 'negative'

In [63]:
def select_test_example(embedded_test,test_reviews,test_labels):
    """Pick one test example uniformly at random.

    The index is drawn from the number of columns of ``embedded_test`` and the
    column is reshaped by its own length, so every example can be selected and
    the function works for any test-set size and embedding dimension.

    Args:
        embedded_test: 2-D array with one embedded review per column.
        test_reviews: review texts, indexed like the columns of ``embedded_test``.
        test_labels: labels, indexed like the columns of ``embedded_test``.

    Returns:
        ``(embedded_example, current_label, current_review)`` where
        ``embedded_example`` is the chosen column as a (dim, 1) column vector.
    """
    num_examples = embedded_test.shape[1]
    k = np.random.randint(num_examples)
    # Keep the example two-dimensional so it can be fed to the network as-is.
    embedded_example = embedded_test[:,k].reshape((-1,1))
    current_label = test_labels[k]
    current_review = test_reviews[k]
    return embedded_example, current_label, current_review
    

In [113]:
# Train the network on the embedded training reviews.
iterations = 40000  # number of gradient-descent steps
alpha = 0.15        # learning rate
w1,b1,w2,b2 = gradient_descent(
    embedded_train,
    train_labels,
    iterations=iterations,
    alpha=alpha,
)

iterations:  0
error:  0.24985697236016813
iterations:  1000
error:  0.24289781719057954
iterations:  2000
error:  0.2280350676918345
iterations:  3000
error:  0.20272166116578524
iterations:  4000
error:  0.17617054783920427
iterations:  5000
error:  0.15684095016465074
iterations:  6000
error:  0.14464321950693151
iterations:  7000
error:  0.1367423272425751
iterations:  8000
error:  0.1312133066459932
iterations:  9000
error:  0.12705609607378854
iterations:  10000
error:  0.12375947782160639
iterations:  11000
error:  0.12104248752306423
iterations:  12000
error:  0.11873940393485305
iterations:  13000
error:  0.11674581550469501
iterations:  14000
error:  0.11498984455345336
iterations:  15000
error:  0.11342375936574284
iterations:  16000
error:  0.11201032195383626
iterations:  17000
error:  0.11072491431397088
iterations:  18000
error:  0.10954715212241478
iterations:  19000
error:  0.108460078874121
iterations:  20000
error:  0.10745154269880607
iterations:  21000
error:  0.10

In [68]:
def test_net(embedded_test,test_reviews,test_labels):
    """Print the network's prediction for one randomly selected test review.

    Uses the notebook-level parameters ``w1``, ``b1``, ``w2`` and ``b2``, so
    those must already be defined when this is called.

    Args:
        embedded_test: embedded test reviews passed to ``select_test_example``.
        test_reviews: raw test review texts.
        test_labels: test labels.
    """
    example, label, review = select_test_example(embedded_test,test_reviews,test_labels)
    print("prediction: ",make_prediction(example,w1,b1,w2,b2))
    print("actual label: ",label)
    print(review)

In [106]:
# Spot-check the network on one randomly selected test review.
test_net(embedded_test,test_reviews,test_labels)

prediction:  positive
actual label:  [1]
While a bit preachy on the topic of progress as the saving grace of mankind, this is still a stunning film that presages the science-fiction special effects blockbusters that would take another 40 years to arrive on the silver screen. It predicts the global chaos of WWII, but expands on the premise by having the conflict last 30 years, and then tells the epic tale of man's struggle out from under the rubble and into the wilds of space. The acting seems wooden and strangely sterile, but this is perhaps a result of its contrast with the visuals which must have been utterly breathtaking at the time of the movie's release, and which still impress today. This is a film not to be missed by anyone at all interested in the SF genre.


In [108]:
# Spot-check the network on one randomly selected test review.
test_net(embedded_test,test_reviews,test_labels)

prediction:  negative
actual label:  [0]
BEGIN SPOILER: Fitfully funny and memorable for Mr. Chong's literal roach-smoking scene: Chong coolly mashes a stray kitchen cockroach into his pipe's bowl, lights up, coughs and hacks violently for a seeming eternity,then with perfect aplomb and not skipping a beat, re-loads the bowl properly, re-lights, re-tokes. END SPOILER. Alas, I began to lose faith less than half-way through the proceedings. It occurred to me that the lackadaisical duo are way obnoxious and less than relatable. I have come to appreciate the relative sophistication of contemporary stoners, Harold and Kumar. I simply prefer brighter company. Yet, the movie is probably a perfect fit for baked frat bros or those viewers who are so feeble-minded as to be outwitted by a stoner when they-- the former are sober. Notable guest appearance by Paul Reubens spouting obscenities in pre-Pee-wee form.


In [109]:
# Spot-check the network on one randomly selected test review.
test_net(embedded_test,test_reviews,test_labels)

prediction:  positive
actual label:  [1]
The Impossible Planet and The Satan Pit together comprise the two best episodes of the 'new' Doctor Who's second season. Having said that, it should be obvious that much of the story basically transposes the plot of Quatermass and the Pit (1967) to an outer space setting, with the history of the universe intertwined with that of the Beast 666. These episodes cement the emotional ties between Rose and the Doctor, whilst also highlighting Rose's increasing self-confidence, establishing her as a not-quite-equal-yet-but-getting-there partner with our beloved Time Lord. Also of note is Matt Jones elegant screenplay, which decreases the occasional over-reliance on one-liners for the Doctor, and the performances of the entire cast, most notably the excellent Shaun Parkes as acting Captain Zachary Cross Flane.


In [110]:
# Spot-check the network on one randomly selected test review.
test_net(embedded_test,test_reviews,test_labels)

prediction:  negative
actual label:  [0]
I was looking forward to this flick. Being an old Robert E Howard fan, mainly from a Conan stand-point. <br /><br />I was not expecting a great deal and thought they could not mess it up too much.... Oh dear - how wrong was I....<br /><br />The main flaw was it was fairly dull. It needed to zip along with a nice helping of supernatural goings-on, sword-fights and the like.<br /><br />You got some gore, but everything else was just pretty life-less. The middle section just seemed to involve 40 minutes in a muddy forest with slow plodding horse-drawn carts and even slower dialogue and character development!<br /><br />On the plus side = Costumes and effects were fine, but not enough to keep your interest.<br /><br />I think it would have been better to tone down the gore, up the tempo, and go for a 12A rating. As a Ten Year old boy, I may have liked this movie. Probably about the age I was first reading the Conan stories funny enough. Perhaps that