# Preprocessing

In [184]:
import sklearn
import pandas as pd
import numpy as np
import math
import nltk
import matplotlib.pyplot as plt

In [None]:
# Read the review table from the CSV file that sits next to the notebook.
REVIEWS_CSV = "reviewparagraphs.csv"
movie_by_review = pd.read_csv(REVIEWS_CSV)

In [185]:
#Mapping of pos_tags to wordnet lemmatizer via 
#https://stackoverflow.com/questions/61982023/using-wordnetlemmatizer-lemmatize-with-pos-tags-throws-keyerror
from nltk.corpus import wordnet
def get_wordnet_pos(tag):
    """Map a POS tag string to the corresponding WordNet part-of-speech constant.

    Only the first character of ``tag`` is inspected:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Returns:
        The matching ``wordnet`` constant, or ``None`` when the tag starts
        with any other character (or is empty).
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # tag[:1] is '' for an empty tag, which is not a key, so None is returned
    return prefix_to_pos.get(tag[:1])

In [186]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()

# Build the stop-word set once; it is the same for every review.
stop_words = set(stopwords.words('english'))

# Named `review_texts` rather than `list` so the builtin `list` is not shadowed.
review_texts = movie_by_review['text']
n_reviews = len(review_texts)

# Print progress roughly every 10% of the rows. max(1, ...) keeps the modulo
# below from dividing by zero when the frame has fewer than 10 rows.
progress_step = max(1, n_reviews // 10)

text_clean = []

# for each review text in the data frame
for count, review in enumerate(review_texts):
    if count % progress_step == 0:
        print('Progress: '+str(math.ceil(count*100/n_reviews))+"%")

    # tokenizing (str() so that non-string cells are still accepted)
    tokens = nltk.word_tokenize(str(review))

    # pos-tagging, then lemmatizing each token with its mapped WordNet POS
    kept_words = []
    for word, tag in nltk.pos_tag(tokens):
        wn_tag = get_wordnet_pos(tag)
        lemma = word if wn_tag is None else lemmatizer.lemmatize(word, wn_tag)

        # Drop stop words and any token that is not purely alphabetic.
        # Filtering into a new list (instead of calling .remove() on the list
        # being iterated) ensures that consecutive stop words are all removed.
        if lemma not in stop_words and lemma.isalpha():
            kept_words.append(lemma)

    text_clean.append(' '.join(kept_words))

# add the cleaned text as a new column on the data frame
movie_by_review['cleaned_text'] = text_clean

Progress: 0%
Progress: 10%
Progress: 20%
Progress: 30%
Progress: 40%
Progress: 50%
Progress: 60%
Progress: 70%
Progress: 80%
Progress: 90%


In [190]:
movie_by_review.head(5)

Unnamed: 0.1,Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag,cleaned_text
0,0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos,film adapt comic book plenty success whether s...
1,25,0,cv001,18431,0,every now and then a movie comes along from a ...,pos,every movie come along suspect studio every in...
2,64,0,cv002,15918,0,you've got mail works alot better than it dese...,pos,get mail work alot good deserve order make fil...
3,83,0,cv003,11664,0,""" jaws "" is a rare film that grabs your attent...",pos,jaw rare film grab attention show single image...
4,125,0,cv004,11636,0,moviemaking is a lot like being the general ma...,pos,moviemaking lot like general manager nfl team ...


In [192]:
from sklearn.model_selection import train_test_split             
from tensorflow.keras.preprocessing.text import Tokenizer                    
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Labels: the string '1' where the review is tagged "pos", '0' for every other tag.
is_positive = movie_by_review['tag'] == "pos"
y = np.where(is_positive, '1', '0')

# Model inputs: the cleaned review texts as a plain array.
sentences = movie_by_review['cleaned_text'].values


In [194]:
y

array(['1', '1', '1', ..., '0', '0', '0'], dtype='<U1')

In [195]:

# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable.
(sentences_train, sentences_test,
 y_train, y_test) = train_test_split(sentences, y, test_size=0.20, random_state=265)

# Turn the label arrays into plain lists of Python strings.
y_train = list(map(str, y_train))
y_test = list(map(str, y_test))


In [196]:

# Upper bound on the vocabulary used when encoding texts (Tokenizer's num_words).
MAX_VOCAB_WORDS = 31347

# The word index is fitted on the training texts only.
tokenizer = Tokenizer(num_words=MAX_VOCAB_WORDS)
tokenizer.fit_on_texts(sentences_train)


In [197]:

# Encode both splits with the tokenizer fitted on the training texts.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (sentences_train, sentences_test)
)


In [198]:

# Index 0 is reserved and never assigned to a word, hence the extra slot.
vocab_size = 1 + len(tokenizer.word_index)

# Every encoded review is brought to this fixed length.
maxlen = 500

# padding='pre' puts the padding in front of shorter sequences.
pad_options = dict(padding='pre', maxlen=maxlen)
X_train = pad_sequences(X_train, **pad_options)
X_test = pad_sequences(X_test, **pad_options)

In [199]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a whitespace-separated word-vector text file.

    Each non-empty line of the file is expected to be ``word v1 v2 ... vk``.
    For every word that appears in ``word_index``, the first ``embedding_dim``
    values of its vector are stored in the row given by ``word_index[word]``.
    Words missing from the file keep an all-zero row.

    Args:
        filepath: Path to the vector file (read as UTF-8 text).
        word_index: Mapping from word to its integer row index.
        embedding_dim: Number of columns in the returned matrix.

    Returns:
        A ``numpy`` array of shape ``(len(word_index) + 1, embedding_dim)``.

    Raises:
        ValueError: If a matching word's vector has fewer than
            ``embedding_dim`` values, or a value cannot be parsed as a float.
    """
    # Adding 1 because index 0 is reserved and never assigned to a word
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Explicit encoding: without it open() falls back to the platform default,
    # which can fail on non-ASCII tokens on systems where that is not UTF-8.
    with open(filepath, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline at end of file): nothing to read
                continue

            word, *vector = parts
            if word not in word_index:
                continue

            if len(vector) < embedding_dim:
                # Fail with a readable message instead of a NumPy broadcast error
                # (or a silent scalar broadcast when the vector has one value).
                raise ValueError(
                    "line %d of %s: vector for %r has %d values, "
                    "but embedding_dim is %d"
                    % (line_no, filepath, word, len(vector), embedding_dim)
                )

            idx = word_index[word]
            # Longer vectors are truncated to the requested width
            embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [200]:
# The matrix width must not exceed the length of the vectors in the file:
# rows with fewer values than embedding_dim cannot be assigned and the call fails.
# NOTE(review): 50 assumes the file name ("50d") reflects its vector length — confirm.
embedding_dim = 50
embedding_matrix = create_embedding_matrix('data/glove_word_embeddings/glove.6B.50d.txt',tokenizer.word_index,embedding_dim)

In [201]:
# Convert the string labels to integer NumPy arrays for training.
y_train = np.array(list(map(int, y_train)))
y_test = np.array(list(map(int, y_test)))

In [202]:
from keras.models import Sequential
from keras import layers
# Width of the word vectors learned by the embedding layer below.
embedding_dim = 100

# Embedding -> 1-D convolution -> global max pooling -> small dense head
# ending in a single sigmoid unit for the binary pos/neg label.
# NOTE(review): the Embedding layer is created without pretrained weights;
# the `embedding_matrix` built in an earlier cell is not used here.
model = Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# NOTE(review): the test split is passed as validation data, so the
# per-epoch validation metrics are computed on the test set.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=50,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
