In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re
from nltk.stem import PorterStemmer
import numpy as np

In [None]:
import warnings

# Ignore every warning category raised by the cells that follow, so their
# outputs are not interleaved with library warnings.
warnings.simplefilter("ignore")

In [None]:
# Read the training reviews and the test reviews (the latter file ships
# without labels) into DataFrames.
data = pd.read_csv(filepath_or_buffer='imdb_train.csv')
test = pd.read_csv(filepath_or_buffer='imdb_test_without_labels.csv')

In [None]:
data.head()

Unnamed: 0,id,review,sentiment
0,34006,"We all create our own reality, or do we? That ...",1
1,10965,dear god where do i begin. this is bar none th...,1
2,9471,(spoilers)<br /><br />I was blown away by this...,1
3,18392,"Do not see this film. In most cases, such as t...",0
4,22981,The only reason I'm giving this a 9 is that th...,1


In [None]:
test.head()

Unnamed: 0,id,review
0,33527,A must see movie for anyone who ever went to c...
1,23058,The world of the Dragon Hunters is a 3D gravit...
2,21058,"As I was watching this movie I was thinking,OK..."
3,31732,...this is a classic with so many great dialog...
4,17285,This 1986 Italian-French remake of the 1946 fi...


In [None]:
# Work on independent copies of the text column. The cleaning cells below
# assign back into these frames, and writing into a bare `data[["review"]]`
# selection is what makes pandas raise SettingWithCopyWarning.
data_review = data[["review"]].copy()
test_review = test[["review"]].copy()

In [None]:
data_review.head()

Unnamed: 0,review
0,"We all create our own reality, or do we? That ..."
1,dear god where do i begin. this is bar none th...
2,(spoilers)<br /><br />I was blown away by this...
3,"Do not see this film. In most cases, such as t..."
4,The only reason I'm giving this a 9 is that th...


In [None]:
test_review.head()

Unnamed: 0,review
0,A must see movie for anyone who ever went to c...
1,The world of the Dragon Hunters is a 3D gravit...
2,"As I was watching this movie I was thinking,OK..."
3,...this is a classic with so many great dialog...
4,This 1986 Italian-French remake of the 1946 fi...


Lowercasing


In [None]:
# Normalise case on both splits: training reviews first, then test reviews.
for reviews in (data_review, test_review):
    reviews["review"] = reviews["review"].str.lower()


Remove Extra Whitespace

In [None]:
def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace disappears as well, because
    ``str.split()`` without arguments never yields empty fields.
    """
    words = text.split()
    return " ".join(words)

# Apply to the training reviews, then to the test reviews.
for reviews in (data_review, test_review):
    reviews["review"] = reviews["review"].apply(remove_whitespace)

In [None]:
test_review.head()

Unnamed: 0,review
0,a must see movie for anyone who ever went to c...
1,the world of the dragon hunters is a 3d gravit...
2,"as i was watching this movie i was thinking,ok..."
3,...this is a classic with so many great dialog...
4,this 1986 italian-french remake of the 1946 fi...


Tokenization


In [None]:
# Recent NLTK releases (3.9 and later) load the sentence tokenizer from the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so fetch both
# to keep word_tokenize working whichever NLTK version is installed.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# word_tokenize takes a single string, so it can be handed to apply()
# directly without a lambda wrapper.
data_review["review"] = data_review["review"].apply(word_tokenize)
test_review["review"] = test_review["review"].apply(word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
test_review.head()

Unnamed: 0,review
0,"[a, must, see, movie, for, anyone, who, ever, ..."
1,"[the, world, of, the, dragon, hunters, is, a, ..."
2,"[as, i, was, watching, this, movie, i, was, th..."
3,"[..., this, is, a, classic, with, so, many, gr..."
4,"[this, 1986, italian-french, remake, of, the, ..."


Removing Stopwords

In [None]:
nltk.download('stopwords')
# NLTK's English stop-word list plus the extra token 'br' -- presumably there
# to drop what is left of the "<br />" tags visible in the raw reviews.
en_stopwords = stopwords.words('english') + ['br']
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single review.
    stop_words : iterable of str, optional
        Words to drop. Defaults to the module-level ``en_stopwords`` list.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Materialise the stop words as a set once per call so that each token
    # lookup is a hash probe instead of a linear scan over the whole list.
    blocked = set(en_stopwords if stop_words is None else stop_words)
    return [token for token in text if token not in blocked]

# Filter stop words out of the training tokens, then out of the test tokens.
for reviews in (data_review, test_review):
    reviews["review"] = reviews["review"].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
test_review.head()

Unnamed: 0,review
0,"[must, see, movie, anyone, ever, went, camp, ,..."
1,"[world, dragon, hunters, 3d, gravity, challeng..."
2,"[watching, movie, thinking, ,, ok, 'll, get, g..."
3,"[..., classic, many, great, dialogs, scenes, n..."
4,"[1986, italian-french, remake, 1946, film, nam..."


Removing Punctuation

In [None]:
# Compiled once at definition time; the per-row apply() calls all share it
# instead of building a new tokenizer object for every review.
_WORD_PATTERN = re.compile(r"\w+")


def remove_punct(text):
    """Strip punctuation from a list of tokens.

    The tokens are joined with spaces and every maximal run of word
    characters (letters, digits, underscore) is extracted again. Tokens made
    only of punctuation vanish, and tokens with embedded punctuation are
    split, e.g. ``"italian-french"`` becomes ``"italian"`` and ``"french"``.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single review.

    Returns
    -------
    list of str
        The punctuation-free tokens, in order of appearance.
    """
    return _WORD_PATTERN.findall(' '.join(text))

# Drop punctuation from the training tokens, then from the test tokens.
for reviews in (data_review, test_review):
    reviews["review"] = reviews["review"].apply(remove_punct)


In [None]:
test_review.head()

Unnamed: 0,review
0,"[must, see, movie, anyone, ever, went, camp, w..."
1,"[world, dragon, hunters, 3d, gravity, challeng..."
2,"[watching, movie, thinking, ok, ll, get, good,..."
3,"[classic, many, great, dialogs, scenes, nobody..."
4,"[1986, italian, french, remake, 1946, film, na..."


Removing Short Words and Numbers

In [None]:
def remove_small_words_and_numbers(text):
    """Keep only tokens longer than two characters that are not all digits.

    ``text`` is an iterable of string tokens; the survivors are returned as
    a new list in their original order.
    """
    return [
        token
        for token in text
        if not (len(token) <= 2 or token.isdigit())
    ]

In [None]:
# Remove short tokens and pure numbers from the training set, then the test set.
for reviews in (data_review, test_review):
    reviews["review"] = reviews["review"].apply(remove_small_words_and_numbers)

In [None]:
test_review.head()

Unnamed: 0,review
0,"[must, see, movie, anyone, ever, went, camp, w..."
1,"[world, dragon, hunters, gravity, challenged, ..."
2,"[watching, movie, thinking, get, good, moment,..."
3,"[classic, many, great, dialogs, scenes, nobody..."
4,"[italian, french, remake, film, name, turns, h..."


In [None]:
# Split each frame into consecutive row batches (10 for test, 100 for train).
# np.array_split() is applied to the row positions rather than to the
# DataFrame itself: splitting a DataFrame directly goes through the
# deprecated DataFrame.swapaxes. The .copy() makes every batch an independent
# frame, so later column assignments do not write into a slice.
test_batches = [
    test_review.iloc[rows].copy()
    for rows in np.array_split(np.arange(len(test_review)), 10)
]
data_batches = [
    data_review.iloc[rows].copy()
    for rows in np.array_split(np.arange(len(data_review)), 100)
]

Stemming

In [None]:
 def stemming(text, stemmer=None):
     """Return the stem of every token in ``text``.

     Parameters
     ----------
     text : iterable of str
         Tokens of a single review.
     stemmer : object with a ``stem(word)`` method, optional
         Stemmer to use. Defaults to a shared ``PorterStemmer`` instance.

     Returns
     -------
     list of str
         One stem per input token, in the same order.
     """
     if stemmer is None:
         # Build the default PorterStemmer on first use and keep it on the
         # function object, so the per-row apply() calls share one instance
         # instead of constructing a new stemmer for every review.
         stemmer = getattr(stemming, "_default_stemmer", None)
         if stemmer is None:
             stemmer = stemming._default_stemmer = PorterStemmer()
     return [stemmer.stem(word) for word in text]



In [None]:
# Stem the ten test batches one after another; each batch frame is updated in place.
for batch in test_batches[:10]:
    batch["review"] = batch["review"].apply(stemming)



In [None]:
# Stem the hundred training batches one after another; each batch frame is updated in place.
for batch in data_batches[:100]:
    batch["review"] = batch["review"].apply(stemming)


In [None]:
test_batches[0].head()

Unnamed: 0,review
0,"[must, see, movi, anyon, ever, went, camp, wan..."
1,"[world, dragon, hunter, graviti, challeng, wor..."
2,"[watch, movi, think, get, good, moment, wrong,..."
3,"[classic, mani, great, dialog, scene, nobodi, ..."
4,"[italian, french, remak, film, name, turn, hea..."


In [None]:
test_batches[1].head()

Unnamed: 0,review
1000,"[movi, classic, part, countri, film, small, to..."
1001,"[disney, bill, paxton, fine, job, convey, stor..."
1002,"[hyster, thing, movi, accord, director, diffic..."
1003,"[expect, see, lot, bath, alt, oklar, promis, t..."
1004,"[terribl, way, could, even, begin, consid, fun..."


In [None]:
test_batches[1].shape

(1000, 1)

In [None]:
# Collect the ten stemmed test batches and stitch them back into one frame.
processed_test_list = [test_batches[i] for i in range(10)]
processed_test = pd.concat(processed_test_list)


In [None]:
processed_test.shape

(10000, 1)

In [None]:
processed_test.head()

Unnamed: 0,review
0,"[must, see, movi, anyon, ever, went, camp, wan..."
1,"[world, dragon, hunter, graviti, challeng, wor..."
2,"[watch, movi, think, get, good, moment, wrong,..."
3,"[classic, mani, great, dialog, scene, nobodi, ..."
4,"[italian, french, remak, film, name, turn, hea..."


In [None]:
# Collect the hundred stemmed training batches and stitch them back into one frame.
processed_data_list = [data_batches[i] for i in range(100)]
processed_data = pd.concat(processed_data_list)


In [None]:
processed_data.shape

(40000, 1)

In [None]:
def backtostring(text):
    """Rebuild one space-separated string from an iterable of string tokens."""
    return ' '.join(text)

# Turn the token lists back into plain strings: test frame first, then training frame.
for frame in (processed_test, processed_data):
    frame["review"] = frame["review"].apply(backtostring)


In [None]:
processed_test.head()

Unnamed: 0,review
0,must see movi anyon ever went camp want film c...
1,world dragon hunter graviti challeng world pla...
2,watch movi think get good moment wrong real be...
3,classic mani great dialog scene nobodi miss ni...
4,italian french remak film name turn heat earli...


In [None]:
# Replace the raw review text in the original frames with the processed text.
# Column assignment aligns on the index.
for target, cleaned in ((test, processed_test), (data, processed_data)):
    target['review'] = cleaned['review']


In [None]:
test.head() 

Unnamed: 0,id,review
0,33527,must see movi anyon ever went camp want film c...
1,23058,world dragon hunter graviti challeng world pla...
2,21058,watch movi think get good moment wrong real be...
3,31732,classic mani great dialog scene nobodi miss ni...
4,17285,italian french remak film name turn heat earli...


In [None]:
data.head()

Unnamed: 0,id,review,sentiment
0,34006,creat realiti core question behind highli orig...,1
1,10965,dear god begin bar none best movi ever seen ca...,1
2,9471,spoiler blown away movi rent movielink bit dec...,1
3,18392,see film case grudg japanes film infinit bette...,0
4,22981,reason give kid actor play tadashi tormentor j...,1


In [None]:
# Persist both processed frames as CSV, leaving out the index column.
for frame, out_path in ((test, 'processed_test.csv'), (data, 'processed_data.csv')):
    frame.to_csv(out_path, index=False)
