In [1]:
import pandas as pd
eng_data = pd.read_csv("../data/IMDB Dataset.csv")

In [2]:
eng_data.shape

(50000, 2)

In [3]:
eng_data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
import re
import nltk

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def remove_special_characters(text, remove_digits = True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

def remove_stopwords(text, is_lower_case = False):
    tokenizer = ToktokTokenizer()
    stopword_list = nltk.corpus.stopwords.words('english')
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

def text_cleaning(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_special_characters(text, remove_digits = True)
    text = remove_stopwords(text, is_lower_case = False)
    return text

In [5]:
eng_data["review"][2]

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.<br /><br />This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.<br /><br />This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [6]:
strip_html(eng_data["review"][2])

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [7]:
eng_data["review"] = eng_data["review"].apply(text_cleaning)

In [8]:
eng_data["review"][1]

'wonderful little production filming technique unassuming oldtimeBBC fashion gives comforting sometimes discomforting sense realism entire piece actors extremely well chosen Michael Sheen got polari voices pat truly see seamless editing guided references Williams diary entries well worth watching terrificly written performed piece masterful production one great masters comedy life realism really comes home little things fantasy guard rather use traditional dream techniques remains solid disappears plays knowledge senses particularly scenes concerning Orton Halliwell sets particularly flat Halliwells murals decorating every surface terribly well done'

In [9]:
from nltk.tokenize import word_tokenize
vocab_lst = [word_tokenize(x) for x in eng_data["review"]]

In [10]:
from collections import Counter
vocab_lst2 = [y for x in vocab_lst for y in x]
Counter(vocab_lst2).most_common(10)

[('movie', 82310),
 ('film', 73514),
 ('one', 46301),
 ('like', 37483),
 ('good', 27403),
 ('would', 23751),
 ('time', 22741),
 ('really', 22207),
 ('see', 21765),
 ('even', 21494)]

In [11]:
vocab_lst3 = list(Counter(vocab_lst2).keys())
vocab_to_index = {word: index for index, word in enumerate(vocab_lst3)}
index_to_vocab = {index: word for index, word in enumerate(vocab_lst3)}

In [12]:
index_to_vocab

{0: 'One',
 1: 'reviewers',
 2: 'mentioned',
 3: 'watching',
 4: '1',
 5: 'Oz',
 6: 'episode',
 7: 'youll',
 8: 'hooked',
 9: 'right',
 10: 'exactly',
 11: 'happened',
 12: 'meThe',
 13: 'first',
 14: 'thing',
 15: 'struck',
 16: 'brutality',
 17: 'unflinching',
 18: 'scenes',
 19: 'violence',
 20: 'set',
 21: 'word',
 22: 'GO',
 23: 'Trust',
 24: 'show',
 25: 'faint',
 26: 'hearted',
 27: 'timid',
 28: 'pulls',
 29: 'punches',
 30: 'regards',
 31: 'drugs',
 32: 'sex',
 33: 'hardcore',
 34: 'classic',
 35: 'use',
 36: 'wordIt',
 37: 'called',
 38: 'OZ',
 39: 'nickname',
 40: 'given',
 41: 'Oswald',
 42: 'Maximum',
 43: 'Security',
 44: 'State',
 45: 'Penitentary',
 46: 'focuses',
 47: 'mainly',
 48: 'Emerald',
 49: 'City',
 50: 'experimental',
 51: 'section',
 52: 'prison',
 53: 'cells',
 54: 'glass',
 55: 'fronts',
 56: 'face',
 57: 'inwards',
 58: 'privacy',
 59: 'high',
 60: 'agenda',
 61: 'Em',
 62: 'home',
 63: 'manyAryans',
 64: 'Muslims',
 65: 'gangstas',
 66: 'Latinos',
 67: 'C

In [13]:
vocab_to_index

{'One': 0,
 'reviewers': 1,
 'mentioned': 2,
 'watching': 3,
 '1': 4,
 'Oz': 5,
 'episode': 6,
 'youll': 7,
 'hooked': 8,
 'right': 9,
 'exactly': 10,
 'happened': 11,
 'meThe': 12,
 'first': 13,
 'thing': 14,
 'struck': 15,
 'brutality': 16,
 'unflinching': 17,
 'scenes': 18,
 'violence': 19,
 'set': 20,
 'word': 21,
 'GO': 22,
 'Trust': 23,
 'show': 24,
 'faint': 25,
 'hearted': 26,
 'timid': 27,
 'pulls': 28,
 'punches': 29,
 'regards': 30,
 'drugs': 31,
 'sex': 32,
 'hardcore': 33,
 'classic': 34,
 'use': 35,
 'wordIt': 36,
 'called': 37,
 'OZ': 38,
 'nickname': 39,
 'given': 40,
 'Oswald': 41,
 'Maximum': 42,
 'Security': 43,
 'State': 44,
 'Penitentary': 45,
 'focuses': 46,
 'mainly': 47,
 'Emerald': 48,
 'City': 49,
 'experimental': 50,
 'section': 51,
 'prison': 52,
 'cells': 53,
 'glass': 54,
 'fronts': 55,
 'face': 56,
 'inwards': 57,
 'privacy': 58,
 'high': 59,
 'agenda': 60,
 'Em': 61,
 'home': 62,
 'manyAryans': 63,
 'Muslims': 64,
 'gangstas': 65,
 'Latinos': 66,
 'Chris

In [14]:
len(vocab_to_index)

256140

In [16]:
result = [[vocab_to_index[word] for word in y] for y in vocab_lst]

In [17]:
result[10]

[624,
 625,
 184,
 626,
 586,
 627,
 628,
 102,
 629,
 630,
 194,
 631,
 632,
 13,
 633,
 92,
 537,
 299,
 634,
 635,
 398,
 636,
 629,
 537,
 637,
 638,
 639,
 310,
 640,
 78,
 641,
 92,
 284,
 233,
 642,
 643,
 644,
 645,
 310,
 81,
 84,
 646,
 647,
 648,
 649,
 325,
 650,
 651,
 652,
 373,
 653]

In [18]:
# KOREAN : https://github.com/e9t/nsmc
# ENGLISH : https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/kernels