In [1]:
import pandas as pd

In [71]:
data = pd.read_csv(r"data/IMDB Dataset.csv", nrows=20)
data.shape

(20, 2)

In [38]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
data[data['review'].duplicated()]

Unnamed: 0,review,sentiment
3537,Quite what the producers of this appalling ada...,negative
3769,My favourite police series of all time turns t...,positive
4391,"Beautiful film, pure Cassavetes style. Gena Ro...",positive


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     5000 non-null   object
 1   sentiment  5000 non-null   object
dtypes: object(2)
memory usage: 78.3+ KB


## Text Preprocessing

In [72]:
# Lower casing
data['review'] = data['review'].str.lower()
data.head(3)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive


In [73]:
# Removing HTML tags
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r"",text)
data['review'] = data['review'].apply(remove_html_tags)
data.review.head()

0    one of the other reviewers has mentioned that ...
1    a wonderful little production. the filming tec...
2    i thought this was a wonderful way to spend ti...
3    basically there's a family where a little boy ...
4    petter mattei's "love in the time of money" is...
Name: review, dtype: object

In [74]:
# Removing URLS
import re
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r"", text)
data['review'] = data['review'].apply(remove_urls)

In [75]:
# Removing punctuation
import string
exclude = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('','',exclude))
data['review'] =  data['review'].apply(remove_punctuation)

In [76]:
# spelling correction
from textblob import TextBlob

def correct_spell(text):
    textblob = TextBlob(text)
    return str(TextBlob(text).correct())


In [77]:
data['review'] = data['review'].apply(correct_spell)
data.review.head(8)

0    one of the other reviews has mentioned that af...
1    a wonderful little production the filling tech...
2    i thought this was a wonderful way to spend ti...
3    basically there a family where a little boy ja...
4    letter matters love in the time of money is a ...
5    probably my alliee favorite movie a story of h...
6    i sure would like to see a resurrection of a u...
7    this show was an amazing fresh  innovative ide...
Name: review, dtype: object

In [78]:
# removing stop words
from nltk.corpus import stopwords
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append("")
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

data['review'] = data['review'].apply(remove_stopwords)
data['review'].head(10)

0    one    reviews  mentioned   watching  1 oz epi...
1     wonderful little production  filling techniqu...
2     thought    wonderful way  spend time    hot s...
3    basically   family   little boy jake thinks   ...
4    letter matters love   time  money   usually st...
5    probably  alliee favorite movie  story  helple...
6     sure would like  see  resurrection    dated s...
7     show   amazing fresh innovative idea      fir...
8    encouraged   positive comments   film     look...
9      like original gut wrenching laughter   like ...
Name: review, dtype: object

In [79]:
# Tokenization
from nltk.tokenize import word_tokenize
data['tokens'] = data['review'].apply(word_tokenize)
data.head()

Unnamed: 0,review,sentiment,tokens
0,one reviews mentioned watching 1 oz epi...,positive,"[one, reviews, mentioned, watching, 1, oz, epi..."
1,wonderful little production filling techniqu...,positive,"[wonderful, little, production, filling, techn..."
2,thought wonderful way spend time hot s...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,basically family little boy jake thinks ...,negative,"[basically, family, little, boy, jake, thinks,..."
4,letter matters love time money usually st...,positive,"[letter, matters, love, time, money, usually, ..."


In [80]:
# Stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def stemmer(tokens):
    return [ps.stem(word) for word in tokens]

data['tokens'] = data['tokens'].apply(stemmer)
data.head()

Unnamed: 0,review,sentiment,tokens
0,one reviews mentioned watching 1 oz epi...,positive,"[one, review, mention, watch, 1, oz, episod, h..."
1,wonderful little production filling techniqu...,positive,"[wonder, littl, product, fill, techniqu, assum..."
2,thought wonderful way spend time hot s...,positive,"[thought, wonder, way, spend, time, hot, summe..."
3,basically family little boy jake thinks ...,negative,"[basic, famili, littl, boy, jake, think, combi..."
4,letter matters love time money usually st...,positive,"[letter, matter, love, time, money, usual, stu..."


## Feature Extraction/ Text Representation

In [81]:
# Number of words in the corpus
corpus = []
for i in data['tokens']:
    corpus.extend(i)
    
print(len(corpus))

1595


In [82]:
# Vocabulary
vocab = list(set(corpus))
print(len(vocab))

923


### Bag of Words

In [83]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [84]:
bow = cv.fit_transform(data['review'])

In [96]:
print(list(cv.vocabulary_.keys()))

['one', 'reviews', 'mentioned', 'watching', 'oz', 'episode', 'hooked', 'right', 'exactly', 'happened', 'metre', 'first', 'thing', 'struck', 'brutally', 'unflinching', 'scenes', 'violence', 'set', 'word', 'go', 'trust', 'show', 'faint', 'hearted', 'timid', 'pulls', 'punched', 'regards', 'drugs', 'sex', 'hardware', 'classic', 'use', 'words', 'called', 'nickname', 'given', 'onward', 'maximum', 'security', 'state', 'penitentiary', 'focused', 'mainly', 'emerald', 'city', 'experimental', 'section', 'prison', 'cells', 'glass', 'fronts', 'face', 'inwards', 'privacy', 'high', 'agenda', 'em', 'home', 'manyaryans', 'muslin', 'gangstas', 'nations', 'christians', 'italians', 'irish', 'snuffles', 'death', 'stares', 'podgy', 'dealings', 'shady', 'agreements', 'never', 'far', 'away', 'would', 'say', 'main', 'appeal', 'due', 'fact', 'goes', 'shows', 'dare', 'forget', 'pretty', 'pictures', 'painted', 'mainstream', 'audiences', 'charm', 'romance', 'mess', 'around', 'ever', 'saw', 'nasty', 'surrey', 'read

In [89]:
bow[0].toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [98]:
# Sum the columns of the BoW matrix to get word counts
word_counts = bow.toarray().sum(axis=0)

# Map the words to their total counts
word_frequency = {word: word_counts[index] for word, index in cv.vocabulary_.items()}

# Print the word frequency
print("Word Frequency:", word_frequency)

Word Frequency: {'one': 25, 'reviews': 1, 'mentioned': 1, 'watching': 5, 'oz': 5, 'episode': 2, 'hooked': 1, 'right': 6, 'exactly': 3, 'happened': 1, 'metre': 1, 'first': 10, 'thing': 4, 'struck': 2, 'brutally': 1, 'unflinching': 1, 'scenes': 5, 'violence': 4, 'set': 2, 'word': 1, 'go': 7, 'trust': 1, 'show': 9, 'faint': 1, 'hearted': 1, 'timid': 1, 'pulls': 1, 'punched': 1, 'regards': 1, 'drugs': 1, 'sex': 2, 'hardware': 1, 'classic': 1, 'use': 3, 'words': 2, 'called': 2, 'nickname': 1, 'given': 1, 'onward': 1, 'maximum': 1, 'security': 1, 'state': 1, 'penitentiary': 1, 'focused': 1, 'mainly': 1, 'emerald': 1, 'city': 3, 'experimental': 1, 'section': 1, 'prison': 3, 'cells': 1, 'glass': 1, 'fronts': 1, 'face': 1, 'inwards': 1, 'privacy': 1, 'high': 3, 'agenda': 2, 'em': 1, 'home': 2, 'manyaryans': 1, 'muslin': 1, 'gangstas': 1, 'nations': 1, 'christians': 1, 'italians': 1, 'irish': 1, 'snuffles': 1, 'death': 1, 'stares': 1, 'podgy': 1, 'dealings': 1, 'shady': 1, 'agreements': 1, 'neve