In [1]:
import pandas as pd

In [36]:
# Source file and sample size are named so they are easy to find and tune.
DATASET_CSV = r"data/IMDB Dataset.csv"
SAMPLE_ROWS = 100

# Read only the first SAMPLE_ROWS rows, then display (rows, columns).
data = pd.read_csv(DATASET_CSV, nrows=SAMPLE_ROWS)
data.shape

(100, 2)

In [38]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
# Rows whose review text repeats an earlier row (first occurrences are not flagged).
is_repeat = data['review'].duplicated()
data[is_repeat]

Unnamed: 0,review,sentiment
3537,Quite what the producers of this appalling ada...,negative
3769,My favourite police series of all time turns t...,positive
4391,"Beautiful film, pure Cassavetes style. Gena Ro...",positive


In [11]:
# Summarise the frame: index range, column dtypes, non-null counts, memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     5000 non-null   object
 1   sentiment  5000 non-null   object
dtypes: object(2)
memory usage: 78.3+ KB


## Text Preprocessing

In [39]:
# Lower-case all review text; this overwrites the 'review' column.
data['review'] = data['review'].str.lower()
# Show the first three rows to check the result.
data.head(3)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive


In [40]:
# Removing HTML tags
import re
# A run of one or more adjacent tags. `[^>]` also matches newlines, so a tag
# that is split across lines is still recognised.
_HTML_TAG_RE = re.compile(r'(?:<[^>]*>)+')


def remove_html_tags(text):
    """Strip HTML tags from ``text``.

    Every run of adjacent tags is replaced by a single space instead of the
    empty string. Reviews often contain markup such as ``end.<br /><br />the``
    with no surrounding whitespace; deleting the tags outright would fuse the
    neighbouring words into ``end.the``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each run of ``<...>`` tags replaced by one space. Extra
        whitespace this may introduce is not collapsed here.
    """
    return _HTML_TAG_RE.sub(" ", text)
# Run the tag stripper over every review, overwriting the column.
data['review'] = data['review'].apply(remove_html_tags)
# Preview the cleaned text.
data.review.head()

0    one of the other reviewers has mentioned that ...
1    a wonderful little production. the filming tec...
2    i thought this was a wonderful way to spend ti...
3    basically there's a family where a little boy ...
4    petter mattei's "love in the time of money" is...
Name: review, dtype: object

In [41]:
# Removing URLS
import re
def remove_urls(text):
    """Delete web addresses from ``text``.

    Anything beginning with ``http://``, ``https://`` or ``www.`` is removed
    up to the next whitespace character. The rest of the string, including
    the whitespace around the address, is left as it was.
    """
    return re.sub(r'https?://\S+|www\.\S+', "", text)
# Run the URL remover over every review, overwriting the column.
data['review'] = data['review'].apply(remove_urls)

In [42]:
# Removing punctuation
import string
# Characters to delete: the ASCII punctuation set from the string module.
exclude = string.punctuation


def remove_punctuation(text):
    """Return ``text`` with every character found in ``exclude`` deleted.

    Characters are removed outright, not replaced, so ``it's`` becomes
    ``its``. ``exclude`` is read on each call.
    """
    deletion_table = str.maketrans(dict.fromkeys(exclude))
    return text.translate(deletion_table)
# Run the punctuation remover over every review, overwriting the column.
data['review'] =  data['review'].apply(remove_punctuation)

In [43]:
# spelling correction
from textblob import TextBlob

def correct_spell(text):
    """Return ``text`` after TextBlob's spelling correction.

    Args:
        text: The string to correct.

    Returns:
        The corrected text as a plain ``str``.
    """
    # Build the blob once; the earlier version constructed a second,
    # unused TextBlob on every call.
    return str(TextBlob(text).correct())


In [None]:
# Spell-correct every review, overwriting the column.
data['review'] = data['review'].apply(correct_spell)
# Preview the first eight corrected reviews.
data.review.head(8)

In [18]:
# removing stop words
from nltk.corpus import stopwords
# Cache for NLTK's English stop-word list; filled on first use so the corpus
# is read once rather than once per word.
_ENGLISH_STOPWORDS = None


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words removed.

    The text is split on whitespace and each word is compared exactly
    (case-sensitively) against the stop-word collection. Stop words are
    dropped entirely, so the result has no doubled or leading spaces where a
    word was removed.

    Args:
        text: The string to filter.
        stop_words: Optional iterable of words to drop. When omitted, NLTK's
            English stop-word list is used (loaded once and cached).

    Returns:
        The remaining words joined by single spaces; ``""`` if none remain.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            # frozenset gives O(1) membership tests instead of a list scan.
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)
    return " ".join(word for word in text.split() if word not in stop_words)

# Filter stop words out of every review, overwriting the column.
data['review'] = data['review'].apply(remove_stopwords)
# Preview the first ten filtered reviews.
data['review'].head(10)

KeyboardInterrupt: 

In [None]:
# Tokenization
from nltk.tokenize import word_tokenize
# Tokenize every review with NLTK's word_tokenize, overwriting the column
# with the tokenizer's output.
data['review'] = data['review'].apply(word_tokenize)
# Preview the tokenized frame.
data.head()

In [None]:
# Stemming
from nltk.stem import PorterStemmer
# One shared stemmer instance, reused for every review.
ps = PorterStemmer()


def stemmer(tokens):
    """Stem each item of ``tokens`` with the shared Porter stemmer ``ps``.

    Returns a new list holding one stemmed value per input token, in the
    same order.
    """
    return list(map(ps.stem, tokens))

# Stem every review's tokens, overwriting the column.
data['review'] = data['review'].apply(stemmer)
# Preview the stemmed frame.
data.head()