In [4]:
import os
import re
import string as st

import nltk
import pandas as pd
from nltk import PorterStemmer, WordNetLemmatizer
# Fetch the NLTK resources used further down: the stop-word lists read by
# remove_stopwords() and the WordNet data behind WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\heraj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\heraj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# Location of the reviews CSV. Set the REVIEWS_CSV environment variable to point
# the notebook at another copy of the dataset; the original local path is kept
# as the fallback so an existing setup keeps working unchanged.
DATA_PATH = os.environ.get(
    "REVIEWS_CSV",
    r"C:\Users\heraj\OneDrive\Desktop\Dataset (1).csv",
)
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
# (rows, columns) of the loaded frame.
data.shape

(50000, 2)

In [12]:
# Remove all punctuation from the text

def remove_punct(text):
    """Return ``text`` with every character found in ``string.punctuation`` dropped.

    Characters are deleted, not replaced, so the text on either side of a
    punctuation mark is joined directly (``"there's"`` becomes ``"theres"``).
    """
    kept = []
    for char in text:
        if char in st.punctuation:
            continue
        kept.append(char)
    return "".join(kept)

In [13]:
# Strip punctuation from each review and keep the result in its own column.
data['removed_punc'] = data['review'].apply(remove_punct)
data.head()

Unnamed: 0,review,sentiment,removed_punc
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production br br The filmin...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,Basically theres a family where a little boy J...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Matteis Love in the Time of Money is a ...


In [14]:
# Convert text to lower case tokens. Here, split() is applied on white-spaces. 

def tokenize(text):
    """Split ``text`` on runs of whitespace and return the lower-cased tokens.

    ``str.split()`` with no separator is used instead of ``re.split`` so that
    leading/trailing whitespace and empty input produce no empty-string tokens.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order; ``[]`` for empty or
        whitespace-only input.
    """
    return [token.lower() for token in text.split()]

In [15]:
# Turn each punctuation-free review into a list of lower-case tokens.
data['tokens'] = data['removed_punc'].apply(tokenize)
data.head()

Unnamed: 0,review,sentiment,removed_punc,tokens
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production br br The filmin...,"[a, wonderful, little, production, br, br, the..."
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,Basically theres a family where a little boy J...,"[basically, theres, a, family, where, a, littl..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Matteis Love in the Time of Money is a ...,"[petter, matteis, love, in, the, time, of, mon..."


In [16]:
# Remove short tokens: by default every token of length 3 or less is dropped.
def remove_small_words(text, min_length=4):
    """Keep only the tokens that are at least ``min_length`` characters long.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    min_length : int, optional
        Minimum length a token needs to be kept. The default of 4 matches the
        previous hard-coded ``len(x) > 3`` check.

    Returns
    -------
    list of str
        The surviving tokens in their original order.
    """
    return [token for token in text if len(token) >= min_length]

In [17]:
# Drop the short tokens from every token list.
data['larger_tokens'] = data['tokens'].apply(remove_small_words)
data.head()

Unnamed: 0,review,sentiment,removed_punc,tokens,larger_tokens
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...","[other, reviewers, mentioned, that, after, wat..."
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production br br The filmin...,"[a, wonderful, little, production, br, br, the...","[wonderful, little, production, filming, techn..."
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, this, wonderful, spend, time, summer..."
3,Basically there's a family where a little boy ...,negative,Basically theres a family where a little boy J...,"[basically, theres, a, family, where, a, littl...","[basically, theres, family, where, little, jak..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Matteis Love in the Time of Money is a ...,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,..."


In [18]:
# Remove stopwords.
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    stop_words : collection of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    list of str
        Tokens not present in the stop-word collection, in original order.
    """
    if stop_words is None:
        # Load the NLTK list once and cache it on the function as a frozenset:
        # the previous version re-read the corpus and scanned a list for every
        # single token, which dominates the runtime on a large frame.
        cached = getattr(remove_stopwords, "_english_stopwords", None)
        if cached is None:
            cached = frozenset(nltk.corpus.stopwords.words('english'))
            remove_stopwords._english_stopwords = cached
        stop_words = cached
    return [word for word in text if word not in stop_words]

In [19]:
# Filter the stop words out of every token list.
data['clean_tokens'] = data['larger_tokens'].apply(remove_stopwords)
data.head()

Unnamed: 0,review,sentiment,removed_punc,tokens,larger_tokens,clean_tokens
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...","[other, reviewers, mentioned, that, after, wat...","[reviewers, mentioned, watching, episode, youl..."
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production br br The filmin...,"[a, wonderful, little, production, br, br, the...","[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn..."
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, this, wonderful, spend, time, summer...","[thought, wonderful, spend, time, summer, week..."
3,Basically there's a family where a little boy ...,negative,Basically theres a family where a little boy J...,"[basically, theres, a, family, where, a, littl...","[basically, theres, family, where, little, jak...","[basically, theres, family, little, jake, thin..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Matteis Love in the Time of Money is a ...,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,..."


In [20]:
# Apply stemming to get root words 
def stemming(text):
    """Return a list with the Porter stem of every token in ``text``.

    A fresh ``PorterStemmer`` is created on each call; the order of the
    tokens is preserved.
    """
    stem = PorterStemmer().stem
    return list(map(stem, text))

In [21]:
# Stem the cleaned tokens of every review.
data['stem_words'] = data['clean_tokens'].apply(stemming)
data.head()

Unnamed: 0,review,sentiment,removed_punc,tokens,larger_tokens,clean_tokens,stem_words
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...","[other, reviewers, mentioned, that, after, wat...","[reviewers, mentioned, watching, episode, youl...","[review, mention, watch, episod, youll, hook, ..."
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production br br The filmin...,"[a, wonderful, little, production, br, br, the...","[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn...","[wonder, littl, product, film, techniqu, unass..."
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, this, wonderful, spend, time, summer...","[thought, wonderful, spend, time, summer, week...","[thought, wonder, spend, time, summer, weekend..."
3,Basically there's a family where a little boy ...,negative,Basically theres a family where a little boy J...,"[basically, theres, a, family, where, a, littl...","[basically, theres, family, where, little, jak...","[basically, theres, family, little, jake, thin...","[basic, there, famili, littl, jake, think, the..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Matteis Love in the Time of Money is a ...,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,...","[petter, mattei, love, time, money, visual, st..."


In [22]:
# Apply lemmatization on tokens
def lemmatize(text):
    """Return a list with the WordNet lemma of every token in ``text``.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` without a
    part-of-speech argument, so the lemmatizer's default applies.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [23]:
# Lemmatize the cleaned tokens of every review.
data['lemma_words'] = data['clean_tokens'].apply(lemmatize)
data.head()

Unnamed: 0,review,sentiment,removed_punc,tokens,larger_tokens,clean_tokens,stem_words,lemma_words
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...","[other, reviewers, mentioned, that, after, wat...","[reviewers, mentioned, watching, episode, youl...","[review, mention, watch, episod, youll, hook, ...","[reviewer, mentioned, watching, episode, youll..."
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production br br The filmin...,"[a, wonderful, little, production, br, br, the...","[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn...","[wonder, littl, product, film, techniqu, unass...","[wonderful, little, production, filming, techn..."
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, this, wonderful, spend, time, summer...","[thought, wonderful, spend, time, summer, week...","[thought, wonder, spend, time, summer, weekend...","[thought, wonderful, spend, time, summer, week..."
3,Basically there's a family where a little boy ...,negative,Basically theres a family where a little boy J...,"[basically, theres, a, family, where, a, littl...","[basically, theres, family, where, little, jak...","[basically, theres, family, little, jake, thin...","[basic, there, famili, littl, jake, think, the...","[basically, there, family, little, jake, think..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Matteis Love in the Time of Money is a ...,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,...","[petter, mattei, love, time, money, visual, st...","[petter, matteis, love, time, money, visually,..."


In [24]:
# Create sentences to get clean text as input for vectors

def return_sentences(tokens):
    """Join ``tokens`` into one string, separated by single spaces."""
    separator = " "
    return separator.join(tokens)

In [25]:
# Rebuild one space-separated string per review from its lemmatized tokens.
data['clean_text'] = data['lemma_words'].apply(return_sentences)
data.head()

Unnamed: 0,review,sentiment,removed_punc,tokens,larger_tokens,clean_tokens,stem_words,lemma_words,clean_text
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...","[other, reviewers, mentioned, that, after, wat...","[reviewers, mentioned, watching, episode, youl...","[review, mention, watch, episod, youll, hook, ...","[reviewer, mentioned, watching, episode, youll...",reviewer mentioned watching episode youll hook...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production br br The filmin...,"[a, wonderful, little, production, br, br, the...","[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn...","[wonder, littl, product, film, techniqu, unass...","[wonderful, little, production, filming, techn...",wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, this, wonderful, spend, time, summer...","[thought, wonderful, spend, time, summer, week...","[thought, wonder, spend, time, summer, weekend...","[thought, wonderful, spend, time, summer, week...",thought wonderful spend time summer weekend si...
3,Basically there's a family where a little boy ...,negative,Basically theres a family where a little boy J...,"[basically, theres, a, family, where, a, littl...","[basically, theres, family, where, little, jak...","[basically, theres, family, little, jake, thin...","[basic, there, famili, littl, jake, think, the...","[basically, there, family, little, jake, think...",basically there family little jake think there...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Matteis Love in the Time of Money is a ...,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,...","[petter, mattei, love, time, money, visual, st...","[petter, matteis, love, time, money, visually,...",petter matteis love time money visually stunni...
