In [52]:
import numpy as np

In [53]:
import pandas as pd

In [54]:
import ast

In [55]:
# Read the scraped data set (columns: Title, User Reviews) into a DataFrame
raw_csv_path = '../data/data_set_v1.csv'
df = pd.read_csv(raw_csv_path)

In [56]:
# Inspect the column names, dtypes and non-null counts of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         110 non-null    object
 1   User Reviews  110 non-null    object
dtypes: object(2)
memory usage: 1.8+ KB


In [57]:
# Summary statistics per column: a `unique` value lower than `count` means the
# column contains repeated entries, i.e. the data set has duplicates
df.describe()

Unnamed: 0,Title,User Reviews
count,110,110
unique,87,105
top,Interstellar,"['Just wow', 'Ah yes. My first existential cri..."
freq,3,2


In [58]:
# The summary above shows repeated titles, so keep only the first row per title.
# The result is reassigned instead of using inplace=True: the call then returns a
# new frame, can be chained, and does not silently mutate an object shown earlier.
# NOTE(review): the summary reports more unique review lists than unique titles,
# so some repeated titles carry different reviews that are discarded here — confirm
# that keeping only the first occurrence is intended.
df = df.drop_duplicates(subset='Title')

In [59]:
# Describe the frame a second time to confirm that, after removing duplicates,
# every title is unique (`unique` should now equal `count`)
df.describe()

Unnamed: 0,Title,User Reviews
count,87,87
unique,87,87
top,Fast X,"['Excruciatingly Awful', 'What Happened?', 'Th..."
freq,1,1


In [60]:
# Run an info check to see that the review rows still match the title rows
# (both columns should report the same non-null count)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, 0 to 109
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         87 non-null     object
 1   User Reviews  87 non-null     object
dtypes: object(2)
memory usage: 2.0+ KB


In [61]:
# Each 'User Reviews' cell holds the string representation of a Python list.
# Parse it back into a real list and flatten, so every review becomes its own entry.
# explode() turns an empty list into NaN; such placeholders are not text and would
# make word_tokenize fail later, so they are dropped before building the flat list.
user_reviews_data = (
    df['User Reviews']
    .apply(ast.literal_eval)
    .explode()
    .dropna()
    .to_list()
)

In [62]:
# Wrap the flattened reviews in a single-column frame (one review per row)
# and persist it as version 2 of the data set
df = pd.DataFrame(data=user_reviews_data, columns=['User Reviews'])
df.to_csv(path_or_buf='../data/data_set_v2.csv', index=False)

In [107]:
# Using Regular Expression to filter special characters
import re

In [63]:
# Tokenizing 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [65]:
# word_tokenize needs the 'punkt' tokenizer models. Make sure they are available
# before tokenizing, so this cell also works on a fresh kernel or a fresh machine
# regardless of when the download cell is executed. The lookup is local, so nothing
# is downloaded when the models are already installed.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

# Tokenize each review separately, keeping one token list per review
token_list = [word_tokenize(sentence) for sentence in df['User Reviews']]

In [64]:
# Fetch the NLTK resources this notebook relies on: 'punkt' backs word_tokenize and
# 'stopwords' backs stopwords.words("english").
# NOTE(review): 'wordnet' is not used by any cell shown in this notebook so far —
# presumably it is meant for a later step; confirm it is still needed.
# NOTE(review): on a fresh kernel these resources must be present before the cells
# that tokenize or read the stop-word list are run — check the cell order.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/hilario/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hilario/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hilario/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [66]:
# Build the English stop-word lookup; a set gives fast membership tests
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

In [67]:
# Stop-word filtering that keeps the one-list-per-review structure
def remove_stopwords(input_sentence_list):
    """Return the tokens of ``input_sentence_list`` that are not stop words.

    Each token is compared case-insensitively (via ``str.casefold``) against the
    module-level ``stop_words`` set. The surviving tokens are returned in their
    original order and original casing as a new list.
    """
    kept_tokens = []
    for token in input_sentence_list:
        if token.casefold() in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [89]:
# Filter the stop words out of every tokenized review, one result list per review
output_data = list(map(remove_stopwords, token_list))

In [90]:
# Replace the working frame with the stop-word-free token lists (one list per row)
df = pd.DataFrame(data={'User Reviews': output_data})

In [91]:
# Persist version 3 as a pickle so each row's token list is stored as a real list
df.to_pickle(path='../data/data_set_v3.pkl')

In [92]:
# Stemming
from nltk.stem import PorterStemmer

In [98]:
def stem_words(input_sentence_list):
    """Return the Porter stem of every token in ``input_sentence_list``.

    A new ``PorterStemmer`` is created on each call. The result is a new list
    holding one stem per input token, in the same order as the input.
    """
    stem = PorterStemmer().stem
    return list(map(stem, input_sentence_list))

In [99]:
# Reload the stop-word-free token lists written to the version 3 pickle.
# NOTE: unpickling can execute arbitrary code — only load pickle files you trust.
filtered_tokens_path = '../data/data_set_v3.pkl'
df = pd.read_pickle(filtered_tokens_path)

In [95]:
# Stem the tokens of every review, producing one list of stems per review
stemmed_data = list(map(stem_words, df['User Reviews']))

In [105]:
# Replace the working frame with the stemmed token lists (one list per row)
df = pd.DataFrame(data={'Stemmed User Reviews': stemmed_data})

In [106]:
# Persist version 4 in two formats: the pickle keeps each row's list structure ...
df.to_pickle(path='../data/data_set_v4.pkl')
# ... while the CSV copy is only meant for visual inspection of the data
df.to_csv(path_or_buf='../data/data_set_v4.csv', index=False)

In [None]:
# Part of Speech Tagging