## Open the dataset

In [28]:
# pandas supplies the DataFrame used throughout the notebook
import pandas as pd

# Load the top-animated-movies CSV into a DataFrame.
# NOTE(review): this is an absolute path on one machine; anyone else running the
# notebook has to point it at their own copy of TopAnimatedImDb.csv.
df = pd.read_csv('C:\\Users\\himav\\4. Text Based Analysis\\TBA Assignment\\TopAnimatedImDb.csv')

# Preview 10 random rows. The fixed random_state makes the preview identical on
# every Restart & Run All instead of changing with each execution.
df.sample(10, random_state=42)

Unnamed: 0,Title,Rating,Votes,Gross,Genre,Metascore,Certificate,Director,Year,Description,Runtime
24,WolfWalkers,8.0,31618,,"Adventure, Family",87.0,UA,Tomm Moore,2020,['\nA young apprentice hunter and her father j...,103 min
47,Big Hero 6,7.8,456307,,"Action, Adventure",74.0,U,Don Hall,2014,['\nA special bond develops between plus-sized...,102 min
7,Mononoke-hime,8.4,388925,$2.38M,"Adventure, Fantasy",76.0,U,Hayao Miyazaki,1997,"[""\nOn a journey to find the cure for a Tatari...",134 min
2,Hotaru no haka,8.5,272469,,"Drama, War",94.0,U,Isao Takahata,1988,['\nA young boy and his little sister struggle...,89 min
70,"South Park: Bigger, Longer & Uncut",7.7,204110,$52.04M,"Comedy, Fantasy",73.0,A,Trey Parker,1999,['\nWhen Stan Marsh and his friends go see an ...,81 min
20,Ratatouille,8.1,721458,$206.45M,"Adventure, Comedy",96.0,U,Brad Bird,2007,['\nA rat who can cook makes an unusual allian...,111 min
28,Song of the Sea,8.0,58124,,"Adventure, Drama",85.0,PG,Tomm Moore,2014,"['\nBen, a young Irish boy, and his little sis...",93 min
55,Kubo and the Two Strings,7.7,128540,$48.02M,"Action, Adventure",84.0,U,Travis Knight,2016,['\nA young boy named Kubo must locate a magic...,101 min
52,Jûbê ninpûchô,7.8,37510,,"Action, Adventure",,A,Yoshiaki Kawajiri,1993,['\nA vagabond swordsman is aided by a beautif...,94 min
54,Majo no takkyûbin,7.8,143013,,"Adventure, Family",83.0,U,Hayao Miyazaki,1989,"['\nA young witch, on her mandatory year of in...",103 min


## Get the Bing Liu lexicon

In [29]:
# scikit-learn preprocessing utilities
from sklearn import preprocessing

# Natural Language Toolkit
import nltk

# Fetch the opinion lexicon corpus
nltk.download('opinion_lexicon')

# Corpus reader for the lexicon, plus the tokenizer used later for scoring
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize

# Report how many words the lexicon holds in total
lexicon_size = len(opinion_lexicon.words())
print('Total number of words in opinion lexicon', lexicon_size)

# Show the first ten entries of the positive list, then of the negative list
for label, reader in (
    ('Examples of positive words in opinion lexicon', opinion_lexicon.positive),
    ('Examples of negative words in opinion lexicon', opinion_lexicon.negative),
):
    print(label, reader()[:10])

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']


[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\himav\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


## Create the "Dictionary"

In [30]:
# Build the word -> score lookup that is used to score each text

# Fetch the 'punkt' NLTK package (a no-op when it is already up to date)
nltk.download('punkt')

# The scoring cells read the text from a column called 'text',
# so rename the 'Description' column accordingly
df.rename(columns={"Description": "text"}, inplace=True)

# A positive lexicon word counts +1, a negative lexicon word counts -1
pos_score = 1
neg_score = -1

# Positive entries are laid down first and negative entries second, so a word
# that appears in both lists ends up with the negative score.
word_dict = {
    **dict.fromkeys(opinion_lexicon.positive(), pos_score),
    **dict.fromkeys(opinion_lexicon.negative(), neg_score),
}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\himav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Function to compute the sentiment score

In [31]:
def bing_liu_score(text):
    """Return the net sentiment score of ``text`` using ``word_dict``.

    The text is lower-cased and split into tokens with ``word_tokenize``.
    Every token that is a key of ``word_dict`` contributes its stored score;
    tokens that are not in the dictionary contribute nothing.

    Args:
        text: The string to score.

    Returns:
        The sum of the scores of all matched tokens (0 when nothing matches).
    """
    tokens = word_tokenize(text.lower())
    # dict.get with a default of 0 makes unknown tokens neutral
    return sum(word_dict.get(token, 0) for token in tokens)

## Ensure every text is a non-empty string, then call the function

In [32]:
# Replace missing values in the 'text' column with the placeholder 'no review'
# so that every entry passed to bing_liu_score is a string.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['text']: the in-place form acts on a selected
# column, which pandas 2.x reports as chained assignment and which leaves df
# unchanged once Copy-on-Write is enabled.
df['text'] = df['text'].fillna('no review')

# Score every text and store the result in a new 'Bing_Liu_Score' column
df['Bing_Liu_Score'] = df['text'].apply(bing_liu_score)

## Output

In [33]:
# Show the rating, the text and its sentiment score side by side for the first 10 rows
df.loc[:, ['Rating', 'text', 'Bing_Liu_Score']].head(10)

Unnamed: 0,Rating,text,Bing_Liu_Score
0,8.6,"[""\nDuring her family's move to the suburbs, a...",-1
1,8.5,['\nLion prince Simba and his father are targe...,-1
2,8.5,['\nA young boy and his little sister struggle...,-1
3,8.4,['\nTwo strangers find themselves linked in a ...,-1
4,8.4,['\nTeen Miles Morales becomes the Spider-Man ...,-1
5,8.4,"[""\nAspiring musician Miguel, confronted with ...",0
6,8.4,"['\nIn the distant future, a small waste-colle...",0
7,8.4,"[""\nOn a journey to find the cure for a Tatari...",0
8,8.3,['\n78-year-old Carl Fredricksen travels to Pa...,0
9,8.3,"[""\nThe toys are mistakenly delivered to a day...",0


## Calculating mean

In [34]:
# Average 'Bing_Liu_Score' within each distinct 'Rating' value
# (one row per rating, a single 'Bing_Liu_Score' column)
df.groupby('Rating')[['Bing_Liu_Score']].mean()

Unnamed: 0_level_0,Bing_Liu_Score
Rating,Unnamed: 1_level_1
7.6,0.0
7.7,-0.35
7.8,-0.909091
7.9,-0.166667
8.0,-0.428571
8.1,-1.333333
8.2,-3.0
8.3,0.333333
8.4,-0.4
8.5,-1.0
