# Sentiment Algorithm

Here we gonna do an algorithm that show the most frequent and significant words for a positive review (4 to 5 stars) and negative review (0 to 2)

In [204]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Luan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Luan\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [6]:
import pandas as pd
import numpy as np
import string
import re
from nltk.corpus import stopwords # Need to install
from nltk.corpus import words as words_nltk

In [7]:
data = pd.read_csv('dataset/Reviews_t.csv')

data = data[(data['Score']<3) | (data['Score']>3)]

Getting the frequency of words in positive reviews and negative reviews

In [8]:
low_dict = {}
high_dict = {}

stop_words = set(stopwords.words('english')) # Preset to remove "stop words" like "before, and, i, again"
english_words = set(words_nltk.words()) # Preset to remove words that are not present in the english language
translator = str.maketrans('', '', string.punctuation) # Preset to remove punctuation

for index, row in data.iterrows():
    r_text = row['Text']

    r_text = r_text.translate(translator) # Remove punctuation
    words = np.array(r_text.split())

    r_text = re.sub(r'\d+', '', r_text) # Remove any number

    mask = np.vectorize(lambda w: w.lower() not in stop_words)(words)
    words = words[mask]

    mask = np.vectorize(lambda w: w.lower() in english_words)(words)
    words = words[mask]
    
    words = np.char.capitalize(words)
    
    low = row['Score'] < 3
    
    for w in words:
        w = w.capitalize()
        if low:
            low_dict[w] = low_dict.get(w, 0) + 1 # Similar to "setdefault"
        else:
            high_dict[w] = high_dict.get(w, 0) + 1

Getting the most significant words using a ratio methods, the lower the value, the most it appear only in negative reviews. The bigger the value, the most it appear in positive reviews.

In [9]:
word_scores = {}
all_words = set(low_dict.keys()).union(high_dict.keys())

for word in all_words:
    low_count = low_dict.get(word, 0)
    high_count = high_dict.get(word, 0)

    score = (high_count + 1) / (low_count + 1)  # Frequency ratio
    word_scores[word] = score

Creating a pd.Series

In [10]:
low_dict
low_df = pd.DataFrame(list(low_dict.items()), columns=['Word', 'CountForNegative'])
high_df = pd.DataFrame(list(high_dict.items()), columns=['Word', 'CountForPositive'])
all_df = pd.DataFrame(list(word_scores.items()), columns=['Word', 'WordRatio'])

combined = pd.merge(low_df, high_df, on='Word', how='outer')
combined = combined.fillna(0)
combined = pd.merge(combined, all_df, on='Word', how='left')

combined['TotalCount'] = combined['CountForPositive'] + combined['CountForNegative']

sentiment_df = combined[['Word', 'CountForPositive', 'CountForNegative', 'TotalCount', 'WordRatio']] # Reorder

Finally, here we have the final dataframe with all the words frequency for positive or negative.

In [21]:
sentiment_df.index.name = 'Word_Id'
sentiment_df['WordRatio'] = sentiment_df['WordRatio'].round(4)
sentiment_df

Unnamed: 0_level_0,Word,CountForPositive,CountForNegative,TotalCount,WordRatio
Word_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Aa,78.0,17.0,95.0,4.3889
1,Aardvark,1.0,0.0,1.0,2.0000
2,Aback,69.0,8.0,77.0,7.7778
3,Abalone,3.0,2.0,5.0,1.3333
4,Abandon,45.0,11.0,56.0,3.8333
...,...,...,...,...,...
27201,Zorro,1.0,0.0,1.0,2.0000
27202,Zoster,1.0,0.0,1.0,2.0000
27203,Zowie,11.0,0.0,11.0,12.0000
27204,Zucchini,179.0,24.0,203.0,7.2000


Saving the dataframe to csv...

In [23]:
sentiment_df.to_csv('dataset/Reviews_sentiment.csv')

In [25]:
sentiment_df.describe()

Unnamed: 0,CountForPositive,CountForNegative,TotalCount,WordRatio
count,27206.0,27206.0,27206.0,27206.0
mean,493.513012,103.338418,596.85143,4.878683
std,3802.516381,773.706104,4523.34188,6.090526
min,0.0,0.0,1.0,0.005
25%,2.0,0.0,2.0,2.0
50%,8.0,2.0,10.0,3.0
75%,54.0,13.0,69.0,6.0
max,182182.0,43673.0,225855.0,158.0
