In [1]:
import pandas as pd
import numpy as np
import nltk

# nltk.download('vader_lexicon')

## **Sentiment Analysis for users reviews dataset**

In [2]:
from functions import load_jsongz

# Loading compressed data
data = load_jsongz('./data/user_reviews_c.json.gz', mode='rt', encoding='UTF-8')
# Creating a pandas datframe with it
df_revs = pd.DataFrame(data)
print(df_revs.info())
df_revs.head()

Number of records: 58430
Item type: <class 'dict'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58430 entries, 0 to 58429
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    58430 non-null  object
 1   posted     58430 non-null  object
 2   item_id    58430 non-null  object
 3   recommend  58430 non-null  bool  
 4   review     58430 non-null  object
dtypes: bool(1), object(4)
memory usage: 1.8+ MB
None


Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.
2,76561197970982479,"Posted April 21, 2011.",43110,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,"Posted June 24, 2014.",251610,True,I know what you think when you see this title ...
4,js41637,"Posted September 8, 2013.",227300,True,For a simple (it's actually not all that simpl...


### Preprocessing text - Createing new labeled column.

In [3]:
# Sentiment analyser
from nltk.sentiment import SentimentIntensityAnalyzer

def label_sentiment(text:str, analyser:SentimentIntensityAnalyzer):
    # Get polarity score 
    scores = analyser.polarity_scores(text)
    comp = scores['compound']
    # Defining threshold for labeling
    label = None
    if comp < (-0.05): 
        label = 0
    elif comp > (-0.05) and comp < 0.05: 
        label = 1
    elif comp >= 0.1: 
        label = 2

    # print(f'Sentence: {text}\n',f'scores: {scores}')
    return label
# Creating the new column with label 
# {0: Negative, 1: Neutral, 2: Positive}

In [10]:
# Instance VADER sentiment analyser from nltk
vader = SentimentIntensityAnalyzer()

# New labeld column
df_revs['sentiment'] = df_revs['review'].apply(label_sentiment, analyser=vader)
df_revs['sentiment']

0        2.0
1        2.0
2        2.0
3        2.0
4        2.0
        ... 
58425    2.0
58426    2.0
58427    2.0
58428    2.0
58429    2.0
Name: sentiment, Length: 58430, dtype: float64

In [11]:
df_revs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58430 entries, 0 to 58429
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   user_id    58430 non-null  object 
 1   posted     58430 non-null  object 
 2   item_id    58430 non-null  object 
 3   recommend  58430 non-null  bool   
 4   review     58430 non-null  object 
 5   sentiment  57993 non-null  float64
dtypes: bool(1), float64(1), object(4)
memory usage: 2.3+ MB


In [14]:
df_revs['sentiment'].fillna(value=1, inplace=True)

In [15]:
df_revs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58430 entries, 0 to 58429
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   user_id    58430 non-null  object 
 1   posted     58430 non-null  object 
 2   item_id    58430 non-null  object 
 3   recommend  58430 non-null  bool   
 4   review     58430 non-null  object 
 5   sentiment  58430 non-null  float64
dtypes: bool(1), float64(1), object(4)
memory usage: 2.3+ MB


In [16]:
df_revs[df_revs['review'] == '']

Unnamed: 0,user_id,posted,item_id,recommend,review,sentiment
3088,2ZESTY4ME,Posted March 11.,550,True,,1.0
4590,76561198093337643,"Posted September 19, 2014.",550,True,,1.0
15685,terencemok,"Posted December 30, 2014.",218620,True,,1.0
20037,76561197971285616,Posted March 10.,378041,True,,1.0
21580,shez13,"Posted May 23, 2014.",211820,True,,1.0
21581,shez13,"Posted May 23, 2014.",227320,True,,1.0
22435,damo4lyf,"Posted September 2, 2014.",620,True,,1.0
23055,FastAsACheetah,"Posted September 9, 2013.",239660,True,,1.0
24759,76561198098017317,"Posted April 16, 2015.",730,True,,1.0
24764,76561198072940782,"Posted March 16, 2013.",570,True,,1.0


``NaN`` and ``empty`` values were replaced by 1 (neutral)