In [1]:
import pandas as pd
import numpy as np
import nltk

# nltk.download('vader_lexicon')

# Parent folder MUST be added.
# Otherwise data folder will not be found.
import sys
sys.path.append('../')

## **Sentiment Analysis for users reviews dataset**

In [None]:
df_revs = pd.read_json('../data/user_reviews_c.json.gz', compression='gzip', lines=True)
print(df_revs.info())
df_revs.head()

### Preprocessing text - Createing new labeled column.

In [13]:
# Sentiment analyser
from nltk.sentiment import SentimentIntensityAnalyzer

def label_sentiment(text:str, analyser:SentimentIntensityAnalyzer):
    # Get polarity score 
    scores = analyser.polarity_scores(text)
    comp = scores['compound']
    # Defining threshold for labeling
    label = None
    if comp < (-0.05): 
        label = 0
    elif comp > (-0.05) and comp < 0.05: 
        label = 1
    elif comp >= 0.1: 
        label = 2

    # print(f'Sentence: {text}\n',f'scores: {scores}')
    return label
# Creating the new column with label 
# {0: Negative, 1: Neutral, 2: Positive}

In [14]:
# Instance VADER sentiment analyser from nltk
vader = SentimentIntensityAnalyzer()

# New labeld column
df_revs['sentiment'] = df_revs['review'].apply(label_sentiment, analyser=vader)
df_revs['sentiment']

0        2.0
1        2.0
2        2.0
3        2.0
4        2.0
        ... 
58425    2.0
58426    2.0
58427    2.0
58428    2.0
58429    2.0
Name: sentiment, Length: 58430, dtype: float64

In [15]:
df_revs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58430 entries, 0 to 58429
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   user_id    58430 non-null  object 
 1   item_id    58430 non-null  int64  
 2   recommend  58430 non-null  bool   
 3   review     58430 non-null  object 
 4   sentiment  57993 non-null  float64
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 1.8+ MB


Replace NaN values for neutral sentiment = 1

In [16]:
df_revs['sentiment'].fillna(value=1, inplace=True)

In [17]:
df_revs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58430 entries, 0 to 58429
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   user_id    58430 non-null  object 
 1   item_id    58430 non-null  int64  
 2   recommend  58430 non-null  bool   
 3   review     58430 non-null  object 
 4   sentiment  58430 non-null  float64
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 1.8+ MB


In [18]:
df_revs[df_revs['review'] == ''].head()

Unnamed: 0,user_id,item_id,recommend,review,sentiment
3088,2ZESTY4ME,550,True,,1.0
4590,76561198093337643,550,True,,1.0
15685,terencemok,218620,True,,1.0
20037,76561197971285616,378041,True,,1.0
21580,shez13,211820,True,,1.0


``NaN`` and ``empty`` values were replaced by 1 (neutral)

Saving and Removing the old file

In [19]:
# Final version of dataset without review column
columns = ['user_id', 'item_id', 'recommend', 'sentiment']

# Saving final consumible version of the dataset
# from functions import gzip_json_file
# gzip_json_file(
#     path= './data/reviews.json.gz',
#     df= df_revs,
#     subset= columns
# )

# Saving csv format
df_revs[columns].to_csv('../data/reviews.csv.gz', compression='gzip', index=False)

# Deleting previous reviews file
import os
os.remove('../data/user_reviews_c.json.gz')