In [7]:
import pandas as pd
import utils as ut
from textblob import TextBlob

#### Here we perform the requested transformation: the free-text 'review' column of the 'user_reviews' dataset is replaced by a new numeric column called 'sentiment_analysis' (0 = negative, 1 = neutral, 2 = positive).

In [8]:
# Location of the reviews dataset stored under data/generated
path = '../data/generated/reviews.csv'

# Read the CSV into a DataFrame and preview its first two rows
df_reviews = pd.read_csv(filepath_or_buffer=path)
df_reviews.head(n=2)

Unnamed: 0,item_id,recommend,last_edited,review,funny,helpful,posted
0,1250,True,Not specified,Simple yet with great replayability. In my opi...,Not specified,No ratings yet,"Posted November 5, 2011."
1,251610,True,Not specified,I know what you think when you see this title ...,Not specified,15 of 20 people (75%) found this review helpful,"Posted June 24, 2014."


In [9]:
# Summarize the freshly loaded frame before transforming it; judging by the output below, the helper
# reports total rows, fully-null rows, duplicated rows and a per-column dtype/null breakdown.
ut.data_overview(df_reviews)


Total rows:  20823

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,item_id,[<class 'int'>],100.0,20823,0.0,0
1,recommend,[<class 'bool'>],100.0,20823,0.0,0
2,last_edited,[<class 'str'>],100.0,20823,0.0,0
3,review,[<class 'str'>],100.0,20823,0.0,0
4,funny,[<class 'str'>],100.0,20823,0.0,0
5,helpful,[<class 'str'>],100.0,20823,0.0,0
6,posted,[<class 'str'>],100.0,20823,0.0,0


In [10]:
# Function to perform sentiment analysis and assign values according to the scale
def analyze_sentiment(text):
    """Classify a review on a three-level sentiment scale using TextBlob polarity.

    Parameters
    ----------
    text : str
        Review text. Missing values (``None`` / ``NaN``), any other non-string
        value, and empty or whitespace-only strings are treated as neutral
        instead of being passed to TextBlob.

    Returns
    -------
    int
        0 if the polarity is negative, 1 if it is exactly zero (or there is
        no text to analyse), 2 if it is positive.
    """
    # pd.read_csv loads empty cells as NaN (a float) and TextBlob only accepts
    # strings, so guard here rather than letting one bad row abort the whole apply.
    if not isinstance(text, str) or not text.strip():
        return 1  # Neutral sentiment: nothing to analyse

    # Read the polarity once and branch on the local value.
    polarity = TextBlob(text).sentiment.polarity
    if polarity < 0:
        return 0  # Negative sentiment
    if polarity == 0:
        return 1  # Neutral sentiment
    return 2  # Positive sentiment

# Score every review with analyze_sentiment and store the labels in a new 'sentiment_analysis' column
df_reviews['sentiment_analysis'] = df_reviews['review'].map(analyze_sentiment)

#### Quick sanity check: we list some reviews labelled 2 in the 'sentiment_analysis' column. The texts shown are visibly positive, so the labelling appears to be working correctly (this is a spot check, not a full validation).

In [11]:
# Spot-check the labelling: keep only the text and its label, then show the
# positive reviews (label 2) whose index labels fall between 0 and 30 inclusive
filtered = df_reviews.loc[:, ['review', 'sentiment_analysis']]
filtered.loc[filtered['sentiment_analysis'].eq(2)].loc[0:30, :]

Unnamed: 0,review,sentiment_analysis
0,Simple yet with great replayability. In my opi...,2
1,I know what you think when you see this title ...,2
2,This game... is so fun. The fight sequences ha...,2
8,"Amazing, Non-stop action of blowing stuff to b...",2
10,Good Game It Was Very Fun!!!!,2
12,I'm going to keep this somewhat short and swee...,2
13,I've always liked games like Terraria with the...,2
14,Best first person shooter,2
15,"One of the best sequals i have every seen, reb...",2
19,a fun game that gets ridiculously hard on hard...,2


In [12]:
# The text has been scored into 'sentiment_analysis', so remove the original 'review' column in place
df_reviews.drop(labels=['review'], axis='columns', inplace=True)

In [13]:
# Re-run the overview after swapping 'review' for 'sentiment_analysis' to check columns, nulls and duplicates
ut.data_overview(df_reviews)


Total rows:  20823

Total full null rows:  0

Total duplicated rows: 744


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,item_id,[<class 'int'>],100.0,20823,0.0,0
1,recommend,[<class 'bool'>],100.0,20823,0.0,0
2,last_edited,[<class 'str'>],100.0,20823,0.0,0
3,funny,[<class 'str'>],100.0,20823,0.0,0
4,helpful,[<class 'str'>],100.0,20823,0.0,0
5,posted,[<class 'str'>],100.0,20823,0.0,0
6,sentiment_analysis,[<class 'int'>],100.0,20823,0.0,0


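##### The overview above reports 744 duplicated rows. There were none while the 'review' text was present, so these rows are identical only in the remaining columns. We remove them.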

In [14]:
# Drop rows that are identical across all remaining columns (in place; pandas keeps the first occurrence by default).
# NOTE(review): the overview showed 0 duplicated rows while 'review' was present and 744 once it was
# dropped, so these rows may be distinct reviews that merely share metadata and sentiment label.
# Confirm that discarding them is intended before relying on downstream counts.
df_reviews.drop_duplicates(inplace=True)

In [15]:
# Verify the de-duplication: the duplicated-row count reported below should now be 0
ut.data_overview(df_reviews)


Total rows:  20079

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,item_id,[<class 'int'>],100.0,20079,0.0,0
1,recommend,[<class 'bool'>],100.0,20079,0.0,0
2,last_edited,[<class 'str'>],100.0,20079,0.0,0
3,funny,[<class 'str'>],100.0,20079,0.0,0
4,helpful,[<class 'str'>],100.0,20079,0.0,0
5,posted,[<class 'str'>],100.0,20079,0.0,0
6,sentiment_analysis,[<class 'int'>],100.0,20079,0.0,0


#### Export the transformed dataframe to CSV so that it can be processed more efficiently in later steps

In [16]:
# Persist the transformed reviews as CSV, leaving the DataFrame index out of the file
output_path = '../data/generated/reviews_sentiment.csv'
df_reviews.to_csv(path_or_buf=output_path, index=False)