In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
import numpy as np
os.chdir('/content/drive/MyDrive/PLP Proj')
# Download required NLTK data
print("Downloading required NLTK resources...")
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
print("NLTK resources downloaded successfully!")


Downloading required NLTK resources...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


NLTK resources downloaded successfully!


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
df = pd.read_csv('dataset.csv')

In [None]:
df['review_text'] = df['review_text'].astype(str)

In [None]:
df_reviews = df[df['review_votes'] == 1]

In [None]:
## Remove the "Early Access Review" comments.
# These are the reviews with no comments writen by a human/reviewer.
df_reviews_1 = df_reviews[df_reviews.review_text != "Early Access Review"]
df_reviews_2 = df_reviews_1[~df_reviews_1.review_text.isin(['nan'])]
df_reviews_2

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1
5,10,Counter-Strike,"No r8 revolver, 10/10 will play again.",1,1
6,10,Counter-Strike,Still better than Call of Duty: Ghosts...,1,1
7,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1
...,...,...,...,...,...
6417017,99910,Puzzle Pirates,Thought i was putting the age of my character ...,-1,1
6417039,99910,Puzzle Pirates,I care not for a godforsaken deckhand. Just be...,-1,1
6417041,99910,Puzzle Pirates,"1.no tutorial 2.gameplay looks to much casual,...",-1,1
6417095,99910,Puzzle Pirates,"A very good game, got sick of it after a while...",-1,1


In [None]:
# Drop duplicates if there is any
df_reviews_3 = df_reviews_2.drop_duplicates(['review_text', 'review_score'])
df_reviews_3

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1
5,10,Counter-Strike,"No r8 revolver, 10/10 will play again.",1,1
6,10,Counter-Strike,Still better than Call of Duty: Ghosts...,1,1
7,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1
...,...,...,...,...,...
6417009,99910,Puzzle Pirates,"Too addictive, spent way too much time on this...",-1,1
6417017,99910,Puzzle Pirates,Thought i was putting the age of my character ...,-1,1
6417039,99910,Puzzle Pirates,I care not for a godforsaken deckhand. Just be...,-1,1
6417041,99910,Puzzle Pirates,"1.no tutorial 2.gameplay looks to much casual,...",-1,1


In [None]:
## Check for special symbols in more than 3000 comments
# import re
# from collections import defaultdict

# char_in_comment_counts = defaultdict(int)

# for review in df_reviews['review_text']:
#     if pd.isna(review):
#         continue
#     unique_chars = set(re.findall(r"[^a-zA-Z0-9\s.,!?']", review))
#     for char in unique_chars:
#         char_in_comment_counts[char] += 1

# char_in_comment_counts_sorted = sorted(char_in_comment_counts.items(), key=lambda x: x[1], reverse=True)

# for char, count in char_in_comment_counts_sorted:
#     print(f"'{char}': {count}")

In [None]:
def clean_text(text):
  text = re.sub(r"[♥]+", " [censored] ", text)
  text = re.sub(r"[͜͡ʖ]+", " [symbol] ", text)
  text = re.sub(r'[\u200c\u200b\u200e\u200f\u202a-\u202e\ufeff\xad]', '', text)
  text = re.sub(r'\s+', ' ', text).strip()
  return text

In [None]:
df_reviews_3['review_text_clean'] = df_reviews_3['review_text'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews_3['review_text_clean'] = df_reviews_3['review_text'].apply(clean_text)


In [None]:
df_reviews_3

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,review_text_clean
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1,This will be more of a ''my experience with th...
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1,"Easy to learn, hard to master."
5,10,Counter-Strike,"No r8 revolver, 10/10 will play again.",1,1,"No r8 revolver, 10/10 will play again."
6,10,Counter-Strike,Still better than Call of Duty: Ghosts...,1,1,Still better than Call of Duty: Ghosts...
7,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1,"cant buy skins, cases, keys, stickers - gaben ..."
...,...,...,...,...,...,...
6417009,99910,Puzzle Pirates,"Too addictive, spent way too much time on this...",-1,1,"Too addictive, spent way too much time on this..."
6417017,99910,Puzzle Pirates,Thought i was putting the age of my character ...,-1,1,Thought i was putting the age of my character ...
6417039,99910,Puzzle Pirates,I care not for a godforsaken deckhand. Just be...,-1,1,I care not for a godforsaken deckhand. Just be...
6417041,99910,Puzzle Pirates,"1.no tutorial 2.gameplay looks to much casual,...",-1,1,"1.no tutorial 2.gameplay looks to much casual,..."


In [None]:
# Rule-based classification
df_reviews_3.loc[(df_reviews_3['review_score'] == 1), 'review_sentiment'] = 'Positive'
df_reviews_3.loc[(df_reviews_3['review_score'] == -1), 'review_sentiment'] = 'Negative'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews_3.loc[(df_reviews_3['review_score'] == 1), 'review_sentiment'] = 'Positive'


In [None]:
df_reviews_3['review_sentiment'].value_counts()

Unnamed: 0_level_0,count
review_sentiment,Unnamed: 1_level_1
Positive,478539
Negative,205568


In [None]:
# Save the cleaned data
df_reviews_3.to_csv('final_data_v3.csv', index=False)

In [4]:
import pandas as pd
import os
os.chdir('/content/drive/MyDrive/PLP Proj')
df = pd.read_csv('final_data_v4.csv')

In [6]:
len(df['app_id'].unique())

8703