Create dataset from cleaned heartless dataset

Select a game for testing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from pathlib import Path
import random

# Seed Python's built-in RNG. Nothing in this cell draws from it: the
# dataset is used in full, without sampling.
random.seed(13)

dataset_heartless_path = Path('../../dataset/sa/dataset_cleaned_heartless.pkl').resolve()

# Load the pickled frame and normalise it in one chain:
#   1. cast review_text to str
#      NOTE(review): missing values, if any, would presumably become literal
#      strings after this cast -- confirm the upstream cleaning removed them.
#   2. drop fully duplicated rows, keeping the first occurrence
#      NOTE(review): the comparison covers every column, including 'index';
#      if 'index' is unique per row this step cannot remove anything -- verify.
#   3. remap review_score -1 -> 0, so that 0 = negative and 1 = positive
dataset = (
    pd.read_pickle(dataset_heartless_path)
    .assign(review_text=lambda frame: frame['review_text'].astype('str'))
    .drop_duplicates(keep='first')
    .assign(review_score=lambda frame: frame['review_score'].replace(-1, 0))
)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4891928 entries, 0 to 4891927
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
dtypes: int64(4), object(2)
memory usage: 223.9+ MB


Check the unique games and rank them by number of reviews

In [2]:
# Rank the games by how many reviews each one has.

# distinct app ids present in the dataset
unique_games_id = dataset['app_id'].unique()

# one row per (app_id, app_name) pair with its review count,
# ordered from most-reviewed to least-reviewed
unique_games_review_count = (
    dataset
    .groupby(['app_id', 'app_name'])
    .size()
    .reset_index(name='review_count')
    .sort_values(by=['review_count'], ascending=False)
)

# the ten most-reviewed games
top_10_games = unique_games_review_count.head(10)

top_10_games


Unnamed: 0,app_id,app_name,review_count
1158,105600,Terraria,81776
1528,218620,PAYDAY 2,78327
22,570,Dota 2,66635
2094,252950,Rocket League,51209
5776,391540,Undertale,49288
21,550,Left 4 Dead 2,47680
1693,230410,Warframe,45278
23,620,Portal 2,37775
2510,271590,Grand Theft Auto V,37106
501,22380,Fallout: New Vegas,30823


In [5]:
# Save the reviews of each of the top 10 games to its own pkl file.
# Files are named '<rank>_<app_name>.pkl', where rank is the zero-padded
# position of the game in top_10_games.

# itertuples(index=False): the frame's index is not needed here, only the
# app_id / app_name columns of each row.
for n, game in enumerate(top_10_games.itertuples(index=False)):
    app_id = game.app_id
    app_name = game.app_name
    print(f'Processing {app_name}...')

    # all reviews that belong to this game
    game_reviews = dataset.loc[dataset['app_id'] == app_id]

    # NOTE(review): app_name goes into the file name unmodified, so characters
    # such as ':' (e.g. 'Fallout: New Vegas') end up in the path. Confirm that
    # every target filesystem accepts them before sanitising, since the
    # readers of these files may rely on the current names.
    save_path = Path(f'../topic_modelling/top_10_games/{n:02}_{app_name}.pkl').resolve()

    # exist_ok=True creates the directory if needed and is a no-op otherwise,
    # replacing the separate exists() check.
    save_path.parent.mkdir(parents=True, exist_ok=True)

    game_reviews.to_pickle(save_path)

Processing Terraria...
Processing PAYDAY 2...
Processing Dota 2...
Processing Rocket League...
Processing Undertale...
Processing Left 4 Dead 2...
Processing Warframe...
Processing Portal 2...
Processing Grand Theft Auto V...
Processing Fallout: New Vegas...
