Prompt engineering 27-09-2023 ver

Extract a subset of data from large dataset.

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import random

random.seed(13)
# take 15% for both testing and training from original. Then among that 15%, 70% for training, 30% for testing.
p = 0.15

# NOTICE: change the dataset path to where your dataset.csv is.
dataset_path = Path('../fyp-workspace/dataset.csv')

# dataset = pd.read_csv(dataset_path, skiprows=lambda i: i > 0 and random.random() > p)
# dataset.head()

dataset = pd.read_csv(dataset_path)
dataset

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Ruined my life.,1,0
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,10,Counter-Strike,This game saved my virginity.,1,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1
...,...,...,...,...,...
6417101,99910,Puzzle Pirates,I really ove this game but it needs somethings...,-1,0
6417102,99910,Puzzle Pirates,"Used to play Puzzel Pirates 'way back when', b...",-1,0
6417103,99910,Puzzle Pirates,"This game was aright, though a bit annoying. W...",-1,0
6417104,99910,Puzzle Pirates,"I had a nice review to recommend this game, bu...",-1,0


In [2]:
# notice there are some null values
dataset.isnull().sum()

app_id               0
app_name        183234
review_text       7305
review_score         0
review_votes         0
dtype: int64

In [3]:
# We remove rows that contains null values (for both column app_name and review_text)
dataset = dataset[dataset['app_name'].isnull() == False]

dataset = dataset[dataset['review_text'].isnull() == False]

dataset.isnull().sum()

app_id          0
app_name        0
review_text     0
review_score    0
review_votes    0
dtype: int64

In [4]:
# convert columns to string type

dataset['review_text'] = dataset['review_text'].apply(str, 1)
dataset['app_name'] = dataset['app_name'].apply(str, 1)
dataset.head(10)

  dataset['review_text'] = dataset['review_text'].apply(str, 1)
  dataset['app_name'] = dataset['app_name'].apply(str, 1)


Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Ruined my life.,1,0
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,10,Counter-Strike,This game saved my virginity.,1,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1
5,10,Counter-Strike,"No r8 revolver, 10/10 will play again.",1,1
6,10,Counter-Strike,Still better than Call of Duty: Ghosts...,1,1
7,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1
8,10,Counter-Strike,"Counter-Strike: Ok, after 9 years of unlimited...",1,1
9,10,Counter-Strike,Every server is spanish or french. I can now f...,1,0


In [5]:
# we notice that there are some rows onlu contain "Early Access Review" -> not helpful to the analysis
# we remove these rows.

print("before removing:", dataset.shape)

dataset = dataset[dataset['review_text'].str.contains("Early Access Review") == False]

print("after removing:", dataset.shape)

before removing: (6226728, 5)
after removing: (5238690, 5)


There are 988038 rows that contain only "Early Access Review", which is not helpful to the analysis

In [6]:
# remove comments that contain ♥ (means foul language)

dataset_noheart = dataset[dataset['review_text'].str.contains('♥') == False]
print(dataset_noheart.shape)
print(dataset.shape)

(4891928, 5)
(5238690, 5)


In [9]:
# sample randomly for rows
dataset_sampled = dataset_noheart.sample(frac=p, random_state=13)       # keep the same random_state to create the same rows 

dataset_sampled

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
4994864,383870,Firewatch,"Honestly, I sat here for a few minutes to ques...",1,0
4540004,348250,Google Earth VR,Words can not describe how awesome experience ...,1,0
1656212,22610,Alien Breed: Impact,is fun to play if you know others who woulod p...,1,0
2841416,252950,Rocket League,Took away all my items after not playing for 2...,-1,0
4922442,379720,DOOM,I can play this with a really bad CPU (A10-580...,1,0
...,...,...,...,...,...
931947,212070,Star Conflict,Skrim in S P ace. Rift support.,1,0
2209901,239160,Thief,There once was a time when I liked this game. ...,-1,0
2222841,239820,Game Dev Tycoon,This is one of my all time favorite games. I p...,1,0
5910051,506610,Five Nights at Freddy's: Sister Location,This game is amazing!! The only glitch I exper...,1,0


In [10]:
# reset index particularly for creating dataset
dataset_sampled.reset_index(inplace=True)
dataset_sampled = dataset_sampled.rename(columns={'index': 'dataset_index'})
dataset_sampled = dataset_sampled.sort_values(by=['dataset_index'])
dataset_sampled.reset_index(inplace=True, drop=True)

dataset_sampled

Unnamed: 0,dataset_index,app_id,app_name,review_text,review_score,review_votes
0,0,10,Counter-Strike,Ruined my life.,1,0
1,7,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1
2,10,10,Counter-Strike,Fire in the Hole Simulator 1999,1,0
3,17,10,Counter-Strike,"When you crouch, you lift your feet in the air...",1,1
4,46,10,Counter-Strike,With only ~4000 hours of playtime (plus a few ...,1,1
...,...,...,...,...,...,...
733784,6417083,99910,Puzzle Pirates,really boring game,-1,0
733785,6417089,99910,Puzzle Pirates,"Rating based on current state of play, as per ...",-1,0
733786,6417098,99910,Puzzle Pirates,This game is good but also horrible. Its fun t...,-1,0
733787,6417100,99910,Puzzle Pirates,This game is good but also horrible. Its fun t...,-1,0


In [14]:
# save the modified dataset as pickle
import datetime


filename = f'dataset_cleaned_heartless_sampled_{datetime.datetime.now().strftime("%Y%m%d")}.pkl'
save_path = Path(filename).resolve()
dataset_sampled.to_pickle(save_path)
print(f"Saved to: {save_path}")

Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/hkuchatgpt_bot/dataset_cleaned_heartless_sampled_20230927.pkl


### Optional

Saving the cleaned, heartless dataset to csv and pickle.

In [12]:
# optional

# save the cleaned dataset
dataset_noheart_tosave = dataset_noheart.reset_index()


filename = 'dataset_cleaned_heartless.csv'
save_path = Path(filename).resolve()
dataset_noheart_tosave.to_csv(save_path)

print(f"Saved to: {save_path}")

Saved to: dataset_cleaned_heartless.csv


In [15]:
filename_pkl = 'dataset_cleaned_heartless.pkl'
save_path_pkl = Path(filename_pkl).resolve()
dataset_noheart_tosave.to_pickle(save_path_pkl)

print(f"Saved to: {save_path_pkl}")

Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/hkuchatgpt_bot/dataset_cleaned_heartless.pkl


------

Creating portions of the dataset (3000 per slices, create three slices)

In [31]:
import pandas as pd
from pathlib import Path
import random

dataset_sampled_path = Path("dataset_cleaned_heartless_sampled_20230927.pkl")
dataset_sampled = pd.read_pickle(dataset_sampled_path)

dataset_sampled

Unnamed: 0,dataset_index,app_id,app_name,review_text,review_score,review_votes
0,0,10,Counter-Strike,Ruined my life.,1,0
1,7,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1
2,10,10,Counter-Strike,Fire in the Hole Simulator 1999,1,0
3,17,10,Counter-Strike,"When you crouch, you lift your feet in the air...",1,1
4,46,10,Counter-Strike,With only ~4000 hours of playtime (plus a few ...,1,1
...,...,...,...,...,...,...
733784,6417083,99910,Puzzle Pirates,really boring game,-1,0
733785,6417089,99910,Puzzle Pirates,"Rating based on current state of play, as per ...",-1,0
733786,6417098,99910,Puzzle Pirates,This game is good but also horrible. Its fun t...,-1,0
733787,6417100,99910,Puzzle Pirates,This game is good but also horrible. Its fun t...,-1,0


In [35]:
# we shift them in random order.

dataset_sampled_suffled = dataset_sampled.sample(frac=1, random_state=42)   # add random_state for reproducability
dataset_sampled_suffled

Unnamed: 0,dataset_index,app_id,app_name,review_text,review_score,review_votes
272004,2265312,240760,Wasteland 2,If you're hoping for anything like fallout 1 &...,-1,1
491809,4343173,333930,Dirty Bomb,The version of Brink that doesn't suck,1,1
549814,4866998,374320,DARK SOULS™ III,"oh my got, 10 casuls of 10. now get gut",1,1
60739,456459,200510,XCOM: Enemy Unknown,A very engaging turn pased stratigy game. Any ...,1,0
308277,2630163,250320,The Wolf Among Us,9.5/10 Great story. word can't do this one jus...,1,0
...,...,...,...,...,...,...
721406,6324774,8870,BioShock Infinite,Absolutely Amazing Game! Definitely a must pla...,1,0
678684,6016504,56400,"Warhammer 40,000: Dawn of War II - Retribution",Really good real time strategy game. This gam...,1,0
162129,1262311,219990,Grim Dawn,Right now I play nothing else. It's that good....,1,0
729820,6384679,94400,Nidhogg,"Simple, yet endlessly playable. Also, a fantas...",1,0


In [38]:
chunk_size = 3000

def slice_dataframe(df, chunk_size):
    '''Creating smaller chunks of dataframes. Remaining rows will be put in the last chunk
    
    params:
    df: the large dataframe
    chunk_size: number of rows in each smaller chunk'''
    num_rows = len(df)
    num_chunks = num_rows // chunk_size
    remaining_rows = num_rows // chunk_size

    for i in range(num_chunks):
        start = i * chunk_size
        end = min((i + 1) * chunk_size, num_rows)
        yield df[start:end]

    if remaining_rows > 0:
        yield df[num_chunks * chunk_size:]

sliced_dfs = [df for df in slice_dataframe(dataset_sampled_suffled, chunk_size)]

# reset index (for HKU ChatGPT)
sliced_dfs = list(map(lambda df: df.reset_index(drop=True), sliced_dfs))


In [40]:
sliced_dfs[0]

Unnamed: 0,dataset_index,app_id,app_name,review_text,review_score,review_votes
0,2265312,240760,Wasteland 2,If you're hoping for anything like fallout 1 &...,-1,1
1,4343173,333930,Dirty Bomb,The version of Brink that doesn't suck,1,1
2,4866998,374320,DARK SOULS™ III,"oh my got, 10 casuls of 10. now get gut",1,1
3,456459,200510,XCOM: Enemy Unknown,A very engaging turn pased stratigy game. Any ...,1,0
4,2630163,250320,The Wolf Among Us,9.5/10 Great story. word can't do this one jus...,1,0
...,...,...,...,...,...,...
2995,836478,210770,Sanctum 2,It's really fun - but pretty much impossible w...,1,1
2996,3583526,290340,Armello,You play one charakter with unique abilites li...,1,0
2997,2317736,242050,Assassin's Creed IV Black Flag,Best 'Pirates of the Caribbean'-game on the ma...,1,1
2998,3566527,289130,ENDLESS™ Legend,I've been taking care of a waffle under my bed...,1,1


In [41]:
chunk_folder = Path(dataset_sampled_path.stem + '/')
if not chunk_folder.exists():
    chunk_folder.mkdir()

for i, sliced_df in enumerate(sliced_dfs):
    filename = f'{dataset_sampled_path.stem}_chunk_{i:03}.pkl'
    save_path = chunk_folder.joinpath(filename).resolve()
    sliced_df.to_pickle(save_path)
    print(f"Saved to: {save_path}")

Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/hkuchatgpt_bot/dataset_cleaned_heartless_sampled_20230927/dataset_cleaned_heartless_sampled_20230927_chunk_000.pkl
Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/hkuchatgpt_bot/dataset_cleaned_heartless_sampled_20230927/dataset_cleaned_heartless_sampled_20230927_chunk_001.pkl
Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/hkuchatgpt_bot/dataset_cleaned_heartless_sampled_20230927/dataset_cleaned_heartless_sampled_20230927_chunk_002.pkl
Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/hkuchatgpt_bot/dataset_cleaned_heartless_sampled_20230927/dataset_cleaned_heartless_sampled_20230927_chunk_003.pkl
Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/hkuchatgpt_bot/dataset_cleaned_heartless_sampled_20230927/dataset_cleaned_heartless_sampled_20230927_chunk_004.pkl
Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/hkuchatgpt_bot/dataset_cleaned_heartless_