Prompt engineering 27-09-2023 ver

Extract a subset of the data from the large dataset.

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import random

# Seed Python's built-in RNG. In this cell only the commented-out row-skipping loader below
# calls random.random(); pandas' .sample() does not draw from this generator.
random.seed(13)
# Plan: take 15% of the original data for both training and testing.
# Of that 15%, 70% is used for training and 30% for testing.

# NOTICE: change the dataset path to where your dataset.csv is.
dataset_path = Path('../fyp-workspace/dataset.csv').resolve()

# Alternative loader that samples rows while reading (requires a fraction `p` to be defined first):
# dataset = pd.read_csv(dataset_path, skiprows=lambda i: i > 0 and random.random() > p)
# dataset.head()

# Load the whole CSV into memory; raises FileNotFoundError when dataset_path does not exist.
dataset = pd.read_csv(dataset_path)
dataset

FileNotFoundError: [Errno 2] No such file or directory: '..\\fyp-workspace\\dataset.csv'

In [None]:
# Count the missing values in every column (some columns do contain nulls)
dataset.isnull().sum()

app_id               0
app_name        183234
review_text       7305
review_score         0
review_votes         0
dtype: int64

In [None]:
# Drop every row in which app_name or review_text is null, then confirm no nulls are left
dataset = dataset.dropna(subset=['app_name', 'review_text'])

dataset.isnull().sum()

app_id          0
app_name        0
review_text     0
review_score    0
review_votes    0
dtype: int64

In [None]:
# Convert the two text columns to strings.
# - astype(str) replaces apply(str, 1): the stray positional 1 was being passed as
#   Series.apply's `convert_dtype` argument, which is deprecated.
# - assign() builds a new frame instead of writing columns onto the filtered frame
#   from the previous cell, which is what triggered the warning on these two lines.
dataset = dataset.assign(
    review_text=dataset['review_text'].astype(str),
    app_name=dataset['app_name'].astype(str),
)
dataset.head(10)

  dataset['review_text'] = dataset['review_text'].apply(str, 1)
  dataset['app_name'] = dataset['app_name'].apply(str, 1)


Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Ruined my life.,1,0
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,10,Counter-Strike,This game saved my virginity.,1,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1
5,10,Counter-Strike,"No r8 revolver, 10/10 will play again.",1,1
6,10,Counter-Strike,Still better than Call of Duty: Ghosts...,1,1
7,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1
8,10,Counter-Strike,"Counter-Strike: Ok, after 9 years of unlimited...",1,1
9,10,Counter-Strike,Every server is spanish or french. I can now f...,1,0


In [None]:
# Some rows carry the placeholder text "Early Access Review", which does not help the analysis,
# so those rows are removed.
# NOTE(review): str.contains drops every review that mentions the phrase anywhere,
# not just reviews consisting solely of it - confirm this is intended.

print("before removing:", dataset.shape)

dataset = dataset.loc[~dataset['review_text'].str.contains("Early Access Review")]

print("after removing:", dataset.shape)

before removing: (6226728, 5)
after removing: (5238690, 5)


There are 988038 rows that contain only "Early Access Review", which is not helpful to the analysis

In [None]:
# Drop reviews that contain ♥ (the character stands in for foul language),
# then print the shape after and before the filter for comparison

dataset_noheart = dataset.loc[~dataset['review_text'].str.contains('♥')]
print(dataset_noheart.shape)
print(dataset.shape)

(4891928, 5)
(5238690, 5)


In [None]:
# Fraction of the cleaned rows to keep: the 15% described at the top of the notebook.
# `p` was never assigned anywhere, so this cell raised NameError on a fresh kernel.
p = 0.15

# sample randomly for rows
dataset_sampled = dataset_noheart.sample(frac=p, random_state=13)  # fixed random_state -> same rows on every run

dataset_sampled

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
4994864,383870,Firewatch,"Honestly, I sat here for a few minutes to ques...",1,0
4540004,348250,Google Earth VR,Words can not describe how awesome experience ...,1,0
1656212,22610,Alien Breed: Impact,is fun to play if you know others who woulod p...,1,0
2841416,252950,Rocket League,Took away all my items after not playing for 2...,-1,0
4922442,379720,DOOM,I can play this with a really bad CPU (A10-580...,1,0
...,...,...,...,...,...
931947,212070,Star Conflict,Skrim in S P ace. Rift support.,1,0
2209901,239160,Thief,There once was a time when I liked this game. ...,-1,0
2222841,239820,Game Dev Tycoon,This is one of my all time favorite games. I p...,1,0
5910051,506610,Five Nights at Freddy's: Sister Location,This game is amazing!! The only glitch I exper...,1,0


In [None]:
# Keep each row's original label as `dataset_index`, order the sample by it,
# and renumber the rows from 0 (used when creating the dataset)
dataset_sampled = (
    dataset_sampled
    .reset_index()
    .rename(columns={'index': 'dataset_index'})
    .sort_values(by=['dataset_index'])
    .reset_index(drop=True)
)

dataset_sampled

Unnamed: 0,dataset_index,app_id,app_name,review_text,review_score,review_votes
0,0,10,Counter-Strike,Ruined my life.,1,0
1,7,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1
2,10,10,Counter-Strike,Fire in the Hole Simulator 1999,1,0
3,17,10,Counter-Strike,"When you crouch, you lift your feet in the air...",1,1
4,46,10,Counter-Strike,With only ~4000 hours of playtime (plus a few ...,1,1
...,...,...,...,...,...,...
733784,6417083,99910,Puzzle Pirates,really boring game,-1,0
733785,6417089,99910,Puzzle Pirates,"Rating based on current state of play, as per ...",-1,0
733786,6417098,99910,Puzzle Pirates,This game is good but also horrible. Its fun t...,-1,0
733787,6417100,99910,Puzzle Pirates,This game is good but also horrible. Its fun t...,-1,0


In [14]:
# save the modified dataset as pickle
# import datetime


# filename = f'dataset_cleaned_heartless_sampled_{datetime.datetime.now().strftime("%Y%m%d")}.pkl'
# save_path = Path(filename).resolve()
# dataset_sampled.to_pickle(save_path)
# print(f"Saved to: {save_path}")

Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/hkuchatgpt_bot/dataset_cleaned_heartless_sampled_20230927.pkl


### Optional

Saving the cleaned, heartless dataset to csv and pickle.

In [12]:
# optional

# save the cleaned dataset
# reset_index() moves the original row labels into a regular 'index' column so they survive the export
dataset_noheart_tosave = dataset_noheart.reset_index()


# The file name is resolved against the current working directory
filename = 'dataset_cleaned_heartless.csv'
save_path = Path(filename).resolve()
# to_csv keeps its default index=True, so the new 0..n-1 row index is written as an extra unnamed column
dataset_noheart_tosave.to_csv(save_path)

print(f"Saved to: {save_path}")

Saved to: dataset_cleaned_heartless.csv


In [15]:
# Store the same frame as a pickle as well; the path is resolved against the current working directory
filename_pkl = 'dataset_cleaned_heartless.pkl'
save_path_pkl = Path(filename_pkl).resolve()
dataset_noheart_tosave.to_pickle(save_path_pkl)

print(f"Saved to: {save_path_pkl}")

Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/hkuchatgpt_bot/dataset_cleaned_heartless.pkl


------

In [1]:
# here we find the pickle files with processed ChatGPT rows

from pathlib import Path
import pandas as pd

# Pickles that hold the rows ChatGPT has already processed (paths relative to the working directory)
processed_f_rpaths = [
    "dataset_cleaned_heartless_sampled_20230927/dataset_cleaned_heartless_sampled_20230927_joincleanedchunks_20231009.pkl",
    "dataset_cleaned_heartless_sampled_20231028/dataset_cleaned_heartless_sampled_20231028_joincleanedchunks_20231129.pkl"
]

# NOTE: unpickling can execute arbitrary code - only load pickles produced by this project
processed_df = [pd.read_pickle(Path(rpath).resolve()) for rpath in processed_f_rpaths]

# Stack all processed frames and renumber the rows from 0
dataset_processed = pd.concat(processed_df).reset_index(drop=True)

In [2]:
dataset_processed

Unnamed: 0,dataset_index,app_id,app_name,review_text,review_score,review_votes,response,total_token_used
0,949538,212680,FTL: Faster Than Light,"Amazing Game! Great Gameplay, and Great Re-pla...",1,0,"{\n ""positive"": 1.0,\n ""neutral"": 0.0,\n ""n...",150
1,417579,19680,Alice: Madness Returns,This game is fantastic! It is both fun to pla...,1,0,"{\n ""positive"": 1.0,\n ""neutral"": 0.0,\n ""n...",195
2,2140029,238460,BattleBlock Theater,"Dripping with fun, oozing with style, practica...",1,0,"{""positive"": 1.0, ""neutral"": 0.0, ""negative"": ...",155
3,491994,201810,Wolfenstein: The New Order,Great story-driven game - one of the better st...,1,0,"{""positive"": 1.0, ""neutral"": 0.0, ""negative"": ...",149
4,3303632,272010,Aveyond 3-1: Lord of Twilight,Haven't played much of it yet but it seems lik...,-1,0,"{""positive"": 0.0, ""neutral"": 0.0, ""negative"": ...",260
...,...,...,...,...,...,...,...,...
13525,3379127,275850,No Man's Sky,I bought this game now im sad.,-1,1,"{""positive"": 0, ""neutral"": 0, ""negative"": 1}",119
13526,2894780,253710,theHunter Classic,The worst kind of Free to Play I've ever seen....,-1,0,"{""positive"": 0.0, ""neutral"": 0.0, ""negative"": ...",201
13527,4007720,311340,METAL GEAR SOLID V: GROUND ZEROES,"They took the prologue of the Phantom pain, re...",-1,1,"{""positive"": 0.1, ""neutral"": 0.4, ""negative"": ...",412
13528,1316956,220260,Farming Simulator 2013,i wished it would work,-1,0,"{""positive"": 0.0, ""neutral"": 0.5, ""negative"": ...",126


In [3]:
# distribution of positive and negative comments (original labels):
# count the rows per label by summing a boolean mask

print('Positive commnents:', int((dataset_processed['review_score'] == 1).sum()))
print('Negative commnents:', int((dataset_processed['review_score'] == -1).sum()))

Positive commnents: 5950
Negative commnents: 7580


Create portions of the dataset, with 3000 rows per portion.

In [6]:
import pandas as pd
from pathlib import Path
import random

# Load the cleaned, heart-free dataset pickle.
# The stem of this path is reused further down to name the chunk output folders.
dataset_cleaned_path = Path("../dev-workspace/dataset/sa/dataset_cleaned_heartless.pkl")
dataset_cleaned = pd.read_pickle(dataset_cleaned_path)

dataset_cleaned

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes
0,0,10,Counter-Strike,Ruined my life.,1,0
1,1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,2,10,Counter-Strike,This game saved my virginity.,1,0
3,3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,4,10,Counter-Strike,"Easy to learn, hard to master.",1,1
...,...,...,...,...,...,...
4891923,6417101,99910,Puzzle Pirates,I really ove this game but it needs somethings...,-1,0
4891924,6417102,99910,Puzzle Pirates,"Used to play Puzzel Pirates 'way back when', b...",-1,0
4891925,6417103,99910,Puzzle Pirates,"This game was aright, though a bit annoying. W...",-1,0
4891926,6417104,99910,Puzzle Pirates,"I had a nice review to recommend this game, bu...",-1,0


In [7]:
# Keep only the rows whose original index does not appear in the already-processed datasets
dataset_cleaned_unprocessed = dataset_cleaned.loc[
    ~dataset_cleaned['index'].isin(dataset_processed['dataset_index'])
]

dataset_cleaned_unprocessed

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes
0,0,10,Counter-Strike,Ruined my life.,1,0
1,1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,2,10,Counter-Strike,This game saved my virginity.,1,0
3,3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,4,10,Counter-Strike,"Easy to learn, hard to master.",1,1
...,...,...,...,...,...,...
4891923,6417101,99910,Puzzle Pirates,I really ove this game but it needs somethings...,-1,0
4891924,6417102,99910,Puzzle Pirates,"Used to play Puzzel Pirates 'way back when', b...",-1,0
4891925,6417103,99910,Puzzle Pirates,"This game was aright, though a bit annoying. W...",-1,0
4891926,6417104,99910,Puzzle Pirates,"I had a nice review to recommend this game, bu...",-1,0


In [21]:
# optional (only negative comments)
# dataset_cleaned_unprocessed = dataset_cleaned_unprocessed[dataset_cleaned_unprocessed['review_score'] == -1]

# len(dataset_cleaned_unprocessed)

779885

In [22]:
# Shuffle the rows: sampling with frac=1 returns every row in random order.

dataset_cleaned_suffled = dataset_cleaned_unprocessed.sample(frac=1, random_state=42)   # fixed random_state for reproducibility
dataset_cleaned_suffled

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes
3578009,4753211,364360,Total War: WARHAMMER,Bought the base game at 25% off sale. No DLCs....,-1,1
1417392,1779512,230190,War for the Overworld,"it's a fun game, and a really good follow-up f...",-1,0
4768597,6281593,8000,Tomb Raider: Anniversary,6/10 Horrible PC Port Clipping/glitching/fal...,-1,0
3354944,4438017,340460,Spartans Vs Zombies Defense,In endless mode is very stupid,-1,0
895169,1051934,216150,MapleStory,MapleStory is that one game you're probably ne...,-1,1
...,...,...,...,...,...,...
1846268,2300084,241930,Middle-earth™: Shadow of Mordor™,"Gameplay is super boring, mindless attack butt...",-1,1
2518729,3298407,271590,Grand Theft Auto V,s h i t game dont get it it doesnt deserve the...,-1,0
936815,1098709,218620,PAYDAY 2,"The game is updating all the time, so it can't...",-1,0
4104612,5420433,420,Half-Life 2: Episode Two,Cliffhanger ending no episode 3 or HL 3. Unti...,-1,1


In [23]:
chunk_size = 3000

def slice_dataframe(df, chunk_size, N=-1):
    '''Yield consecutive row slices of ``df``, each ``chunk_size`` rows long.

    params:
    df: the large dataframe (anything that supports ``len(df)`` and ``df[start:end]``)
    chunk_size: number of rows in each smaller chunk; must be a positive integer
    N: number of chunks to be created. When ``0 < N <= len(df) // chunk_size``,
       exactly N full chunks are yielded and every later row is dropped.
       Any other value (including the default -1) yields all full chunks,
       followed by one shorter chunk holding the leftover rows, if there are any.

    yields:
    slices of ``df`` in their original order

    raises:
    ValueError: if ``chunk_size`` is not positive (raised when iteration starts)
    '''
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    num_rows = len(df)
    num_chunks = num_rows // chunk_size
    # Rows left over once the full chunks are taken. This used to be computed with
    # `//`, which emitted an empty trailing chunk whenever num_rows was an exact
    # multiple of chunk_size and dropped every row when num_rows < chunk_size.
    remaining_rows = num_rows % chunk_size

    if 0 < N <= num_chunks:
        # valid value: create only the requested number of full chunks, drop the rest
        for i in range(N):
            start = i * chunk_size
            yield df[start:start + chunk_size]
        return

    # create num_chunks full chunks ...
    for i in range(num_chunks):
        start = i * chunk_size
        yield df[start:start + chunk_size]

    # ... plus a last, shorter chunk when rows are left over
    if remaining_rows > 0:
        yield df[num_chunks * chunk_size:]

# sliced_dfs = [df for df in slice_dataframe(dataset_cleaned_suffled, chunk_size)]
# Take the first 3 chunks and renumber each chunk's rows from 0 (for HKU ChatGPT)
sliced_dfs = [
    chunk.reset_index(drop=True)
    for chunk in slice_dataframe(dataset_cleaned_suffled, chunk_size, 3)
]


In [24]:
len(sliced_dfs)

3

In [25]:
sliced_dfs[1]

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes
0,994303,214420,Gear Up,Matchmaking is bad you get put with people way...,-1,0
1,5921434,513000,Kuboom,A pay-to-win piece of trash. Did I just waste ...,-1,0
2,1586410,22380,Fallout: New Vegas,"I love this game, but it's so full of bugs you...",-1,0
3,1769573,228880,Ashes of the Singularity: Classic,"This game is either to easy or to hard, no cha...",-1,0
4,416823,1930,Two Worlds: Epic Edition,If you have Skyrim dont waste your time here,-1,0
...,...,...,...,...,...,...
2995,4586933,351800,Dev Guy,this game isnt fun or entertaining.,-1,1
2996,4038944,313010,Cities XXL,This 'game' is the advertising campaign that P...,-1,0
2997,3240464,270850,Car Mechanic Simulator 2014,"Terrible game , not worth 1$ , extreemly borin...",-1,0
2998,5998291,550,Left 4 Dead 2,Fun to play with friends,-1,0


In [30]:
from datetime import datetime


# Output folder named after the cleaned pickle plus today's date:
# <pickle stem>_sampled_<YYYYMMDD>
chunk_folder = Path(dataset_cleaned_path.stem + '_sampled_' + datetime.now().strftime("%Y%m%d"))

# exist_ok=True replaces the separate exists() check, so running the cell again
# on the same day reuses the folder instead of depending on a check-then-create sequence
chunk_folder.mkdir(exist_ok=True)

# One pickle per chunk, numbered with three digits (chunk_000, chunk_001, ...)
for i, sliced_df in enumerate(sliced_dfs):
    filename = f'{chunk_folder.stem}_chunk_{i:03}.pkl'
    save_path = chunk_folder.joinpath(filename).resolve()
    sliced_df.to_pickle(save_path)
    print(f"Saved to: {save_path}")

Saved to: C:\Users\uset\Documents\MyDocs\HKU\COMP4801\FYP\NLP\sa_hkuchatgpt\dataset_cleaned_heartless_sampled_20231028\dataset_cleaned_heartless_sampled_20231028_chunk_000.pkl
Saved to: C:\Users\uset\Documents\MyDocs\HKU\COMP4801\FYP\NLP\sa_hkuchatgpt\dataset_cleaned_heartless_sampled_20231028\dataset_cleaned_heartless_sampled_20231028_chunk_001.pkl
Saved to: C:\Users\uset\Documents\MyDocs\HKU\COMP4801\FYP\NLP\sa_hkuchatgpt\dataset_cleaned_heartless_sampled_20231028\dataset_cleaned_heartless_sampled_20231028_chunk_002.pkl


---

29/11/2023 update

In [34]:
# preprocessing

# Count the whitespace-separated words of every review.
# assign() returns a new frame, so the column is no longer written onto a filtered
# slice of dataset_cleaned (the old item assignment triggered SettingWithCopyWarning).
dataset_cleaned_unprocessed = dataset_cleaned_unprocessed.assign(
    num_of_words=dataset_cleaned_unprocessed['review_text'].apply(lambda x: len(str(x).split()))
)
# Drop reviews with no words at all (empty or whitespace-only text)
dataset_cleaned_unprocessed = dataset_cleaned_unprocessed[dataset_cleaned_unprocessed['num_of_words'] > 0]

# at least 10 characters to be more meaningful :D
# or 5 words ??
dataset_cleaned_unprocessed = dataset_cleaned_unprocessed[dataset_cleaned_unprocessed['review_text'].str.len() >= 10]

In [35]:
dataset_cleaned_unprocessed.head(10)

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,num_of_words
0,0,10,Counter-Strike,Ruined my life.,1,0,3
1,1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1,449
2,2,10,Counter-Strike,This game saved my virginity.,1,0,5
3,3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0,47
4,4,10,Counter-Strike,"Easy to learn, hard to master.",1,1,6
5,5,10,Counter-Strike,"No r8 revolver, 10/10 will play again.",1,1,7
6,6,10,Counter-Strike,Still better than Call of Duty: Ghosts...,1,1,7
7,7,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1,16
8,8,10,Counter-Strike,"Counter-Strike: Ok, after 9 years of unlimited...",1,1,174
9,9,10,Counter-Strike,Every server is spanish or french. I can now f...,1,0,15


In [36]:
dataset_cleaned_unprocessed

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,num_of_words
0,0,10,Counter-Strike,Ruined my life.,1,0,3
1,1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1,449
2,2,10,Counter-Strike,This game saved my virginity.,1,0,5
3,3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0,47
4,4,10,Counter-Strike,"Easy to learn, hard to master.",1,1,6
...,...,...,...,...,...,...,...
4891923,6417101,99910,Puzzle Pirates,I really ove this game but it needs somethings...,-1,0,104
4891924,6417102,99910,Puzzle Pirates,"Used to play Puzzel Pirates 'way back when', b...",-1,0,86
4891925,6417103,99910,Puzzle Pirates,"This game was aright, though a bit annoying. W...",-1,0,57
4891926,6417104,99910,Puzzle Pirates,"I had a nice review to recommend this game, bu...",-1,0,62


In [37]:
# The helper word-count column is no longer needed.
# Reassigning (instead of inplace=True) avoids mutating a filtered slice in place, and
# errors='ignore' lets the cell be run again after the column is already gone.
dataset_cleaned_unprocessed = dataset_cleaned_unprocessed.drop(columns=['num_of_words'], errors='ignore')

dataset_cleaned_unprocessed

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes
0,0,10,Counter-Strike,Ruined my life.,1,0
1,1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,2,10,Counter-Strike,This game saved my virginity.,1,0
3,3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,4,10,Counter-Strike,"Easy to learn, hard to master.",1,1
...,...,...,...,...,...,...
4891923,6417101,99910,Puzzle Pirates,I really ove this game but it needs somethings...,-1,0
4891924,6417102,99910,Puzzle Pirates,"Used to play Puzzel Pirates 'way back when', b...",-1,0
4891925,6417103,99910,Puzzle Pirates,"This game was aright, though a bit annoying. W...",-1,0
4891926,6417104,99910,Puzzle Pirates,"I had a nice review to recommend this game, bu...",-1,0


In [38]:
# optional
# create 3000 * 3 datasets such that each sub dataset contains an equal number of positive and negative comments

# Split the rows by label and shuffle each half with its own fixed seed
dataset_cleaned_unprocessed_pos = (
    dataset_cleaned_unprocessed
    .loc[dataset_cleaned_unprocessed['review_score'] == 1]
    .sample(frac=1, random_state=13)
)
dataset_cleaned_unprocessed_neg = (
    dataset_cleaned_unprocessed
    .loc[dataset_cleaned_unprocessed['review_score'] == -1]
    .sample(frac=1, random_state=42)
)

In [39]:
chunk_size = 3000 // 2

# NOTE: this redefines the slice_dataframe helper from the earlier chunking cell.
def slice_dataframe(df, chunk_size, N=-1):
    '''Yield consecutive row slices of ``df``, each ``chunk_size`` rows long.

    params:
    df: the large dataframe (anything that supports ``len(df)`` and ``df[start:end]``)
    chunk_size: number of rows in each smaller chunk; must be a positive integer
    N: number of chunks to be created. When ``0 < N <= len(df) // chunk_size``,
       exactly N full chunks are yielded and every later row is dropped.
       Any other value (including the default -1) yields all full chunks,
       followed by one shorter chunk holding the leftover rows, if there are any.

    yields:
    slices of ``df`` in their original order

    raises:
    ValueError: if ``chunk_size`` is not positive (raised when iteration starts)
    '''
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    num_rows = len(df)
    num_chunks = num_rows // chunk_size
    # Rows left over once the full chunks are taken. This used to be computed with
    # `//`, which emitted an empty trailing chunk whenever num_rows was an exact
    # multiple of chunk_size and dropped every row when num_rows < chunk_size.
    remaining_rows = num_rows % chunk_size

    if 0 < N <= num_chunks:
        # valid value: create only the requested number of full chunks, drop the rest
        for i in range(N):
            start = i * chunk_size
            yield df[start:start + chunk_size]
        return

    # create num_chunks full chunks ...
    for i in range(num_chunks):
        start = i * chunk_size
        yield df[start:start + chunk_size]

    # ... plus a last, shorter chunk when rows are left over
    if remaining_rows > 0:
        yield df[num_chunks * chunk_size:]

# sliced_dfs = [df for df in slice_dataframe(dataset_cleaned_suffled, chunk_size)]
# positive comments: first 3 chunks, each renumbered from 0 (for HKU ChatGPT)
sliced_dfs_pos = [
    chunk.reset_index(drop=True)
    for chunk in slice_dataframe(dataset_cleaned_unprocessed_pos, chunk_size, 3)
]

# negative comments: first 3 chunks, each renumbered from 0 (for HKU ChatGPT)
sliced_dfs_neg = [
    chunk.reset_index(drop=True)
    for chunk in slice_dataframe(dataset_cleaned_unprocessed_neg, chunk_size, 3)
]

In [40]:
len(sliced_dfs_neg)

3

In [43]:
# Pair the i-th positive chunk with the i-th negative chunk and mix their rows

sliced_dfs = []

for chunk_no, (df_pos, df_neg) in enumerate(zip(sliced_dfs_pos, sliced_dfs_neg)):
    df = (
        pd.concat([df_pos, df_neg])
        # A fixed seed per chunk makes the row order reproducible; the previous
        # unseeded sample() gave a different order on every run.
        .sample(frac=1, random_state=13 + chunk_no)
        # expose the original row label under the name used by the processed datasets
        .rename(columns={'index': 'dataset_index'})
        .reset_index(drop=True)
    )

    sliced_dfs.append(df)

In [44]:
sliced_dfs[0]

Unnamed: 0,dataset_index,app_id,app_name,review_text,review_score,review_votes
0,4121377,319510,Five Nights at Freddy's,Very fun game if you enjoy horror games. At fi...,1,0
1,1985382,234140,Mad Max,What the game lacks in depth of story and clun...,1,0
2,3924843,306660,Ultimate General: Gettysburg,No matter what strategy you choose it always s...,-1,1
3,5008454,38410,Fallout 2,"'Fallout? I'll make my own Fallout, with black...",1,1
4,5562984,438040,Shakes and Fidget,"This is a fun little game, fun idea, but unfor...",-1,1
...,...,...,...,...,...,...
2995,6247807,71250,Sonic Adventure DX,One word. UNPLAYABLE!! Were do I begin? At the...,-1,0
2996,5189334,394230,Battleborn,I get so bored waiting for the Matchfinding Qu...,-1,0
2997,3602969,291480,Warface,Would not launch. Reinstalled and would not la...,-1,0
2998,4536993,347670,Karate Master 2 Knock Down Blow,Fighting was cool but the rest of the game fel...,-1,0


In [45]:
from datetime import datetime


# Output folder named after the cleaned pickle plus today's date:
# <pickle stem>_sampled_<YYYYMMDD>
chunk_folder = Path(dataset_cleaned_path.stem + '_sampled_' + datetime.now().strftime("%Y%m%d"))

# exist_ok=True replaces the separate exists() check, so running the cell again
# on the same day reuses the folder instead of depending on a check-then-create sequence
chunk_folder.mkdir(exist_ok=True)

# One pickle per chunk, numbered with three digits (chunk_000, chunk_001, ...)
for i, sliced_df in enumerate(sliced_dfs):
    filename = f'{chunk_folder.stem}_chunk_{i:03}.pkl'
    save_path = chunk_folder.joinpath(filename).resolve()
    sliced_df.to_pickle(save_path)
    print(f"Saved to: {save_path}")

Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/FYP/NLP/sa_hkuchatgpt/dataset_cleaned_heartless_sampled_20231129/dataset_cleaned_heartless_sampled_20231129_chunk_000.pkl
Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/FYP/NLP/sa_hkuchatgpt/dataset_cleaned_heartless_sampled_20231129/dataset_cleaned_heartless_sampled_20231129_chunk_001.pkl
Saved to: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/FYP/NLP/sa_hkuchatgpt/dataset_cleaned_heartless_sampled_20231129/dataset_cleaned_heartless_sampled_20231129_chunk_002.pkl
