Load the DataFrame from the latest pickle checkpoint file

In [1]:
import os
from pathlib import Path
import re
import pandas as pd

In [2]:
def check_for_checkpoint(dataset_folder:Path, base_file: Path):
    '''Find the newest checkpoint of ``base_file`` directly inside ``dataset_folder``.

    A checkpoint is a file whose full name is
    ``<base_file.stem>_ckpt_<digits><base_file.suffix>``,
    e.g. ``file_name_ckpt_123.pkl`` for a base file ``file_name.pkl``.
    Files that do not match this exact shape (checkpoints of other datasets,
    names without an index, ...) are ignored instead of raising.

    params:
    dataset_folder: the folder that contains a specific dataset and all of its
        checkpoints. Only its top level is scanned; sub-folders are not visited.
        A missing folder is treated as containing no checkpoints.
    base_file: the raw file without any checkpoint suffix.

    returns:
    (path, index): the checkpoint file with the largest index and that index
        as an int, or ``(base_file, 0)`` when no checkpoint is found.
    '''
    # os.walk yields the top level first (and nothing at all for a missing
    # folder), so taking a single entry keeps the scan one level deep.
    root, _, filenames = next(os.walk(dataset_folder), (dataset_folder, [], []))

    # Escape the stem/suffix so characters such as '.' are matched literally,
    # and demand at least one digit so int() can never receive ''.
    ckpt_pattern = re.compile(
        re.escape(base_file.stem) + r"_ckpt_([0-9]+)" + re.escape(base_file.suffix)
    )

    # (index, path) for every file that really is a checkpoint of base_file
    indexed_checkpoints = []
    for filename in filenames:
        matched = ckpt_pattern.fullmatch(filename)
        if matched:
            indexed_checkpoints.append((int(matched[1]), Path(root, filename)))

    if not indexed_checkpoints:
        return base_file, 0

    largest_index, largest_index_file = max(
        indexed_checkpoints, key=lambda pair: pair[0]
    )
    return largest_index_file, largest_index
    
def load_pickle(path_to_load:Path) -> pd.DataFrame:
    '''Read a pickled DataFrame from disk and report which file was used.

    params:
    path_to_load: location of the pickle file to read.

    returns:
    The object produced by ``pd.read_pickle`` for that file.
    '''
    # NOTE: unpickling executes code embedded in the file - only load pickles you trust.
    loaded_frame = pd.read_pickle(path_to_load)

    # blank separator first, then the confirmation message
    source_location = str(path_to_load)
    print('\n')
    print(f'Successfully loaded df from {source_location}')

    return loaded_frame

In [3]:
# Run folder, resolved to an absolute path relative to the current working directory.
dataset_folder = Path('run_20230927').resolve()
# Raw pickle for this run, i.e. the file name without any checkpoint suffix.
base_file = dataset_folder / 'dataset_noheart_20230927.pkl'

# Pick the checkpoint with the largest index; when the folder has none this
# yields base_file together with index 0.
df_filepath, curr_ckpt_index = check_for_checkpoint(dataset_folder, base_file)

In [4]:
# Load the file chosen by the checkpoint lookup (df_filepath).
df = load_pickle(df_filepath)

# End the cell with the expression itself so the notebook renders the first
# 30 rows as a rich table; print() falls back to wrapped plain text, which
# splits wide frames across several column groups.
df.head(30)



Successfully loaded df from /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/hkuchatgpt_bot/run_20230927/dataset_noheart_20230927_ckpt_003.pkl
    dataset_index  app_id                                     app_name  \
0          746006  208140          ENDLESS™ Space - Definitive Edition   
1          693423  206440                                  To the Moon   
2         4766237  365590                    Tom Clancy's The Division   
3         3029812  261720          Holy Avatar vs. Maidens of the Dead   
4         2634999  250340                                    Blockland   
5         2581679  248630                                Kingdoms Rise   
6         3560283  289070                  Sid Meier's Civilization VI   
7         2053284  236390                                  War Thunder   
8          435035  200210                   Realm of the Mad God Exalt   
9          524907  203140                           Hitman: Absolution   
10        1157369  218620           

In [6]:
# Inspect the rows at positions 50-69 (iloc slicing is positional and end-exclusive).
df.iloc[50:70]

Unnamed: 0,dataset_index,app_id,app_name,review_text,review_score,review_votes,response,total_token_used
50,1971365,233740,Organ Trail: Director's Cut,"Nabbing this, especially while on sale, is mos...",1,0,"{""positive"": 1.0, ""neutral"": 0.0, ""negative"": ...",249
51,4022172,312530,Duck Game,Last Edited 09 February 2016 Sco...,1,0,"{""positive"": 1.0, ""neutral"": 0.0, ""negative"": ...",149
52,5742735,47810,Dragon Age: Origins - Ultimate Edition,,1,0,,100
53,5423590,420100,CLANNAD Side Stories,So I sign into steam and much to my surprise I...,1,1,"{""positive"": 1.0, ""neutral"": 0.0, ""negative"": ...",1361
54,2402376,242920,Banished,Banished runs very smooth. The graphic is beau...,1,0,"{\n ""positive"": 0.8,\n ""neutral"": 0.1,\n ""n...",377
55,4766094,365590,Tom Clancy's The Division,The game was fun. It was fun completing the mi...,-1,0,"{""positive"": 0.1, ""neutral"": 0.4, ""negative"": ...",476
56,4680954,359550,Tom Clancy's Rainbow Six Siege,Rainbow Six Siege is definitley a great game! ...,1,0,"{""positive"": 0.9, ""neutral"": 0.1, ""negative"": 0}",347
57,1252922,219890,Antichamber,Excellent game. Even more satisfying without w...,1,0,"{""positive"": 1.0, ""neutral"": 0.0, ""negative"": ...",148
58,674667,206420,Saints Row IV,The game was AMAZING! Until I started getting ...,1,0,"{""positive"": 0.9, ""neutral"": 0.0, ""negative"": ...",132
59,5514462,43110,,"I don't normally recommend FPS games, but this...",1,0,"{\n ""positive"": 1.0,\n ""neutral"": 0.0,\n ""n...",171
