Create a small evaluation dataset for measuring differences in inference time

Run dataset_creation.ipynb first to create the cleaned dataset before running this notebook

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

# Cleaned dataset produced by dataset_creation.ipynb (path is relative to the
# current working directory).
dataset_cleaned_heartless_cleaned = Path("dataset_heartless_20240116.pkl")

# Folder that receives the evaluation subset written at the end of the notebook.
output_folder = Path("eval_inference")

# exist_ok=True creates the folder when it is missing and is a no-op when it is
# already there, without a separate check-then-create step.
output_folder.mkdir(exist_ok=True)

# NOTE: unpickling can execute arbitrary code; only load a pickle that was
# generated locally by the dataset-creation notebook.
df = pd.read_pickle(dataset_cleaned_heartless_cleaned)

df

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes
0,0,10,Counter-Strike,Ruined my life.,1,0
1,1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,2,10,Counter-Strike,This game saved my virginity.,1,0
3,3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,4,10,Counter-Strike,"Easy to learn, hard to master.",1,1
...,...,...,...,...,...,...
4180143,6417101,99910,Puzzle Pirates,I really ove this game but it needs somethings...,-1,0
4180144,6417102,99910,Puzzle Pirates,"Used to play Puzzel Pirates 'way back when', b...",-1,0
4180145,6417103,99910,Puzzle Pirates,"This game was aright, though a bit annoying. W...",-1,0
4180146,6417104,99910,Puzzle Pirates,"I had a nice review to recommend this game, bu...",-1,0


In [2]:
# Instead of processing every row of the dataset, draw 3K * 2 rows and clean
# only those, to save time. Drawing twice the final 3K leaves headroom for the
# rows that are dropped later because their text cleans down to an empty string.
# The fixed seed makes the draw repeatable on a fresh kernel run.

df = df.sample(n=3000 * 2, random_state=42)

In [3]:
# pass the review text through the cleaning function; rows whose text ends up
# empty are removed in the next cell
# apply all cleaning functions (like in tfidf-rf)

import sys

# setting path for common utils script
# NOTE: '../../sa' is a relative path, so it is resolved against the current
# working directory when the import below runs. The append is unconditional,
# so re-running this cell adds the same entry to sys.path again.
sys.path.append('../../sa')

# project-local module providing the individual text cleaning steps
import str_cleaning_functions

# data cleaning function, same as in the training script

# def cleaning_arr(str_arr):
#     '''apply all cleaning functions to a numpy array, or a pandas series object'''
#     str_arr = str_arr.apply(lambda x: str_cleaning_functions.remove_links(x))
#     str_arr = str_arr.apply(lambda x: str_cleaning_functions.remove_links2(x))
#     str_arr = str_arr.apply(lambda x: str_cleaning_functions.clean(x))
#     str_arr = str_arr.apply(lambda x: str_cleaning_functions.deEmojify(x))
#     str_arr = str_arr.apply(lambda x: str_cleaning_functions.remove_non_letters(x))
#     str_arr = str_arr.apply(lambda x: x.lower())
#     str_arr = str_arr.apply(lambda x: str_cleaning_functions.unify_whitespaces(x))
#     str_arr = str_arr.apply(lambda x: str_cleaning_functions.remove_stopword(x))
#     str_arr = str_arr.apply(lambda x: str_cleaning_functions.unify_whitespaces(x))
#     str_arr = str_arr.apply(lambda x: str_cleaning_functions.stemming(x))
#     str_arr = str_arr.apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

#     return str_arr


def cleaning_str(s):
    '''Run one string through the str_cleaning_functions pipeline and return it.

    Steps, in order: remove_links, remove_links2, clean, deEmojify,
    remove_non_letters, str.lower, unify_whitespaces, remove_stopword,
    unify_whitespaces, stemming, unify_whitespaces.
    '''
    scf = str_cleaning_functions

    # steps applied while the text still has its original casing
    before_lowercase = (
        scf.remove_links,
        scf.remove_links2,
        scf.clean,
        scf.deEmojify,
        scf.remove_non_letters,
    )
    # steps applied to the lowercased text; whitespace is re-unified after each
    after_lowercase = (
        scf.unify_whitespaces,
        scf.remove_stopword,
        scf.unify_whitespaces,
        scf.stemming,
        scf.unify_whitespaces,
    )

    text = s
    for step in before_lowercase:
        text = step(text)

    text = text.lower()

    for step in after_lowercase:
        text = step(text)

    return text

def cleaning_hg(data):
    '''Row-wise callback: clean data['review_text'] and return it under 'review_text2'.'''
    cleaned_text = cleaning_str(data['review_text'])
    return {'review_text2': cleaned_text}


# Hugging Face Dataset is used so the cleaning runs in parallel worker processes
from datasets import Dataset

# work on a copy so that `df` keeps the original, uncleaned review text
df_copy = df.copy()

# clean every review with cleaning_hg across 4 processes, then write the
# cleaned strings back into the copy's review_text column
temp_dataset = Dataset.from_dict(
    {'review_text': list(df['review_text'])}
).map(cleaning_hg, num_proc=4)
df_copy['review_text'] = temp_dataset['review_text2']
# df_copy = df_copy[df_copy['review_text'].apply(len) > 0]

# df_copy['review_text'] = cleaning_arr(df['review_text'])

  from .autonotebook import tqdm as notebook_tqdm
Map (num_proc=4): 100%|██████████| 6000/6000 [00:00<00:00, 7198.82 examples/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [4]:
# keep only the rows whose cleaned text is non-empty after stripping whitespace

has_text = df_copy['review_text'].str.strip().ne('')
df_copy = df_copy.loc[has_text]

df_copy

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes
3810662,5915475,51100,Tactical Intervention,mission great gameplay nice map good engin gam...,1,0
287113,357922,15120,Tom Clancy's Rainbow Six: Vegas 2,one rare gem got hi tech time face scan creat ...,1,0
4067236,6273547,78000,Bejeweled 3,alway help kill boredom,1,0
445574,554437,203510,Fortune Summoners: Secret of the Elemental Stone,game total moe desu make ha go kyu n otom soul...,1,1
3884472,6027878,570,Dota 2,play sniper went kill death play bounti hunter...,1,0
...,...,...,...,...,...,...
248395,313068,12210,Grand Theft Auto IV: The Complete Edition,work even got new pc good one still work wast ...,-1,0
205844,248212,113200,The Binding of Isaac,fantast fun ton item simpl yet challeng gamepl...,1,0
1250731,1830289,230410,Warframe,fun bit real endgam content grind grind bigger...,1,0
606795,781196,208750,Apotheon,found one best independ game play long time ar...,1,0


In [5]:
# report how many of the sampled rows were dropped for having empty cleaned text
rows_before = len(df)
rows_after = len(df_copy)
print('Row reduced from {} to {}. {} of rows removed.'.format(rows_before, rows_after, rows_before - rows_after))

Row reduced from 6000 to 5965. 35 of rows removed.


In [6]:
# draw the 3K-row subset used to evaluate inference time from the cleaned,
# non-empty rows; the fixed seed keeps the draw repeatable for a given df_copy

eval_size = 3000
df_copy_selected = df_copy.sample(n=eval_size, random_state=42)

df_copy_selected

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes
771613,1039320,215100,Ragnarok,great game,1,0
3214657,4993292,383870,Firewatch,review said game short someth face anoth probl...,-1,1
338242,423763,19930,The Settlers: Rise of an Empire Gold Edition,one best settler game enjoy lot,1,0
330170,414339,18500,Defense Grid: The Awakening,fun tower defens game worth price,1,0
1007356,1468948,222750,Wargame: AirLand Battle,came plane saw top bout dont show flip plane f...,-1,0
...,...,...,...,...,...,...
788945,1059684,217140,Rise of the Triad,rise triad game millennium,1,0
3998042,6189447,65300,Dustforce,trick becom bore play one level control wonki ...,-1,0
1204295,1772027,229600,Bientôt l'été,bought gift friend feel sorri friend suppos ar...,-1,1
1887241,2944219,256330,WRC 4 FIA WORLD RALLY CHAMPIONSHIP,never play onlin one play game anymor alreadi ...,-1,0


In [7]:
# use the row labels of the selected cleaned rows (the DataFrame index, not the
# `index` column) to pull the same rows from the uncleaned dataframe

selected_labels = df_copy_selected.index
df_selected = df.loc[selected_labels]

df_selected

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes
771613,1039320,215100,Ragnarok,great game,1,0
3214657,4993292,383870,Firewatch,All the other reviewers said the game was too ...,-1,1
338242,423763,19930,The Settlers: Rise of an Empire Gold Edition,One of the best Settlers games. Enjoy it a lot!,1,0
330170,414339,18500,Defense Grid: The Awakening,Fun tower defense game. Worth the price.,1,0
1007356,1468948,222750,Wargame: AirLand Battle,"I came here for planes, saw that its top down?...",-1,0
...,...,...,...,...,...,...
788945,1059684,217140,Rise of the Triad,Rise of the Triad is Game of the Millennium!,1,0
3998042,6189447,65300,Dustforce,Eh... all the tricks become boring and played ...,-1,0
1204295,1772027,229600,Bientôt l'été,i bought this for myself and gifted it to a fr...,-1,1
1887241,2944219,256330,WRC 4 FIA WORLD RALLY CHAMPIONSHIP,I never played online because no one plays thi...,-1,0


In [8]:
# write the selected uncleaned rows to a pickle file inside the output folder

eval_pickle_path = output_folder / "dataset_heartless_20240116_3k_eval.pkl"
df_selected.to_pickle(eval_pickle_path)