Create a small evaluation dataset for measuring the difference in inference time

Run dataset_creation.ipynb first to create the cleaned dataset before running this notebook.

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

# cleaned review dataset (expected to have been produced by dataset_creation.ipynb)
dataset_cleaned_heartless_cleaned = Path("dataset_cleaned_heartless_cleaned.pkl")

# folder that receives the evaluation subset
output_folder = Path("eval_inference")

# exist_ok=True replaces the check-then-create pattern (no race, safe on re-run);
# parents=True keeps this working if the folder is ever nested deeper
output_folder.mkdir(parents=True, exist_ok=True)

# NOTE: unpickling can execute arbitrary code - only load a pickle file that
# was created locally / comes from a trusted source
df = pd.read_pickle(dataset_cleaned_heartless_cleaned)

df

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,num_of_words
1,1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1,449
2,2,10,Counter-Strike,This game saved my virginity.,1,0,5
3,3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0,47
4,4,10,Counter-Strike,"Easy to learn, hard to master.",1,1,6
5,5,10,Counter-Strike,"No r8 revolver, 10/10 will play again.",1,1,7
...,...,...,...,...,...,...,...
4891923,6417101,99910,Puzzle Pirates,I really ove this game but it needs somethings...,0,0,104
4891924,6417102,99910,Puzzle Pirates,"Used to play Puzzel Pirates 'way back when', b...",0,0,86
4891925,6417103,99910,Puzzle Pirates,"This game was aright, though a bit annoying. W...",0,0,57
4891926,6417104,99910,Puzzle Pirates,"I had a nice review to recommend this game, bu...",0,0,62


In [2]:
# pass the dataframe through the cleaning function; rows left with empty strings are removed in the next cell
# applies all cleaning functions (like in tfidf-rf)

import sys

# make the folder with the common utils script importable; the membership check
# keeps sys.path free of duplicate entries when this cell is re-run
if '../../sa' not in sys.path:
    sys.path.append('../../sa')

import str_cleaning_functions

# data cleaning function, same as in the training script

def cleaning_arr(str_arr):
    '''Apply every text-cleaning step to each string in ``str_arr``.

    Steps, in order: ``clean``, ``deEmojify``, lower-casing, ``remove_num``,
    ``remove_symbols``, ``remove_punctuation``, ``remove_stopword``,
    ``unify_whitespaces`` and ``stemming``. All steps except lower-casing are
    provided by ``str_cleaning_functions``.

    Parameters
    ----------
    str_arr : pandas.Series, numpy.ndarray or other sequence of str
        Texts to clean. Anything that is not already a Series is wrapped in
        one, because only a Series offers ``.apply``.

    Returns
    -------
    pandas.Series
        The cleaned strings; a Series input keeps its index.
    '''
    # the function is documented as accepting numpy arrays, but ndarray has
    # no .apply() - wrap non-Series input so that promise actually holds
    if not isinstance(str_arr, pd.Series):
        str_arr = pd.Series(str_arr)

    def _clean_one(text):
        # run the whole pipeline on one string, so the Series is traversed
        # once instead of nine times
        # (assumes the str_cleaning_functions helpers are pure per-string
        # functions - TODO confirm)
        text = str_cleaning_functions.clean(text)
        text = str_cleaning_functions.deEmojify(text)
        text = text.lower()
        text = str_cleaning_functions.remove_num(text)
        text = str_cleaning_functions.remove_symbols(text)
        text = str_cleaning_functions.remove_punctuation(text)
        text = str_cleaning_functions.remove_stopword(text)
        text = str_cleaning_functions.unify_whitespaces(text)
        text = str_cleaning_functions.stemming(text)
        return text

    return str_arr.apply(_clean_one)


# build a cleaned copy of the dataframe; the raw `df` itself is left untouched
df_copy = df.assign(review_text=cleaning_arr(df['review_text']))

In [3]:
# keep only the rows whose cleaned review still contains text
# (i.e. is not empty / whitespace-only after cleaning)

has_text = df_copy['review_text'].str.strip().ne('')
df_copy = df_copy.loc[has_text]

df_copy

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,num_of_words
1,1,10,Counter-Strike,experi game type review say thing like great g...,1,1,449
2,2,10,Counter-Strike,game save virgin,1,0,5
3,3,10,Counter-Strike,like origin game like game lag like game run l...,1,0,47
4,4,10,Counter-Strike,easi learn hard master,1,1,6
5,5,10,Counter-Strike,r revolv play,1,1,7
...,...,...,...,...,...,...,...
4891923,6417101,99910,Puzzle Pirates,realli ove game need someth basic puzzl mmo pa...,0,0,104
4891924,6417102,99910,Puzzle Pirates,use play puzzel pirat way back steam hard voya...,0,0,86
4891925,6417103,99910,Puzzle Pirates,game aright though bit annoy select puzzl woul...,0,0,57
4891926,6417104,99910,Puzzle Pirates,nice review recommend game know purchas anyth ...,0,0,62


In [7]:
# report how many rows were dropped because their cleaned review was empty
rows_before = len(df)
rows_after = len(df_copy)
print('Row reduced from {} to {}. {} of rows removed.'.format(rows_before, rows_after, rows_before - rows_after))

Row reduced from 4626526 to 4623761. 2765 of rows removed.


In [4]:
# draw a reproducible random sample of 3K rows for the inference-time evaluation

n_eval_rows = 3000   # size of the evaluation subset
sample_seed = 42     # fixed seed so the same rows are drawn on every run
df_copy_selected = df_copy.sample(n=n_eval_rows, random_state=sample_seed)

df_copy_selected

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,num_of_words
2257489,2967714,258700,Dragon Nest Europe,best mmorpg certain fun would recommend friend...,1,0,28
2729837,3552257,288470,Fable Anniversary,peopl say pratic origin updat charact model ca...,1,0,38
4263149,5695949,461780,Moonstone Tavern,game work proper mani peopl pc issu brought de...,0,0,39
4554839,6050685,570,Dota 2,sea server life final saw support reach k puta...,1,0,20
1477524,1847128,231160,The Swapper,expect puzzl game swapper much high reccomend ...,1,0,28
...,...,...,...,...,...,...,...
1261373,1560467,22380,Fallout: New Vegas,coupl year back bare play video game total cas...,1,0,202
3166644,4193294,322920,theHunter: Primal,earli access superb buggi yes howev ooz ton po...,0,1,73
562970,623866,204880,Sins of a Solar Empire: Rebellion,love sci fi rts especi involv spaceship planet...,1,0,20
4542786,6037936,570,Dota 2,best game would choos lol,1,1,7


In [5]:
# use the row labels (the DataFrame index, not the 'index' column) of the
# sampled cleaned rows to pull the same reviews from the uncleaned dataframe

selected_labels = df_copy_selected.index
df_selected = df.loc[selected_labels]

df_selected

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,num_of_words
2257489,2967714,258700,Dragon Nest Europe,It's not the best MMORPG out there but certain...,1,0,28
2729837,3552257,288470,Fable Anniversary,People are saying it's pratically the original...,1,0,38
4263149,5695949,461780,Moonstone Tavern,The game doesn't work properly on many people'...,0,0,39
4554839,6050685,570,Dota 2,The SEA server Life finally saw supports when ...,1,0,20
1477524,1847128,231160,The Swapper,I expected just a puzzle game but The Swapper ...,1,0,28
...,...,...,...,...,...,...,...
1261373,1560467,22380,Fallout: New Vegas,"Just a couple years back, I barely played vide...",1,0,202
3166644,4193294,322920,theHunter: Primal,"Early access was superb; buggy yes, however oo...",0,1,73
562970,623866,204880,Sins of a Solar Empire: Rebellion,"I love sci fi rts, especially if it involves s...",1,0,20
4542786,6037936,570,Dota 2,Best game 10/10 would choose over LOL,1,1,7


In [6]:
# write the uncleaned 3K evaluation subset to the output folder as a pickle

eval_pickle_path = output_folder / "dataset_cleaned_heartless_cleaned_3k_eval.pkl"
df_selected.to_pickle(eval_pickle_path)