# Dataset Creation

(a template script for implementing each version of train/test/validation stuff)

This script is for creating balanced dataset for training/testing/validation purpose

It will change the label representing negative sentiment from -1 to 0 for easier training

It does not pre-process the data (e.g. stemming/removing symbols...)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from pathlib import Path
import random

from sklearn.model_selection import train_test_split

random.seed(13)

dataset_heartless_path = Path('dataset_cleaned_heartless.pkl').resolve()

dataset = pd.read_pickle(dataset_heartless_path)
# dataset = dataset.sample(frac=p)      # no sampling is needed

# convert the text to string object
dataset['review_text'] = dataset['review_text'].astype('str')

# drop any duplicate just in case
dataset = dataset.drop_duplicates(keep='first')

# replace -1 to 0
# then 0 = negative, 1 = positive
# for easier processing
dataset['review_score'] = dataset['review_score'].replace(-1, 0)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4891928 entries, 0 to 4891927
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
dtypes: int64(4), object(2)
memory usage: 223.9+ MB


In [2]:
dataset = dataset.drop_duplicates(keep='first')

In [7]:
# convert to string
dataset['review_text'] = dataset['review_text'].astype('str')

In [8]:
# remove rows have all whitespaces
dataset['num_of_words'] = dataset['review_text'].apply(lambda x:len(str(x).split()))
dataset = dataset[dataset['num_of_words'] > 0]

dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4891746 entries, 0 to 4891927
Data columns (total 7 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
 6   num_of_words  int64 
dtypes: int64(5), object(2)
memory usage: 298.6+ MB


In [9]:
# remove number of rows that have less than N number of characters

character_limit = 20

dataset = dataset[dataset['review_text'].str.len()>=character_limit]

In [10]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4626526 entries, 1 to 4891927
Data columns (total 7 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
 6   num_of_words  int64 
dtypes: int64(5), object(2)
memory usage: 282.4+ MB


In [11]:
dataset['num_of_words'].describe(percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])

count    4.626526e+06
mean     6.523254e+01
std      1.098045e+02
min      1.000000e+00
1%       4.000000e+00
10%      7.000000e+00
25%      1.300000e+01
50%      2.900000e+01
75%      7.000000e+01
90%      1.550000e+02
99%      5.490000e+02
max      4.000000e+03
Name: num_of_words, dtype: float64

Train-test, validation split

In [12]:
X = dataset['review_text']
y = dataset['review_score']

In [13]:
validation_ratio = 0.2

X_train_test, X_valid, y_train_test, y_valid = train_test_split(X, y, random_state=42, test_size=validation_ratio)

In [14]:
print(len(X_valid))
print(len(y_valid))
print(len(X_train_test))
print(len(y_train_test))

925306
925306
3701220
3701220


In [24]:
X_valid

2493049                   I'm no expert, but the game is ok.
1827314    Big Map, Short Story, Waste money... DLC Wild ...
3119303    i was gonna leave my bedroom door open tonight...
648090     Review of Endless Space Pros: -Relatively easy...
3736585    These devs don't do anything but throw DLC out...
                                 ...                        
2605736    This game is like playing a movie. I was capti...
1394373    I didn't know this game was multi-player only....
3110643    WTF...i cant register on this game!? ....any o...
4407650                    It is the best coop game ever 8-)
2946958    I is confused....  To be honest I loved this g...
Name: review_text, Length: 925306, dtype: object

In [31]:
# try save validation set using joblib  (no need to do that)

list(zip(X_valid, y_valid))

[("I'm no expert, but the game is ok.", 1),
 ('Big Map, Short Story, Waste money... DLC Wild Run only added 3 new spec &amp; no use in Story Mode',
  0),
 ('i was gonna leave my bedroom door open tonight  now, i aint about that type of life.',
  1),
 ("Review of Endless Space Pros: -Relatively easy to learn compared to other 4X space games (i.e. stream-lined mechanics); tutorial itself is weak though so learning with friends help a lot -Great visuals -Good game length for 4X space game (ranges from 1 to 4 hours, depending on number of players, map size, and aggression of each player) -Diverse number of races and makes good use of the hero system -Gameplay itself is pretty good in terms of economy, expansion, and various aspects of 'empire building'  Cons: -Mediocre combat system; uses a combat card system, pretty hands-off but it's better than full auto-resolve or micro-managing every battle -Diplomacy system is rather simple -Mediocre AI (if you only play single player, then you'll ha

In [15]:
# distribution of +ve and -ve comments in train/test and valid
print('validation set')
print(y_valid.value_counts())
print()
print('train-test set')
print(y_train_test.value_counts())

validation set
review_score
1    774937
0    150369
Name: count, dtype: int64

train-test set
review_score
1    3098225
0     602995
Name: count, dtype: int64


Train, test split

Before spliting, we need to balance the number of positive and negative comments

In [17]:
from imblearn.under_sampling import RandomUnderSampler

# X_train, X_test, y_train, y_test

# oversampling = RandomOverSampler(sampling_strategy=0.5)     # raise the ratio minority_data:majority_data as 1 (i.e. equal number of samples)
under = RandomUnderSampler(sampling_strategy=1.0, random_state=13)          # then select ? of it

# X_train_resampled, y_train_resampled = oversampling.fit_resample(X_train.to_numpy().reshape(-1, 1), y_train.to_numpy().reshape(-1, 1))
# X_train_resampled, y_train_resampled = under.fit_resample(X_train_resampled, y_train_resampled)

X_train_test_resampled, y_train_test_resampled = under.fit_resample(X_train_test.to_numpy().reshape(-1, 1), y_train_test.to_numpy().reshape(-1, 1))

In [18]:
len(y_train_test_resampled)

1205990

In [19]:
test_ratio = 0.2

X_train, X_test, y_train, y_test = train_test_split(
    X_train_test_resampled,
    y_train_test_resampled, 
    random_state=13,
    test_size=test_ratio)

In [20]:
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

964792
964792
241198
241198


In [21]:
import collections

print('training set')
print(collections.Counter(y_train))
print()
print('testing set')
print(collections.Counter(y_test))

training set
Counter({0: 482670, 1: 482122})

testing set
Counter({1: 120873, 0: 120325})
