In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from pathlib import Path
import random
import re

random.seed(13)

# full dataset from Kaggle
# dataset_path = Path('../../dataset/sa/dataset.csv').resolve()

# cleaned dataset
# dataset_heartless_path = Path('../../dataset/sa/dataset_cleaned_heartless.pkl').resolve()

# load with random selection
# dataset = pd.read_csv(dataset_path, skiprows=lambda i: i > 0 and random.random() > p)
# dataset.head()


# load cleaned dataset
# dataset = pd.read_pickle(dataset_heartless_path)
# dataset = dataset.sample(frac=p)
# dataset.info()

# copied from the first cell of eda.ipynb

dataset_heartless_path = Path('../../dataset/sa/dataset_cleaned_heartless.pkl').resolve()

dataset = pd.read_pickle(dataset_heartless_path)
# dataset = dataset.sample(frac=p)      # no sampling is needed

# convert the text to string object
dataset['review_text'] = dataset['review_text'].astype('str')

# drop any duplicate just in case
dataset = dataset.drop_duplicates(keep='first')

# replace -1 to 0
# then 0 = negative, 1 = positive
# for easier processing
dataset['review_score'] = dataset['review_score'].replace(-1, 0)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4891928 entries, 0 to 4891927
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
dtypes: int64(4), object(2)
memory usage: 223.9+ MB


In [2]:
# first do data cleaning

# convert to string
dataset['review_text'] = dataset['review_text'].astype('str')

In [3]:
dataset = dataset.drop_duplicates(keep='first')

remove some characters, but not do the stop words, stemming and lemmatizing (no doing them in Keras example ??)

In [4]:
def clean(raw):
    """ Remove hyperlinks and markup """
    result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
    result = re.sub('&gt;', "", result)
    result = re.sub('&#x27;', "'", result)
    result = re.sub('&quot;', '"', result)
    result = re.sub('&#x2F;', ' ', result)
    result = re.sub('<p>', ' ', result)
    result = re.sub('</i>', '', result)
    result = re.sub('&#62;', '', result)
    result = re.sub('<i>', ' ', result)
    result = re.sub("\n", '', result)
    return result

In [5]:
def remove_num(texts):
   output = re.sub(r'\d+', '', texts)
   return output

In [6]:
def deEmojify(x):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'', x)

In [7]:
def unify_whitespaces(x):
    cleaned_string = re.sub(' +', ' ', x)
    return cleaned_string 

In [8]:
def remove_symbols(x):
    cleaned_string = re.sub(r"[^a-zA-Z0-9?!.,]+", ' ', x)
    return cleaned_string

In [9]:
def remove_punctuation(text):
    final = "".join(u for u in text if u not in ("?", ".", ";", ":",  "!",'"',','))
    return final

In [10]:
def cleaning(df, review_col_name):
    df[review_col_name] = df[review_col_name].apply(clean)
    df[review_col_name] = df[review_col_name].apply(deEmojify)
    # df[review_col_name] = df[review_col_name].str.lower()
    # df[review_col_name] = df[review_col_name].apply(remove_num)
    # df[review_col_name] = df[review_col_name].apply(remove_symbols)
    # df[review_col_name] = df[review_col_name].apply(remove_punctuation)

Time for cleaning

4.8M rows, two functions -> 30.5 sec

In [11]:
cleaning(dataset, 'review_text')

dataset.head(20)

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes
0,0,10,Counter-Strike,Ruined my life.,1,0
1,1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,2,10,Counter-Strike,This game saved my virginity.,1,0
3,3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,4,10,Counter-Strike,"Easy to learn, hard to master.",1,1
5,5,10,Counter-Strike,"No r8 revolver, 10/10 will play again.",1,1
6,6,10,Counter-Strike,Still better than Call of Duty: Ghosts...,1,1
7,7,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1
8,8,10,Counter-Strike,"Counter-Strike: Ok, after 9 years of unlimited...",1,1
9,9,10,Counter-Strike,Every server is spanish or french. I can now f...,1,0


In [12]:
# remove rows have all whitespaces
dataset['num_of_words'] = dataset['review_text'].apply(lambda x:len(str(x).split()))
dataset = dataset[dataset['num_of_words'] > 0]

dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4891259 entries, 0 to 4891927
Data columns (total 7 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
 6   num_of_words  int64 
dtypes: int64(5), object(2)
memory usage: 298.5+ MB


In [13]:
temp = dataset.groupby('review_score').count()['review_text'].reset_index().sort_values(by='review_score',ascending=True)
temp.style.background_gradient(cmap='Purples')

Unnamed: 0,review_score,review_text
0,0,780927
1,1,4110332


Create training, testing and validation dataset

(Training + Testing) : Validation = 8:2

In [14]:
X = dataset['review_text']
y = dataset['review_score']

In [15]:
from sklearn.model_selection import train_test_split
X_train_test, X_valid, y_train_test, y_valid = train_test_split(X, y, random_state=13, test_size=0.2)

In [16]:
print(len(X_valid))
print(len(y_valid))
print(len(X_train_test))
print(len(y_train_test))

978252
978252
3913007
3913007


Then we create a balanced dataset for training and testing

In [17]:
from imblearn.under_sampling import RandomUnderSampler

# X_train, X_test, y_train, y_test

# oversampling = RandomOverSampler(sampling_strategy=0.5)     # raise the ratio minority_data:majority_data as 1 (i.e. equal number of samples)
under = RandomUnderSampler(sampling_strategy=1.0)          # then select ? of it

# X_train_resampled, y_train_resampled = oversampling.fit_resample(X_train.to_numpy().reshape(-1, 1), y_train.to_numpy().reshape(-1, 1))
# X_train_resampled, y_train_resampled = under.fit_resample(X_train_resampled, y_train_resampled)

X_train_test_resampled, y_train__test_resampled = under.fit_resample(X_train_test.to_numpy().reshape(-1, 1), y_train_test.to_numpy().reshape(-1, 1))

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_train_test_resampled, y_train__test_resampled, random_state=13, test_size=.1)

In [19]:
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

1125567
1125567
125063
125063


Hugging face stuff :D

The original shape of X_train does not match with the requirement (need a flattened 1-D array of strings)

In [20]:
print(X_train.shape)
print(y_train.shape)

(1125567, 1)
(1125567,)


In [21]:
# create a dataset object for handling large amount of data
from datasets import Dataset

ds_train = Dataset.from_dict({
    "text": [str(s) for s in list(X_train.flatten())],
    "label": list(y_train)
})

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_dataset(data):
    # Keys of the returned dictionary will be added to the dataset as columns
    return tokenizer(data["text"], max_length=tokenizer.model_max_length, truncation=True)

# the tokenizer only accept list of strings
# tokenized_data = tokenizer(ds_train['text'], return_tensors="np", padding=True)
# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
# tokenized_data = dict(tokenized_data)

# labels = np.array(dataset["label"])  # Label is already an array of 0 and 1

In [23]:
tokenizer.model_max_length 

512

In [24]:
ds_train

Dataset({
    features: ['text', 'label'],
    num_rows: 1125567
})

PyTorch Trainer API backend

Copy from https://huggingface.co/docs/transformers/training#train-with-pytorch-trainer

TODO: experiment on the TrainingArguments class, e.g. hyperparameters  
and also different parameters in 'Evaluate' library

Also do logging about the training loss for each epoch (read: https://discuss.huggingface.co/t/using-tensorboard-summarywriter-with-huggingface-trainerapi/23015/5 later)

In [25]:
def tokenize_dataset(data):
    # Keys of the returned dictionary will be added to the dataset as columns
    return tokenizer(data["text"], padding='max_length', max_length=tokenizer.model_max_length, truncation=True)

# apply tokenizer to the dataset
ds_train = ds_train.map(tokenize_dataset, batched=True)

Map: 100%|██████████| 1125567/1125567 [01:48<00:00, 10401.63 examples/s]


In [26]:
# selecting 1000 samples for testing the program flow

small_train_dataset = ds_train.shuffle(seed=42).select(range(1000))
small_eval_dataset = ds_train.shuffle(seed=13).select(range(1000))

In [27]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from transformers import TrainingArguments

# Specify where to save the checkpoints from your training:
training_args = TrainingArguments(output_dir="test_trainer")

In [29]:
import evaluate

metric = evaluate.load("accuracy")

2023-11-19 23:45:37.953564: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-19 23:45:37.970822: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-19 23:45:37.970846: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-19 23:45:37.970858: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-19 23:45:37.974766: I tensorflow/core/platform/cpu_feature_g

In [30]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [31]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [33]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.454304,0.817
2,No log,0.456626,0.83
3,No log,0.604056,0.836


TrainOutput(global_step=375, training_loss=0.4201847737630208, metrics={'train_runtime': 59.988, 'train_samples_per_second': 50.01, 'train_steps_per_second': 6.251, 'total_flos': 789333166080000.0, 'train_loss': 0.4201847737630208, 'epoch': 3.0})

In [35]:
trainer.evaluate()

{'eval_loss': 0.6040562987327576,
 'eval_accuracy': 0.836,
 'eval_runtime': 5.1809,
 'eval_samples_per_second': 193.016,
 'eval_steps_per_second': 24.127,
 'epoch': 3.0}

In [37]:
# save model
# save model only saves the tokenizer with the model
# https://huggingface.co/docs/transformers/v4.35.2/en/main_classes/trainer#transformers.Trainer

trainer.save_model("test_trainer_save_model_19-11-2023")

load the trained model from directory

https://discuss.huggingface.co/t/using-trainer-at-inference-time/9378/7

In [40]:
model_loaded = AutoModelForSequenceClassification.from_pretrained('test_trainer_save_model_19-11-2023')

trainer_test = Trainer(
    model = model_loaded
)

# can input TrainingArgument class to make custom evaluation on a testing dataset
# or without, then just predict
# need to form a dataset object, cannot accept string I guess ??
# if with a true label -> automatically perform evaluation
# else just predict

trainer_test.predict(small_eval_dataset)

PredictionOutput(predictions=array([[-2.994275 ,  2.559832 ],
       [ 1.9851098, -1.6966375],
       [ 2.7240963, -2.372472 ],
       ...,
       [ 2.7535512, -2.279858 ],
       [-2.5919158,  2.0749938],
       [-2.809758 ,  2.377848 ]], dtype=float32), label_ids=array([1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
    

why one positive one negative ??? The model itself ??

form a pipeline and test input/output

In [34]:
from transformers import pipeline

test_trainer_pipeline = pipeline(
    model=trainer,
    tokenizer=tokenizer
)

print(test_trainer_pipeline(
    "This is a good game."
))

print(test_trainer_pipeline(
    ["This is a good game.", "Bad graphics", "Game sucks !!"]
))

RuntimeError: Inferring the task automatically requires to check the hub with a model_id defined as a `str`. <transformers.trainer.Trainer object at 0x7f810af1d130> is not a valid model_id.

TODO: Save model (or pipeline)

In [None]:


test_trainer_pipeline.save_pretrained('model_test_trainer_pipeline_19-11-2023')

# trainer.save_model('model_test_trainer_13-11-2023')

TODO: Load pipeline (together with the encoder)


https://discuss.huggingface.co/t/how-to-load-a-pipeline-saved-with-pipeline-save-pretrained/5373

In [None]:
from transformers import pipeline

test_trainer_pipeline_loaded = pipeline(
    None,
    'model_test_trainer_pipeline_19-11-2023'
)

test_trainer_pipeline_loaded()

TF/Keras backend

In [None]:
ds_train = ds_train.map(tokenize_dataset)

In [None]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers.legacy import Adam

# Load and compile our model
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")

# prepare tf dataset
tf_dataset = model.prepare_tf_dataset(ds_train, batch_size=16, shuffle=True, tokenizer=tokenizer)

# Lower learning rates are often better for fine-tuning transformers
model.compile(optimizer=Adam(3e-5), metrics=['accuracy'])  # No loss argument!

model_history = model.fit(tf_dataset)

2023-11-13 23:50:04.841218: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-13 23:50:04.862347: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-13 23:50:04.862379: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-13 23:50:04.862396: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-13 23:50:04.866986: I tensorflow/core/platform/cpu_feature_g

  7436/125063 [>.............................] - ETA: 3:24:53 - loss: 0.2965 - accuracy: 0.8741

KeyboardInterrupt: 

Save the model

In [None]:
from datetime import datetime

model.save_pretrained(f'bert-finetune-sa-gamereviews_{datetime.now().strftime("%Y%m%d%H%M%S")}')

Load the model