# Fine-tuning a classification model to determine review type
Mahan Madani - Mohammad Mehdi Begmaz

## Load Dataset and important libraries

In [4]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import pandas as pd
import numpy as np
import nltk
import torch

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

import evaluate
from evaluate import load

from pynvml import *

In [5]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [6]:
# Load the preprocessed version of the dataset and report its columns and shape.
dataset_path = "./dataset/BG3_reviews_more_negative.csv"
df = pd.read_csv(dataset_path)
for summary in (df.columns, df.shape):
    print(summary)

Index(['review', 'voted_up', 'votes_up', 'votes_funny', 'weighted_vote_score',
       'word_count', 'profanity'],
      dtype='object')
(10000, 7)


## Model

In [7]:
# Mapping between class ids and human-readable names; the reverse mapping is
# derived from the forward one so the two can never disagree.
id2label = {0: "Negative", 1: "Positive"}
label2id = {name: idx for idx, name in id2label.items()}

model_name = 'gpt2'

# Sequence-classification head with two labels on top of the pretrained checkpoint,
# moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Derive the integer training label from the `voted_up` column.
df = df.assign(label=df['voted_up'].astype(int))

In [9]:
# Show how many examples fall into each label value (class balance check).
df['label'].value_counts()

0    5000
1    5000
Name: label, dtype: int64

## Tokenization

In [10]:
# Reuse the end-of-sequence token as the padding token, and point the model
# config at the matching id so the model and tokenizer agree on padding.
# NOTE(review): presumably needed because the GPT-2 checkpoint ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [11]:
# Keep only the text and its label, then wrap the frame as a `Dataset`.
training_columns = ['review', 'label']
train_dataset = Dataset.from_pandas(df[training_columns])
train_dataset

Dataset({
    features: ['review', 'label'],
    num_rows: 10000
})

In [12]:
class TokenizerWrapper:
    """Holds a tokenizer and exposes a batch-tokenization function for `Dataset.map`.

    Args:
        tokenizer: Callable tokenizer; it is invoked as
            ``tokenizer(texts, max_length=..., truncation=True)``.
        max_length: Maximum length passed to the tokenizer (default 512).
        text_column: Key of the batch that holds the raw text (default "review").
    """

    def __init__(self, tokenizer, max_length=512, text_column="review"):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column

    def tokenize_function(self, examples):
        """Tokenize one batch of examples.

        Args:
            examples: Mapping that contains the text batch under `self.text_column`.

        Returns:
            Whatever the wrapped tokenizer returns for the batch.
        """
        # Set on every call so the setting is in place regardless of where
        # (or in which worker) the wrapped tokenizer ends up being used.
        self.tokenizer.truncation_side = "right"

        return self.tokenizer(
            examples[self.text_column],
            max_length=self.max_length,
            truncation=True,
        )

In [13]:
tokenizer_wrapper = TokenizerWrapper(tokenizer)

# Drop every original column except the label once tokenization has run.
# `list.remove` mutates in place and returns None, so the earlier
# `column_names.remove('label')` expression passed `remove_columns=None`
# and the raw `review` text was silently kept in the tokenized dataset.
columns_to_drop = [name for name in train_dataset.column_names if name != 'label']

tokenized_dataset = train_dataset.map(
    tokenizer_wrapper.tokenize_function,
    num_proc=4,
    remove_columns=columns_to_drop,
    batched=True)

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

## Finetune model

In [14]:
# Collator built around `tokenizer`; it is handed to the Trainer, where it pads
# the examples of each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

# define an evaluation function to pass into trainer later
def compute_metrics(p):
    """Compute classification accuracy from a (logits, labels) pair.

    Args:
        p: Two-element iterable ``(predictions, labels)``; `predictions` holds
            per-class scores with the classes along axis 1.

    Returns:
        The dict produced by the accuracy metric, i.e. ``{"accuracy": <float>}``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    # `accuracy.compute` already returns {"accuracy": value}; wrapping it in
    # another {"accuracy": ...} produced a nested dict instead of a scalar metric.
    return accuracy.compute(predictions=predictions, references=labels)

In [16]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of `model`, plus the trainable percentage.

    Args:
        model: Object exposing `named_parameters()`, yielding `(name, param)`
            pairs where each `param` has `numel()` and `requires_grad`.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


In [17]:
# LoRA adapter settings for sequence classification: rank-4 adapters on the
# `c_attn` modules only, with scaling alpha 32 and a small adapter dropout.
# NOTE(review): GPT-2's `c_attn` is reportedly a Conv1D layer, for which PEFT
# recommends `fan_in_fan_out=True` — confirm and consider setting it explicitly.
lora_settings = dict(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=['c_attn'],
)
config = LoraConfig(**lora_settings)

# Wrap the base model with the adapters and report how much of it is trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 148992 || all params: 124590336 || trainable%: 0.11958551905663052


In [18]:
# Training hyperparameters: learning rate, per-device batch size, number of epochs.
lr, batch_size, num_epochs = 1e-3, 4, 5

In [19]:
from transformers import Trainer, TrainingArguments

# Gather the run configuration in one place: checkpoints go to `output_dir`,
# the remaining values come from the hyperparameter cell.
training_options = dict(
    output_dir="./gpt2-lora-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
)
training_args = TrainingArguments(**training_options)

In [20]:
# Build the Trainer around the LoRA-wrapped model and the tokenized training set.
# NOTE(review): no `eval_dataset` is passed and the whole dataset is used for
# training, so `compute_metrics` presumably never runs during this fit — confirm
# and consider holding out a validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics
)

# train model
# The return value of `train()` is kept in `results` for later inspection.
results = trainer.train()

  0%|          | 0/12500 [00:00<?, ?it/s]

Checkpoint destination directory ./gpt2-lora-classification\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.5901, 'learning_rate': 0.00096, 'epoch': 0.2}
{'loss': 0.6402, 'learning_rate': 0.00092, 'epoch': 0.4}
{'loss': 0.489, 'learning_rate': 0.00088, 'epoch': 0.6}
{'loss': 0.4692, 'learning_rate': 0.00084, 'epoch': 0.8}
{'loss': 0.462, 'learning_rate': 0.0008, 'epoch': 1.0}
{'loss': 0.4135, 'learning_rate': 0.00076, 'epoch': 1.2}
{'loss': 0.4177, 'learning_rate': 0.0007199999999999999, 'epoch': 1.4}
{'loss': 0.4347, 'learning_rate': 0.00068, 'epoch': 1.6}
{'loss': 0.4525, 'learning_rate': 0.00064, 'epoch': 1.8}
{'loss': 0.4217, 'learning_rate': 0.0006, 'epoch': 2.0}
{'loss': 0.341, 'learning_rate': 0.0005600000000000001, 'epoch': 2.2}
{'loss': 0.3644, 'learning_rate': 0.0005200000000000001, 'epoch': 2.4}
{'loss': 0.3406, 'learning_rate': 0.00048, 'epoch': 2.6}
{'loss': 0.3325, 'learning_rate': 0.00044, 'epoch': 2.8}
{'loss': 0.3283, 'learning_rate': 0.0004, 'epoch': 3.0}
{'loss': 0.2729, 'learning_rate': 0.00035999999999999997, 'epoch': 3.2}
{'loss': 0.29, 'learning_rate': 0.000

In [21]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
save_dir = "./model/classification_v2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./model/classification_v2\\tokenizer_config.json',
 './model/classification_v2\\special_tokens_map.json',
 './model/classification_v2\\vocab.json',
 './model/classification_v2\\merges.txt',
 './model/classification_v2\\added_tokens.json',
 './model/classification_v2\\tokenizer.json')

## Classify Reviews

In [22]:
import transformers
from transformers.utils import logging

# Only let messages at ERROR level or above through the transformers logger.
error_level = transformers.logging.ERROR
logging.set_verbosity(error_level)

In [23]:
def classify(text):
    """Print the predicted label ("Negative" / "Positive") for one piece of text.

    Uses the notebook-level `tokenizer`, `model`, `device` and `id2label`.

    Args:
        text: The review text to classify.

    Side effects:
        Switches `model` to evaluation mode and prints the predicted label.
    """
    # Truncate to the same 512-token limit used when tokenizing the training
    # data, so very long texts cannot overflow the model's context window.
    inputs = tokenizer.encode(
        text, return_tensors="pt", truncation=True, max_length=512
    ).to(device)

    # The model is left in training mode after fine-tuning; without eval()
    # dropout stays active and predictions become non-deterministic.
    model.eval()
    with torch.no_grad():  # inference only: no gradient bookkeeping needed
        logits = model(inputs).logits

    # Arg-max over the class dimension (not over the flattened tensor).
    predicted_id = torch.argmax(logits, dim=-1).item()

    print(id2label[predicted_id])

In [24]:
# Load the previously saved text-generation model and its tokenizer from the same directory.
generative_model_dir = "./model/v3"
generative_model = AutoModelForCausalLM.from_pretrained(generative_model_dir)
generative_tokenizer = AutoTokenizer.from_pretrained(generative_model_dir)

In [25]:
# Sample a review from the generative model, decode it, and classify it.
# NOTE(review): the pad token id comes from the classifier's `tokenizer`, not
# from `generative_tokenizer` — confirm both share the same vocabulary.
sampling_options = dict(do_sample=True, top_k=50, top_p=0.95, max_new_tokens=200)
generated_ids = generative_model.generate(
    pad_token_id=tokenizer.pad_token_id,
    **sampling_options,
)
generated_text = generative_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)
classify(generated_text)

4/5 stars with no issues. a good story, strong characters, good choices, no problem with fights (like the original baldur's gate games i've used in the past). great replayability too!   if you're a fan of the divinity series, this game is probably good for you. the controls are well set, all of them intuitive, and the world is set in a real world environment! i don't remember my first play with this game, but i've spent a few hours through it.   still with the bugs, and some interesting twists (eg some turnbased combat which i think could make a good game), the only downside is that it is still early access (and you may get stuck sometimes), so if you are interested in early access (and still have any ideas) please look no further!  this game is well worth the price. i'm not really sure how to sum up the game's potential since we won't know much more
Positive


In [26]:
# define list of examples
text_list = ["It was good.", "just bad, not for me.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

# This cell runs after training, so these are the fine-tuned model's
# predictions (the earlier heading called them "Untrained").
heading = "Fine-tuned model predictions:"
print(heading)
print("-" * len(heading))

# Evaluation mode disables dropout (the model is left in training mode after
# fine-tuning); no_grad skips gradient bookkeeping during inference.
model.eval()
with torch.no_grad():
    for text in text_list:
        # tokenize text
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)
        # compute logits
        logits = model(inputs).logits
        # convert logits to label id (arg-max over the class dimension)
        predicted_id = torch.argmax(logits, dim=-1).item()

        print(text + " - " + id2label[predicted_id])

Untrained model predictions:
----------------------------
It was good. - Positive
just bad, not for me. - Negative
Better than the first one. - Negative
This is not worth watching even once. - Negative
This one is a pass. - Positive


In [33]:
# Collect the negative reviews (label 0) with a fresh 0..n-1 index and show the fourth one.
is_negative = df['label'] == 0
negative_df = df[is_negative].reset_index(drop=True)
negative_df.loc[3, 'review']

'this is early access... sure. it\'s also been in development long enough to be a full release and i don\'t see any of the major issues, namely gameplay mechanics, being fixed.  it\'s unintuitive.  for example, if you want all your players to jump to a thing, you have to select them all individually or they\'ll just stand at the spot you left them.  the grouping and ungrouping of teams is horrendous and as far as i\'ve been able to try, there is no way to select all.  it\'s a worse version of dragon age with worse graphics and worse gameplay.  i\'ve been having fun playing an mmo from 1994 "the realm online" than i have playing the 10 hours or so of this game.  there is little direction (which is ok if you consider a d&d setting), but the markers it gives you are just terrible.  another example of this is this guy who is trying to get his tieflings out of druid\'s grove...  he says, "meet me in the caves" and the marker shows the caves... yet he never moves.  you take damage from nothi

In [34]:
# Collect the positive reviews (label 1) with a fresh 0..n-1 index and show the fourth one.
is_positive = df['label'] == 1
positive_df = df[is_positive].reset_index(drop=True)
positive_df.loc[3, 'review']

'if you like crpgs, larian does a great job with this.  early access  unfinished and some bugs, dont listen to the haters that say this game sucks because it is unfinished, its early access.'

In [35]:
# Classify the sample negative review selected from `negative_df`; the expected printed label is "Negative".
classify(negative_df['review'][3])

Negative


In [36]:
# Classify the sample positive review selected from `positive_df`; the expected printed label is "Positive".
classify(positive_df['review'][3])

Positive
