# Part 3 BERT

In [1]:
import pandas as pd
import gzip
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer, BertTokenizer, BertForSequenceClassification, DataCollatorWithPadding
import torch
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score

2024-05-25 02:59:25.799932: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-25 02:59:25.829918: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-25 02:59:25.829948: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-25 02:59:25.830837: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-25 02:59:25.836140: I tensorflow/core/platform/cpu_feature_guar

In [2]:
def read_gzip(path):
    reviews = []
    with gzip.open(path, 'rt', encoding='utf-8') as file:
        for line in file:
            review_data = json.loads(line)
            reviews.append(review_data)
    return reviews

In [3]:
reviews = read_gzip('data/Video_Games_5.json.gz')

In [4]:
for review in reviews[0:3]:
    print(review)

{'overall': 5.0, 'verified': True, 'reviewTime': '10 17, 2015', 'reviewerID': 'A1HP7NVNPFMA4N', 'asin': '0700026657', 'reviewerName': 'Ambrosia075', 'reviewText': "This game is a bit hard to get the hang of, but when you do it's great.", 'summary': "but when you do it's great.", 'unixReviewTime': 1445040000}
{'overall': 4.0, 'verified': False, 'reviewTime': '07 27, 2015', 'reviewerID': 'A1JGAP0185YJI6', 'asin': '0700026657', 'reviewerName': 'travis', 'reviewText': 'I played it a while but it was alright. The steam was a bit of trouble. The more they move these game to steam the more of a hard time I have activating and playing a game. But in spite of that it was fun, I liked it. Now I am looking forward to anno 2205 I really want to play my way to the moon.', 'summary': 'But in spite of that it was fun, I liked it', 'unixReviewTime': 1437955200}
{'overall': 3.0, 'verified': True, 'reviewTime': '02 23, 2015', 'reviewerID': 'A1YJWEXHQBWK2B', 'asin': '0700026657', 'reviewerName': 'Vincent

In [5]:
df = pd.DataFrame(reviews)

In [6]:
df = df.sample(n=10000)  

In [7]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
340479,5.0,True,"05 25, 2015",A18WBR1Q00WVY0,B00HGLLRV2,Sairis Cedano,My sister likes it!,Cool and interesting story...,1432512000,,{'Edition:': ' Standard'},
132053,5.0,True,"04 14, 2015",A3E09EY1I333MR,B0013OM528,12ealDeal,An excellent solution if you're capturing PS3 ...,Works as Intended,1428969600,,,
394635,5.0,True,"11 15, 2015",A8YYMBXZN2SEZ,B00VHWMK44,ShinMadman3rd,The novelty of these figures are great. They ...,The novelty of these figures are great. They a...,1447545600,,"{'Color:': ' Splatoon 3-pack', 'Edition:': ' U...",[https://images-na.ssl-images-amazon.com/image...
60356,5.0,True,"08 25, 2015",A2AOEN40IYMAXP,B0002XL3BA,POPPA,REAL GOOD,Five Stars,1440460800,,{'Format:': ' Video Game'},
16275,4.0,True,"01 1, 2014",A278KO9LMJAFSW,B00004TN1Z,gerald lawson,Look the header says it all the game came fast...,game cam fast and is good,1388534400,,,


In [8]:
df['overall'].value_counts()

overall
5.0    6021
4.0    1904
3.0     981
1.0     620
2.0     474
Name: count, dtype: int64

In [9]:
df = df[['reviewText', 'overall']]

In [10]:
df['sentiment'] = df['overall'].apply(lambda x: 'positive' if x > 3 else 'negative')
df['label'] = df['overall'].apply(lambda x: 1 if x > 3 else 0)

In [11]:
df['sentiment'].value_counts()

sentiment
positive    7925
negative    2075
Name: count, dtype: int64

In [12]:
df.isnull().sum()

reviewText    4
overall       0
sentiment     0
label         0
dtype: int64

In [13]:
df = df.dropna()

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Split data to features and target

In [15]:
df

Unnamed: 0,reviewText,overall,sentiment,label
340479,My sister likes it!,5.0,positive,1
132053,An excellent solution if you're capturing PS3 ...,5.0,positive,1
394635,The novelty of these figures are great. They ...,5.0,positive,1
60356,REAL GOOD,5.0,positive,1
16275,Look the header says it all the game came fast...,4.0,positive,1
...,...,...,...,...
101401,TUROK was a good game back then. But playing i...,3.0,negative,0
471495,Well i was forced to buy this Armor pack b/c p...,3.0,negative,0
200479,"Run. Run far away.\nTo be blunt, Last Rebellio...",2.0,negative,0
178535,"Cute look, but this disguises the fact they ma...",1.0,negative,0


In [16]:
feature = list(df['reviewText'])
target = list(df['label'])
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=42, stratify=target)

#### define max_length by max_length in our data

In [17]:
X_train_tokenized = tokenizer(X_train, padding = True, truncation=True, max_length = 512)
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length = 512)

In [18]:
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

## Display the X_train and X_test with all keys 

In [19]:
print(X_train_tokenized[0:10])

{'input_ids': [[101, 2023, 2003, 2941, 2028, 1997, 1996, 2261, 2399, 1045, 2736, 2000, 1996, 2200, 2203, 1998, 5632, 2009, 1012, 2130, 2295, 1045, 2442, 18766, 1045, 2453, 2025, 2031, 3266, 2004, 2092, 2018, 1045, 2025, 2109, 1996, 2208, 22519, 1012, 1996, 2208, 22519, 2001, 2053, 2393, 2043, 2009, 2234, 2000, 5795, 9590, 1998, 1996, 2203, 23029, 2174, 1012, 2061, 1045, 3984, 1045, 2064, 2507, 2870, 2070, 4923, 1012, 4312, 2009, 2003, 1037, 2307, 2208, 1012, 5026, 2039, 8813, 28717, 2007, 1996, 5645, 6536, 2003, 3243, 2019, 23304, 1012, 1045, 2066, 2000, 2707, 2047, 2399, 2074, 2000, 2156, 2054, 1045, 2064, 2131, 2247, 1996, 2126, 2013, 3554, 1996, 9219, 1012, 2043, 2017, 1005, 2128, 5341, 1037, 6071, 2453, 4530, 1037, 4678, 8875, 1012, 2023, 2000, 2033, 2003, 2062, 4569, 2084, 2505, 2842, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [20]:
print(X_test_tokenized[0:10])

{'input_ids': [[101, 2573, 2986, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [21]:
length_train = len(X_train)
length_test = len(X_test)
length_train_tokenized = len(X_train_tokenized)
length_test_tokenized = len(X_test_tokenized)
print(f'length of X_train is {length_train} AND length of X_testis {length_test}')
print('\n//////////////////////////////////////////////////////////////////\n')
print(f'length of X_train_tokenized is {length_train_tokenized} AND length of X_test_tokenized is {length_test_tokenized}')

length of X_train is 7996 AND length of X_testis 2000

//////////////////////////////////////////////////////////////////

length of X_train_tokenized is 3 AND length of X_test_tokenized is 3


## Create Torch Dataset

In [22]:
class VedioGames(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.encodings['input_ids'])

In [23]:
train_df = VedioGames(X_train_tokenized, y_train)
test_df = VedioGames(X_test_tokenized, y_test)

### Display data

In [24]:
train_df[0:5]

{'input_ids': tensor([[  101,  2023,  2003,  ...,     0,     0,     0],
         [  101,  6326, 20591,  ...,     0,     0,     0],
         [  101,  2028,  1997,  ...,     0,     0,     0],
         [  101,  1045,  2293,  ...,     0,     0,     0],
         [  101,  6581,  1010,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([1, 1, 1, 1, 1])}

## Define an Function for evaluating The Output with Classification Metrics

In [25]:
def compute_metrics(prediction):
    print(type(prediction))
    pred, labels = prediction
    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='weighted')
    precision = precision_score(y_true=labels, y_pred=pred, average='weighted')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')
    
    # Compute BLEU score
    smooth = SmoothingFunction().method1
    bleu_scores = [sentence_bleu([str(label)], str(prediction), smoothing_function=smooth) for label, prediction in zip(labels, pred)]
    bleu = np.mean(bleu_scores)
    
    # Compute BERTScore
    labels_str = [str(label) for label in labels]
    pred_str = [str(p) for p in pred]
    P, R, F1 = score(pred_str, labels_str, lang="en", verbose=True)
    bert_precision = P.mean().item()
    bert_recall = R.mean().item()
    bert_f1 = F1.mean().item()
    
    return {
        "accuracy": accuracy,
        "recall": recall,
        "precision": precision,
        "f1": f1,
        "bleu": bleu,
        "bert_precision": bert_precision,
        "bert_recall": bert_recall,
        "bert_f1": bert_f1
    }

## Define Trainer

In [26]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
args = TrainingArguments(
    per_device_train_batch_size = 4,
    num_train_epochs = 3,
    learning_rate = 1e-4,
    output_dir = './Results',
    logging_dir = './logs',
    logging_steps = 10,
    load_best_model_at_end = False,
    evaluation_strategy="no",
    remove_unused_columns = False,
    push_to_hub= False
)
trainer = Trainer(
    model = model,
    args=args,
    train_dataset = train_df,
    eval_dataset = test_df,
    compute_metrics = compute_metrics,
    data_collator = data_collator
)



## Now we we'll Train The Trainer

In [27]:
trainer.train()

Step,Training Loss
10,0.6275
20,0.5484
30,0.5293
40,0.7253
50,0.6007
60,0.5267
70,0.5784
80,0.5477
90,0.5817
100,0.5533


TrainOutput(global_step=5997, training_loss=0.5241055510849958, metrics={'train_runtime': 34163.4091, 'train_samples_per_second': 0.702, 'train_steps_per_second': 0.176, 'total_flos': 6311507995975680.0, 'train_loss': 0.5241055510849958, 'epoch': 3.0})

In [28]:
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

## Dislplay the Results by Function Compute_metrics

In [29]:
trainer.evaluate()

<class 'transformers.trainer_utils.EvalPrediction'>


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/32 [00:00<?, ?it/s]

done in 0.22 seconds, 9286.51 sentences/sec


{'eval_loss': 0.512526273727417,
 'eval_accuracy': 0.7925,
 'eval_recall': 0.7925,
 'eval_precision': 0.62805625,
 'eval_f1': 0.7007601115760111,
 'eval_bleu': 0.14092864324558466,
 'eval_bert_precision': 0.9959251880645752,
 'eval_bert_recall': 0.9959251880645752,
 'eval_bert_f1': 0.9959251880645752,
 'eval_runtime': 806.2115,
 'eval_samples_per_second': 2.481,
 'eval_steps_per_second': 0.31,
 'epoch': 3.0}

#### The BERT uncased pretrained model demonstrates strong performance in sentiment classification, with high accuracy and recall. The precision is moderately high, and the BERTScore metrics reflect the model's superior ability to capture semantic meaning. 