In [1]:
pip install transformers datasets scikit-learn torch datasets

/bin/bash: /libraries/llm_gpu_mistral/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Note: you may need to restart the kernel to use updated packages.


In [2]:
!nvidia-smi

/bin/bash: /libraries/llm_gpu_mistral/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Thu Dec  5 12:35:29 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0              32W / 300W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+-

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from mlutils import connector
import io
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
import torch

In [5]:
connector_name = 'cat-ds-gcs'
bucket_name = "bucket"
conn = connector.get_connector(name=connector_name)
bucket = conn.get_bucket(bucket_name)
sampled_data_path = "data_engg/sampled_book_df.csv"

In [6]:
blob = bucket.blob(sampled_data_path)

# Download the file content as bytes
data = blob.download_as_bytes()

# Use io.BytesIO to load the content into a DataFrame
sampled_df = pd.read_csv(io.BytesIO(data))

# Display the DataFrame
sampled_df.columns

Index(['categories', 'ratingsCount', 'Title', 'Id', 'Price', 'User_id',
       'profileName', 'review/helpfulness', 'review/score', 'review/time',
       'review/summary', 'review/text'],
      dtype='object')

In [6]:
sampled_df.head()

Unnamed: 0,categories,ratingsCount,Title,Id,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,AIDS (Disease),19.0,Miracle Cure,B000J2CLPG,,A33MJKWZSQHIVS,Nevada Native,0.0,4.0,1346544000,"Not my favorite by Coben, but worth reading",I agree with Coben on this one - don't read th...
1,AIDS (Disease),19.0,Miracle Cure,B000J2CLPG,,A2UBD96PEJ0QSE,silver5,5.0,5.0,1323475200,Pleasant surprise!,\When I saw this title I thought...what? I've ...
2,AIDS (Disease),0.0,Get All The Facts: HIV does not cause AIDS,0967353602,,A30DOF1UJ8QHX,Betty Hyder,5.0,5.0,981504000,Grateful,My thanks and appreciation to Dr. Al-Bayati fo...
3,Aboriginal Australians,0.0,"Songlines (Reed Audio, 179)",186021990X,,A38BIVW2RNO3RW,"\An admirer of Saul \""\""Mr Wobble\""\""\""""",0.0,5.0,1301616000,Life is a Long Song,"Part travelogue,part anthropology,part history..."
4,Abortion,21.0,Case of Need,B000QKUCJI,,A2PZXXPGLXXKZU,Amazon Reviewer29,5.0,3.0,1330128000,First book written by Michael Crichton,I read this book last year. I really wanted so...


In [7]:
sampled_df = sampled_df.dropna(subset=['review/summary', 'review/text', 'review/score'])
sampled_df = sampled_df.assign(labels=sampled_df['review/score'].map(lambda score: 2 if score > 3.5 else (1 if score > 2.5 else 0)))

In [18]:
sampled_df['review/text'] = sampled_df['review/text'].astype(str)

In [9]:
sampled_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 78599 entries, 0 to 78616
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   categories          78599 non-null  object 
 1   ratingsCount        78599 non-null  float64
 2   Title               78599 non-null  object 
 3   Id                  78599 non-null  object 
 4   Price               12954 non-null  float64
 5   User_id             63406 non-null  object 
 6   profileName         63400 non-null  object 
 7   review/helpfulness  78599 non-null  float64
 8   review/score        78599 non-null  float64
 9   review/time         78599 non-null  object 
 10  review/summary      78599 non-null  object 
 11  review/text         78599 non-null  object 
 12  labels              78599 non-null  int64  
dtypes: float64(4), int64(1), object(8)
memory usage: 8.4+ MB


In [10]:
train_df, test_df = train_test_split(sampled_df, train_size=0.8, random_state=24, stratify = sampled_df['labels'])
rest_df, test_df = train_test_split(sampled_df, test_size=0.1, random_state=24, stratify = sampled_df['labels'])

In [11]:
sampled_df['labels'].value_counts()
class_2 = train_df[train_df['labels'] == 2]
class_0 = train_df[train_df['labels'] == 0]
class_1 = train_df[train_df['labels'] == 1]

In [12]:
min_class_size = min(class_0.shape[0], class_1.shape[0])
class_2_undersampled = class_2.sample(n=min_class_size, random_state=24)
class_0_undersampled = class_0.sample(n=min_class_size, random_state=24)
balanced_df = pd.concat([class_2_undersampled, class_0_undersampled, class_1])

balanced_df['labels'].value_counts()

labels
2    5208
0    5208
1    5208
Name: count, dtype: int64

In [13]:
train_dataset = Dataset.from_pandas(balanced_df)
test_dataset = Dataset.from_pandas(test_df)

In [14]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", cache_dir="/data/bert_cache")

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


In [15]:
def tokenize(data):

    encoding = tokenizer.encode_plus(
        data['review/text'],
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
        add_special_tokens=True
    )

    return {
        'input_ids': encoding['input_ids'].squeeze(0),  # Remove batch dimension (if necessary)
        'attention_mask': encoding['attention_mask'].squeeze(0),  # Remove batch dimension (if necessary)
        'token_type_ids': encoding.get('token_type_ids', None).squeeze(0)  # Optional: only needed for some models
    }

In [16]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [17]:
train = train_dataset.map(tokenize, batched = False)
test =  test_dataset.map(tokenize,  batched = False)

Map: 100%|██████████| 15624/15624 [00:51<00:00, 304.88 examples/s]
Map: 100%|██████████| 7860/7860 [00:24<00:00, 314.91 examples/s]


In [19]:
train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])
test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])

In [20]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 3, cache_dir="/data/bert_cache")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
training_args = TrainingArguments(
    output_dir='/data/results',
    report_to="none",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=100
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    compute_metrics=compute_metrics
)

In [23]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.559874,0.765522,0.801348,0.869001,0.765522
2,0.742100,0.460087,0.821247,0.841907,0.874924,0.821247
3,0.471300,0.739135,0.769593,0.806959,0.876398,0.769593




TrainOutput(global_step=1467, training_loss=0.4921753252129532, metrics={'train_runtime': 845.5652, 'train_samples_per_second': 55.433, 'train_steps_per_second': 1.735, 'total_flos': 1.2332652115746816e+16, 'train_loss': 0.4921753252129532, 'epoch': 3.0})

In [24]:
trainer.evaluate()



{'eval_loss': 0.7106401920318604,
 'eval_accuracy': 0.7791348600508906,
 'eval_f1': 0.8128753912922158,
 'eval_precision': 0.875301320898174,
 'eval_recall': 0.7791348600508906,
 'eval_runtime': 42.3902,
 'eval_samples_per_second': 185.42,
 'eval_steps_per_second': 5.803,
 'epoch': 3.0}

In [47]:
input_pred = ["This book was okay, not bad but not great either.",
              "This book was very worst and not good at all",
              "This book is very good"]
sentiment_labels = ["negative", "neutral", "positive"]

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

model.eval()

for input in input_pred:
    # Tokenize the review
    inputs = tokenizer.encode_plus(
        input,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
        add_special_tokens=True
    )

    # Move input tensors to the same device
    for key in inputs:
        inputs[key] = inputs[key].to(device)
    print(f"Sentiment for \"{input}\":")
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    print(f"\tClass weights: {logits}")

    # Get the predicted class (0: Negative, 1: Neutral, 2: Positive)
    prediction = torch.argmax(logits, dim=-1).item()
    sentiment = sentiment_labels[prediction]
    print(f"\tSentiment : {sentiment}")
    print("\n")


Sentiment for "This book was okay, not bad but not great either.":
	Class weights: tensor([[-0.5386,  3.0430, -2.0915]], device='cuda:0')
	Sentiment : neutral


Sentiment for "This book was very worst and not good at all":
	Class weights: tensor([[ 4.2645, -2.1315, -2.6740]], device='cuda:0')
	Sentiment : negative


Sentiment for "This book is very good":
	Class weights: tensor([[-1.0197, -0.6811,  1.3815]], device='cuda:0')
	Sentiment : positive




In [31]:
!nvidia-smi

/bin/bash: /libraries/llm_gpu_mistral/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Mon Dec  2 18:16:10 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0              50W / 300W |  12451MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+-

In [32]:
trainer.save_model("/data/sentiment_model")

In [33]:
connector.upload(name=connector_name, container=bucket_name, target_path="data_engg/sentiment_model/pytorch_model.bin", source_path="/data/sentiment_model/pytorch_model.bin")


In [34]:
connector.upload(name=connector_name, container=bucket_name, target_path="data_engg/sentiment_model/config.json", source_path="/data/sentiment_model/config.json")


In [35]:
connector.upload(name=connector_name, container=bucket_name, target_path="data_engg/sentiment_model/training_args.bin", source_path="/data/sentiment_model/training_args.bin")
