In [2]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split

from datasets import Dataset
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df = pd.read_csv('/content/drive/MyDrive/amazon_reviews_project/sampled_book_df.csv')

In [6]:
df.columns

Index(['categories', 'ratingsCount', 'Title', 'Price', 'review/helpfulness',
       'review/score', 'review/time', 'review/summary', 'review/text'],
      dtype='object')

In [7]:
df = df.dropna()
df = df.assign(labels=df['review/score'].map(lambda score: 2 if score > 3.5 else (1 if score > 2.5 else 0)))

In [8]:
df['review/text'] = df['review/text'].astype(str)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6386 entries, 78 to 38885
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   categories          6386 non-null   object 
 1   ratingsCount        6386 non-null   int64  
 2   Title               6386 non-null   object 
 3   Price               6386 non-null   float64
 4   review/helpfulness  6386 non-null   float64
 5   review/score        6386 non-null   float64
 6   review/time         6386 non-null   object 
 7   review/summary      6386 non-null   object 
 8   review/text         6386 non-null   object 
 9   labels              6386 non-null   int64  
dtypes: float64(3), int64(2), object(5)
memory usage: 548.8+ KB


In [10]:
train_df, test_df = train_test_split(df, train_size=0.8, random_state=24, stratify = df['labels'])
rest_df, test_df = train_test_split(df, test_size=0.1, random_state=24, stratify = df['labels'])

In [11]:
df['labels'].value_counts()
class_2 = train_df[train_df['labels'] == 2]
class_0 = train_df[train_df['labels'] == 0]
class_1 = train_df[train_df['labels'] == 1]

In [12]:
min_class_size = min(class_0.shape[0], class_1.shape[0])
class_2_undersampled = class_2.sample(n=min_class_size, random_state=24)
class_0_undersampled = class_0.sample(n=min_class_size, random_state=24)
balanced_df = pd.concat([class_2_undersampled, class_0_undersampled, class_1])

balanced_df['labels'].value_counts()

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
2,386
0,386
1,386


In [13]:
train_dataset = Dataset.from_pandas(balanced_df)
test_dataset = Dataset.from_pandas(test_df)

In [14]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [15]:
def tokenize(data):

    encoding = tokenizer.encode_plus(
        data['review/text'],
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
        add_special_tokens=True
    )

    return {
        'input_ids': encoding['input_ids'].squeeze(0),  # Remove batch dimension (if necessary)
        'attention_mask': encoding['attention_mask'].squeeze(0),  # Remove batch dimension (if necessary)
        'token_type_ids': encoding.get('token_type_ids', None).squeeze(0)  # Optional: only needed for some models
    }




In [16]:
train = train_dataset.map(tokenize, batched = False)
test =  test_dataset.map(tokenize,  batched = False)




Map:   0%|          | 0/1158 [00:00<?, ? examples/s]

Map:   0%|          | 0/639 [00:00<?, ? examples/s]

In [17]:
# Set the format to PyTorch tensors for easy model training
train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])
test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])

In [18]:

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 3)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
training_args = TrainingArguments(
    output_dir='./results',
    report_to="none",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    warmup_steps=100
)



In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test
)

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.871139
2,No log,0.515587


TrainOutput(global_step=146, training_loss=0.8881673943506528, metrics={'train_runtime': 245.9875, 'train_samples_per_second': 9.415, 'train_steps_per_second': 0.594, 'total_flos': 609370675458048.0, 'train_loss': 0.8881673943506528, 'epoch': 2.0})

In [25]:
input_pred = ["This book was okay, not bad but not great either.",
              "This book was very worst and not good at all",
              "This book is very good"]
sentiment_labels = ["negative", "neutral", "positive"]

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

model.eval()

for input in input_pred:
    # Tokenize the review
    inputs = tokenizer.encode_plus(
        input,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
        add_special_tokens=True
    )

    # Move input tensors to the same device
    for key in inputs:
        inputs[key] = inputs[key].to(device)
    print(f"Sentiment for \"{input}\":")
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    print(f"\tClass weights: {logits}")

    # Get the predicted class (0: Negative, 1: Neutral, 2: Positive)
    prediction = torch.argmax(logits, dim=-1).item()
    sentiment = sentiment_labels[prediction]
    print(f"\tSentiment : {sentiment}")
    print("\n")


Sentiment for "This book was okay, not bad but not great either.":
	Class weights: tensor([[-0.3220,  1.3985, -0.2509]], device='cuda:0')
	Sentiment : neutral


Sentiment for "This book was very worst and not good at all":
	Class weights: tensor([[ 1.9109,  0.0887, -1.4560]], device='cuda:0')
	Sentiment : negative


Sentiment for "This book is very good":
	Class weights: tensor([[-0.9817,  0.5461,  1.0170]], device='cuda:0')
	Sentiment : positive


