In [None]:
from sklearn.model_selection import train_test_split

# Assuming that 'dataset' is a pandas DataFrame
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2)

print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")


Training dataset size: 11
Validation dataset size: 11


In [None]:
print("Unique labels in training dataset: ", train_dataset['label'].unique())
print("Unique labels in validation dataset: ", val_dataset['label'].unique())

Unique labels in training dataset:  tensor([0, 1, 2])
Unique labels in validation dataset:  tensor([0, 1, 2])


In [8]:
from sklearn.metrics import classification_report

# Evaluation mode
model.eval()

# Variables to gather full output
true_labels, pred_labels = [], []

for batch in val_dataloader:
    input_ids, attention_mask, labels = batch
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Move logits and labels to CPU
    logits = outputs.logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    # Store predictions and true labels
    pred_labels += list(np.argmax(logits, axis=1))
    true_labels += list(label_ids)

print(classification_report(true_labels, pred_labels, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

    negative       0.39      0.91      0.54       177
     neutral       0.33      0.01      0.03        76
    positive       0.48      0.10      0.17       210

    accuracy                           0.40       463
   macro avg       0.40      0.34      0.24       463
weighted avg       0.42      0.40      0.29       463



In [10]:
from transformers import AlbertForSequenceClassification, AlbertTokenizerFast, AdamW
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import pandas as pd
import torch
import numpy as np

# Load your dataset
df = pd.read_csv('Laptop_Train_v2.csv')
# Drop rows where the label is 'conflict'
df = df[df['polarity'] != 'conflict']

label_encoder = LabelEncoder()
df['polarity'] = label_encoder.fit_transform(df['polarity'])

# Split the DataFrame into training and testing sets
train_df, val_df = train_test_split(df, test_size=0.2)

# Load the pre-trained model and tokenizer
model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=3)
tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v2")

# Combine the sentence and aspect term into a single string
train_texts = (train_df['Sentence'] + " [SEP] " + train_df['Aspect Term']).tolist()
val_texts = (val_df['Sentence'] + " [SEP] " + val_df['Aspect Term']).tolist()

# Tokenize your datasets
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Create PyTorch Datasets
train_dataset = list(zip(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), torch.tensor(train_df['polarity'].tolist())))
val_dataset = list(zip(torch.tensor(val_encodings['input_ids']), torch.tensor(val_encodings['attention_mask']), torch.tensor(val_df['polarity'].tolist())))

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
for epoch in range(1):
    total_loss = 0
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch: {epoch}, Loss: {total_loss / len(train_dataloader)}")

# Evaluation mode
model.eval()

# Variables to gather full output
true_labels, pred_labels = [], []

for batch in val_dataloader:
    input_ids, attention_mask, labels = batch
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Move logits and labels to CPU
    logits = outputs.logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    # Store predictions and true labels
    pred_labels += list(np.argmax(logits, axis=1))
    true_labels += list(label_ids)

print(classification_report(true_labels, pred_labels, target_names=label_encoder.classes_))

# Save the model
model.save_pretrained('albert')

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]



Epoch: 0, Loss: 0.7630517542490671
              precision    recall  f1-score   support

    negative       0.81      0.84      0.82       183
     neutral       0.50      0.63      0.56        81
    positive       0.89      0.77      0.83       199

    accuracy                           0.77       463
   macro avg       0.73      0.75      0.74       463
weighted avg       0.79      0.77      0.78       463

