In [1]:
import pandas as pd
import torch
import os
import numpy as np
# from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, get_scheduler
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

2024-07-28 16:26:33.537467: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-28 16:26:33.537523: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-28 16:26:33.538936: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Load Data

In [2]:
# Read the transcription CSV (columns shown below include `Transcription` and `Sentiment`).
# NOTE(review): absolute Kaggle input path — change `filepath` when running elsewhere.
filepath= "/kaggle/input/transcription/final_transcription.csv"
data = pd.read_csv(filepath)
# Bare last expression so the notebook renders the frame.
data

Unnamed: 0.1,Unnamed: 0,Transcription,Sentiment
0,0,Today's feature on home gardening channels exp...,Positive
1,1,Our examination of the impact of gaming livest...,Negative
2,2,This week's feature on pet rescue channels exp...,Positive
3,3,Our report on the rise of unboxing videos exam...,Negative
4,4,Today's feature on educational science channel...,Positive
...,...,...,...
543,493,Our report on the rise of conspiracy theory vi...,Negative
544,494,Today's feature on language learning channels ...,Positive
545,495,Our examination of the impact of fitness influ...,Negative
546,496,This week's feature on tiny house living explo...,Positive


# Preprocess Data

In [3]:
# Build the modelling frame WITHOUT mutating `data`, so this cell can be re-run:
# dropping a column and re-mapping `Sentiment` in place fails (KeyError) or yields
# all-NaN labels on a second execution.
label = {"Negative": 0, "Positive": 1}

# Map the string labels to class ids and fail loudly on anything unexpected;
# `Series.map` would otherwise turn unknown labels into NaN silently.
sentiment_ids = data["Sentiment"].map(label)
if sentiment_ids.isna().any():
    unexpected = sorted(data.loc[sentiment_ids.isna(), "Sentiment"].astype(str).unique())
    raise ValueError(f"Unexpected Sentiment values: {unexpected}")

# Keep only the text and the label: every leftover `Unnamed: *` index column is
# excluded, so no stray column reaches the model's forward call.
data_clean = data[["Transcription"]].assign(Sentiment=sentiment_ids.astype(int))

# Create Datasets
df_hf = Dataset.from_pandas(data_clean, preserve_index=False)
# Fixed seed so the train/validation split (and the reported metrics) are reproducible.
df_hf = df_hf.train_test_split(test_size=0.1, seed=42)
df_hf["validation"] = df_hf.pop("test")
df_hf

DatasetDict({
    train: Dataset({
        features: ['Transcription', 'Sentiment'],
        num_rows: 493
    })
    validation: Dataset({
        features: ['Transcription', 'Sentiment'],
        num_rows: 55
    })
})

In [4]:
def load_tokenizer_and_model(model_name, num_labels, max_length=512):
    """Load a tokenizer and a sequence-classification model that agree on padding.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.
        max_length: Upper bound applied to ``tokenizer.model_max_length`` so that
            ``truncation=True`` always has a finite limit. Defaults to 512.

    Returns:
        A ``(tokenizer, model)`` tuple. The tokenizer is guaranteed to have a pad
        token and ``model.config.pad_token_id`` is set to it.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            # Reuse EOS as PAD: no new token, so the embedding matrix is unchanged.
            tokenizer.pad_token = tokenizer.eos_token
        else:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # Cap the length for every tokenizer, not only those that lacked a pad token;
    # `min` never raises a limit the checkpoint already declares.
    tokenizer.model_max_length = min(tokenizer.model_max_length, max_length)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    model.config.pad_token_id = tokenizer.pad_token_id  # Set pad_token_id in the model configuration

    # Grow the embeddings only when the tokenizer gained tokens. Resizing
    # unconditionally would shrink checkpoints whose embedding matrix is larger
    # than the tokenizer vocabulary.
    embedding_rows = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) > embedding_rows:
        model.resize_token_embeddings(len(tokenizer))
    return tokenizer, model

In [5]:

def prepare_data_loaders(df_hf, tokenizer, batchsize):
    """Tokenize the train/validation splits and wrap them in PyTorch DataLoaders.

    Args:
        df_hf: Mapping with ``'train'`` and ``'validation'`` splits, each holding a
            ``Transcription`` text column and a ``Sentiment`` label column.
        tokenizer: Tokenizer used both for encoding and for padding in the collator.
        batchsize: Mini-batch size for both loaders.

    Returns:
        ``(train_dataloader, val_dataloader)``; only the training loader shuffles.
    """
    def tokenize_function(batch):
        # No padding here: the collator pads each mini-batch to its own longest
        # sequence. Padding inside `map` as well would pad every example twice
        # and inflate batches once the training loader shuffles.
        return tokenizer(batch['Transcription'], truncation=True)

    tokenized_df_hf = (
        df_hf.map(tokenize_function, batched=True)
        .remove_columns(['Transcription'])
        .rename_column("Sentiment", "labels")  # the model's forward pass is called with `labels`
        .with_format('torch')
    )

    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    train_dataloader = DataLoader(
        dataset=tokenized_df_hf['train'],
        batch_size=batchsize,
        shuffle=True,
        collate_fn=collator
    )

    val_dataloader = DataLoader(
        dataset=tokenized_df_hf['validation'],
        batch_size=batchsize,
        collate_fn=collator
    )

    return train_dataloader, val_dataloader

In [6]:
def train_and_evaluate(model, train_dataloader, val_dataloader, num_epochs, lr):
    """Fine-tune ``model`` and print losses and validation metrics after each epoch.

    Optimises with AdamW and a "linear" scheduler (zero warm-up steps) sized for
    ``num_epochs * len(train_dataloader)`` steps. Runs on CUDA when available,
    otherwise on CPU.

    Args:
        model: Model called as ``model(**batch)``; its output must expose ``.loss``
            and ``.logits``.
        train_dataloader: Yields dict batches used for the optimisation steps.
        val_dataloader: Yields dict batches (including ``'labels'``) used for evaluation.
        num_epochs: Number of passes over ``train_dataloader``.
        lr: Learning rate given to AdamW.

    Returns:
        None; results are reported via ``print``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    total_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    def _to_device(raw_batch):
        # Move every tensor of the batch onto the training device.
        return {key: tensor.to(device) for key, tensor in raw_batch.items()}

    for epoch in range(num_epochs):
        # ---- optimisation pass ----
        model.train()
        running_train_loss = 0
        for raw_batch in train_dataloader:
            step_loss = model(**_to_device(raw_batch)).loss
            step_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            running_train_loss += step_loss.item()
        avg_train_loss = running_train_loss / len(train_dataloader)

        # ---- evaluation pass (no gradients) ----
        model.eval()
        running_val_loss = 0
        logit_chunks, label_chunks = [], []
        with torch.no_grad():
            for raw_batch in val_dataloader:
                device_batch = _to_device(raw_batch)
                result = model(**device_batch)
                running_val_loss += result.loss.item()
                logit_chunks.append(result.logits.cpu().numpy())
                label_chunks.append(device_batch['labels'].cpu().numpy())

        all_logits = np.concatenate(logit_chunks, axis=0)
        all_labels = np.concatenate(label_chunks, axis=0)
        avg_val_loss = running_val_loss / len(val_dataloader)
        accuracy, precision, recall, f1 = compute_metrics(all_logits, all_labels)

        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
        print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

In [7]:
def compute_metrics(preds, labels):
    """Score class predictions derived from per-class scores.

    Args:
        preds: 2-D array of per-class scores; the predicted class is the argmax
            along axis 1.
        labels: True class ids, one per row of ``preds``.

    Returns:
        ``(accuracy, precision, recall, f1)``, where the last three are
        weighted averages computed with ``zero_division=1``.
    """
    predicted_classes = np.argmax(preds, axis=1)
    accuracy = accuracy_score(labels, predicted_classes)
    weighted_scores = precision_recall_fscore_support(
        labels,
        predicted_classes,
        average='weighted',
        zero_division=1,
    )
    # The fourth element (support) is not reported.
    precision, recall, f1 = weighted_scores[:3]
    return accuracy, precision, recall, f1

In [None]:
# BERT run: fine-tune `bert-base-uncased` with a 2-class head.
model_name = 'bert-base-uncased'
num_labels = 2
batchsize = 16
num_epochs = 3
lr = 2e-5

# Rebinds the notebook-level `tokenizer` / `model`; later cells see this run's objects.
tokenizer, model = load_tokenizer_and_model(model_name, num_labels)
train_dataloader, val_dataloader = prepare_data_loaders(df_hf, tokenizer, batchsize)
train_and_evaluate(model, train_dataloader, val_dataloader, num_epochs, lr)


Epoch 1/3, Train Loss: 0.5047, Val Loss: 0.2651

Accuracy: 0.9273, Precision: 0.9286, Recall: 0.9273, F1 Score: 0.9266

Epoch 2/3, Train Loss: 0.2437, Val Loss: 0.1777

Accuracy: 0.9455, Precision: 0.9457, Recall: 0.9455, F1 Score: 0.9452

Epoch 3/3, Train Loss: 0.1650, Val Loss: 0.1816

Accuracy: 0.9455, Precision: 0.9457, Recall: 0.9455, F1 Score: 0.9452

## DistilGPT2

In [None]:
# DistilGPT2 run: same hyperparameters as the BERT cell, different checkpoint.
model_name = 'distilbert/distilgpt2'
num_labels = 2
batchsize = 16
num_epochs = 3
lr = 2e-5

# Rebinds the notebook-level `tokenizer` / `model`; later cells see this run's objects.
tokenizer, model = load_tokenizer_and_model(model_name, num_labels)
train_dataloader, val_dataloader = prepare_data_loaders(df_hf, tokenizer, batchsize)
train_and_evaluate(model, train_dataloader, val_dataloader, num_epochs, lr)


Epoch 1/3, Train Loss: 0.6132, Val Loss: 0.3058

Accuracy: 0.8909, Precision: 0.8909, Recall: 0.8909, F1 Score: 0.8909

Epoch 2/3, Train Loss: 0.2611, Val Loss: 0.1862

Accuracy: 0.9273, Precision: 0.9289, Recall: 0.9273, F1 Score: 0.9268

Epoch 3/3, Train Loss: 0.2095, Val Loss: 0.1860

Accuracy: 0.9273, Precision: 0.9289, Recall: 0.9273, F1 Score: 0.9268

## BART

In [None]:
# BART run: same hyperparameters as the BERT cell, different checkpoint.
model_name = 'facebook/bart-base'
num_labels = 2
batchsize = 16
num_epochs = 3
lr = 2e-5

# Rebinds the notebook-level `tokenizer` / `model`; later cells see this run's objects.
tokenizer, model = load_tokenizer_and_model(model_name, num_labels)
train_dataloader, val_dataloader = prepare_data_loaders(df_hf, tokenizer, batchsize)
train_and_evaluate(model, train_dataloader, val_dataloader, num_epochs, lr)


Epoch 1/3, Train Loss: 0.3634, Val Loss: 0.3105

Accuracy: 0.9273, Precision: 0.9273, Recall: 0.9273, F1 Score: 0.9273

Epoch 2/3, Train Loss: 0.2060, Val Loss: 0.2264

Accuracy: 0.9273, Precision: 0.9273, Recall: 0.9273, F1 Score: 0.9273

Epoch 3/3, Train Loss: 0.1469, Val Loss: 0.1995

Accuracy: 0.9455, Precision: 0.9463, Recall: 0.9455, F1 Score: 0.9456

## Flan-T5

In [14]:
# Flan-T5 run: unlike the other runs, uses batch size 8 and 10 epochs.
model_name = 'google/flan-t5-base'
num_labels = 2
batchsize = 8
num_epochs = 10
lr = 2e-5

# Rebinds the notebook-level `tokenizer` / `model`; the save cell below writes
# whichever objects were bound most recently.
tokenizer, model = load_tokenizer_and_model(model_name, num_labels)
train_dataloader, val_dataloader = prepare_data_loaders(df_hf, tokenizer, batchsize)
train_and_evaluate(model, train_dataloader, val_dataloader, num_epochs, lr)


Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/493 [00:00<?, ? examples/s]

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Epoch 1/10, Train Loss: 0.5665, Val Loss: 0.2344
Accuracy: 0.9273, Precision: 0.9296, Recall: 0.9273, F1 Score: 0.9272
Epoch 2/10, Train Loss: 0.2709, Val Loss: 0.1046
Accuracy: 0.9455, Precision: 0.9461, Recall: 0.9455, F1 Score: 0.9455
Epoch 3/10, Train Loss: 0.2287, Val Loss: 0.0752
Accuracy: 0.9636, Precision: 0.9661, Recall: 0.9636, F1 Score: 0.9636
Epoch 4/10, Train Loss: 0.1712, Val Loss: 0.0693
Accuracy: 0.9818, Precision: 0.9825, Recall: 0.9818, F1 Score: 0.9818
Epoch 5/10, Train Loss: 0.1521, Val Loss: 0.0876
Accuracy: 0.9455, Precision: 0.9509, Recall: 0.9455, F1 Score: 0.9453
Epoch 6/10, Train Loss: 0.1276, Val Loss: 0.0755
Accuracy: 0.9818, Precision: 0.9824, Recall: 0.9818, F1 Score: 0.9818
Epoch 7/10, Train Loss: 0.0976, Val Loss: 0.0711
Accuracy: 0.9818, Precision: 0.9824, Recall: 0.9818, F1 Score: 0.9818
Epoch 8/10, Train Loss: 0.1176, Val Loss: 0.0743
Accuracy: 0.9636, Precision: 0.9636, Recall: 0.9636, F1 Score: 0.9636
Epoch 9/10, Train Loss: 0.0920, Val Loss: 0.0734

In [15]:
def save_model(model, tokenizer, output_dir):
    """Write ``model`` and ``tokenizer`` into ``output_dir`` via ``save_pretrained``.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        output_dir: Target directory; created (with parents) if missing.

    Returns:
        None; prints a confirmation message once both objects are saved.
    """
    # exist_ok=True replaces the separate exists() check, which could race with
    # another process creating the directory between the check and the mkdir.
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model and tokenizer saved to {output_dir}")
    
# Save the model and tokenizer currently bound to the notebook-level names
# `model` / `tokenizer` (i.e. those from the most recently executed training cell).
save_model(model, tokenizer, "model_ver2")

Model and tokenizer saved to model_ver2


In [11]:
# Load the saved model and tokenizer
model_name = "bert-base-uncased"
# Same directory the save cell writes to. The previous literal contained the
# invalid escape sequence `\.` and did not name the saved model directory.
model_dir = "model_ver2"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# Define the inference function
def predict_sentiment(text):
    """Classify ``text`` with the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Input string; tokenized with truncation at 512 tokens.

    Returns:
        ``"Positive"`` when the highest-scoring class id is 1, otherwise ``"Negative"``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Put the inputs on the model's device: `model` may be the GPU-resident model
    # left over from a training cell rather than a freshly loaded CPU one.
    inputs = inputs.to(model.device)
    # Make sure dropout is disabled, whatever mode the model was left in.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()
    return "Positive" if predicted_class_id == 1 else "Negative"


In [13]:
# Sample input for a quick check of `predict_sentiment`; the call is the cell's
# last expression, so the notebook displays the predicted label.
test_text = """The new product launched by the company has been receiving rave reviews from customers. 
They are particularly impressed with the innovative features and the user-friendly interface. 
Many users have reported that the product has significantly improved their productivity and efficiency. 
The company's customer service has also been praised for their prompt and helpful responses to queries. 
Overall, the launch has been a huge success, with sales exceeding initial projections and positive feedback pouring in from all quarters. 
It's clear that the company has managed to meet the needs and expectations of its customers with this latest offering."""

predict_sentiment(test_text)

'Positive'

In [None]:

# Re-run the full pipeline for the current `model_name`, `batchsize`, `num_epochs`
# and `lr` through the notebook's helper functions.
# This cell used to hold an inline copy of those helpers that could not execute:
# its statements were indented at top level, it called an undefined `progress`,
# and it used `compute_metrics` before the cell defined it.

# Load Pre-trained Tokenizer and Model
tokenizer, model = load_tokenizer_and_model(model_name, num_labels=2)

# Prepare Data Loaders
train_dataloader, val_dataloader = prepare_data_loaders(df_hf, tokenizer, batchsize)

# Train and Evaluate Model (device selection, optimizer and scheduler are set up inside)
train_and_evaluate(model, train_dataloader, val_dataloader, num_epochs, lr)

def compute_metrics(preds, labels):
    """Return accuracy plus weighted precision, recall and F1 for argmax predictions.

    NOTE(review): this repeats the ``compute_metrics`` definition from the earlier
    cell; running this cell rebinds the name to an equivalent function.

    Args:
        preds: 2-D array of per-class scores (argmax is taken along axis 1).
        labels: True class ids aligned with the rows of ``preds``.

    Returns:
        ``(accuracy, precision, recall, f1)``.
    """
    class_ids = np.argmax(preds, axis=1)
    accuracy = accuracy_score(labels, class_ids)
    # zero_division=1 is passed through unchanged; the support value is discarded.
    precision, recall, f1 = precision_recall_fscore_support(
        labels, class_ids, average='weighted', zero_division=1
    )[:3]
    return accuracy, precision, recall, f1
