# Setup / Clear Kernel Memory

In [1]:
import gc
def clear_memory():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [2]:
import pandas as pd
import numpy as np
df = pd.read_csv('df_final.csv')

In [3]:
df['Transcription'] = df['Transcription'].fillna('').astype(str)  

In [4]:
df.head(3)

Unnamed: 0,Index,Link,Label,Text,FileName,Transcription,sort_key
0,1.0,https://www.tiktok.com/@1tashyat/video/7359361...,1,"""ST Anselm College. She's a Republican. This w...",1_mp4_trial_2.json,ST Anselm College. She's a Republican. This wi...,1
1,2.0,https://www.tiktok.com/@monkeman317/video/7357...,1,"""Said though, that if you did run for presiden...",2_mp4_trial_2.json,"Said though, that if you did run for president...",2
2,3.0,https://www.tiktok.com/@bwtgrils_/video/736257...,1,Yeah.',3_mp4_trial_2.json,Yeah.,3


# BERT Model

In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm

model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)  # Assuming three classes
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, model):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.model = model

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Ensure the text is a string
        if not isinstance(text, str):
            text = str(text)

        # Tokenize the text
        encoded_text = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        encoded_text = {key: value.squeeze(0) for key, value in encoded_text.items()}
        return encoded_text, torch.tensor(label, dtype=torch.long)

    def process_in_batches(self, batch_size, optimizer, scheduler):
        dataloader = DataLoader(self, batch_size=batch_size, shuffle=True, pin_memory=True)
        all_outputs = []
        self.model.train()

        for inputs, labels in dataloader:
            inputs = {key: value.to(device) for key, value in inputs.items()}
            labels = labels.to(device)
            
            optimizer.zero_grad()
            outputs = self.model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            all_outputs.append(outputs.logits.detach().cpu().numpy())

            # Clear unnecessary data
            del inputs, labels, outputs
            torch.cuda.empty_cache()

        return np.concatenate(all_outputs, axis=0)

def split_data(texts, labels, chunk_size):
    for i in range(0, len(texts), chunk_size):
        yield texts[i:i + chunk_size], labels[i:i + chunk_size]

df['Text'].fillna('', inplace=True) 
df['Label'].fillna(0, inplace=True)  
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Text'].tolist(), df['Label'].tolist(), test_size=0.2, random_state=42)

chunk_size = 20  
batch_size = 8 
learning_rate = 1e-5  
num_epochs = 3 
step_size = 3 
gamma = 0.8 

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)  # Use AdamW optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

all_train_outputs = []
all_val_outputs = []

for train_chunk_texts, train_chunk_labels in tqdm(split_data(train_texts, train_labels, chunk_size), desc="Processing Train Data"):
    train_dataset = SentimentDataset(train_chunk_texts, train_chunk_labels, tokenizer, model)
    train_outputs = train_dataset.process_in_batches(batch_size, optimizer, scheduler)
    all_train_outputs.append(train_outputs)

for val_chunk_texts, val_chunk_labels in tqdm(split_data(val_texts, val_labels, chunk_size), desc="Processing Validation Data"):
    val_dataset = SentimentDataset(val_chunk_texts, val_chunk_labels, tokenizer, model)
    val_outputs = val_dataset.process_in_batches(batch_size, optimizer, scheduler)
    all_val_outputs.append(val_outputs)

all_train_outputs = np.concatenate(all_train_outputs, axis=0)
all_val_outputs = np.concatenate(all_val_outputs, axis=0)

def evaluate_model(outputs, labels):
    predicted_labels = np.argmax(outputs, axis=1)
    accuracy = np.mean(predicted_labels == labels)
    return accuracy

train_accuracy = evaluate_model(all_train_outputs, np.array(train_labels))
val_accuracy = evaluate_model(all_val_outputs, np.array(val_labels))

print("Training accuracy: {:.2f}%".format(train_accuracy * 100))
print("Validation accuracy: {:.2f}%".format(val_accuracy * 100))

2024-08-04 21:55:42.280592: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing Train Data: 35it [06:24, 10.99s/it]
Processing Validation Data: 9it [01:31, 10.20s/it]

Training accuracy: 41.90%
Validation accuracy: 43.60%





In [5]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm

model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) 
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoded_text = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        encoded_text = {key: value.squeeze(0) for key, value in encoded_text.items()}
        return encoded_text, torch.tensor(label, dtype=torch.long)

def process_in_batches(texts, labels, batch_size):
    dataset = SentimentDataset(texts, labels, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

    all_outputs = []
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

    for epoch in range(1): 
        for batch in dataloader:
            inputs, labels = batch
            inputs = {key: value.to(device) for key, value in inputs.items()}
            labels = labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            all_outputs.append(outputs.logits.detach().cpu().numpy())

            # Clear unnecessary data
            del inputs, labels, outputs
            torch.cuda.empty_cache()

    return np.concatenate(all_outputs, axis=0)

train_texts, val_texts, train_labels, val_labels = train_test_split(df['Transcription'].tolist(), df['Label'].tolist(), test_size=0.2, random_state=42)

def split_data(texts, labels, chunk_size):
    for i in range(0, len(texts), chunk_size):
        yield texts[i:i + chunk_size], labels[i:i + chunk_size]

chunk_size = 1  
batch_size = 1

all_train_outputs = []
all_val_outputs = []

for train_chunk_texts, train_chunk_labels in tqdm(split_data(train_texts, train_labels, chunk_size), desc="Processing Train Data"):
    train_outputs = process_in_batches(train_chunk_texts, train_chunk_labels, batch_size)
    all_train_outputs.append(train_outputs)

for val_chunk_texts, val_chunk_labels in tqdm(split_data(val_texts, val_labels, chunk_size), desc="Processing Validation Data"):
    val_outputs = process_in_batches(val_chunk_texts, val_chunk_labels, batch_size)
    all_val_outputs.append(val_outputs)

all_train_outputs = np.concatenate(all_train_outputs, axis=0)
all_val_outputs = np.concatenate(all_val_outputs, axis=0)

def evaluate_model(outputs, labels):
    predicted_labels = np.argmax(outputs, axis=1)
    accuracy = np.mean(predicted_labels == labels)
    return accuracy

train_accuracy = evaluate_model(all_train_outputs, np.array(train_labels))
val_accuracy = evaluate_model(all_val_outputs, np.array(val_labels))

print("Training accuracy: {:.2f}%".format(train_accuracy * 100))
print("Validation accuracy: {:.2f}%".format(val_accuracy * 100))

2024-08-04 22:14:05.111329: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing Train Data: 685it [15:35,  1.36s/it]
Processing Validation Data: 172it [03:42,  1.30s/it]

Training accuracy: 77.37%
Validation accuracy: 79.07%





# Save the Model

In [6]:
import torch

In [7]:
model_path = 'bert_model.pth'
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to bert_model.pth
