In [86]:
import torch
import bitsandbytes
import peft
import accelerate
import transformers
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM, BertForSequenceClassification
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, TrainerCallback

from contractions import contractions_dict
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter, defaultdict

import torch
from torch.utils.data import Dataset, Subset, DataLoader, TensorDataset
import torch.nn.functional as F

from gensim.models import Word2Vec, KeyedVectors

In [100]:
model_id = "bert-large-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=10)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [101]:
# Prepare the model for training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [103]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

trainable params: 0 || all params: 109489930 || trainable%: 0.0


In [104]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, # Attention head
    lora_alpha=32, # alpha scaling
    #target_modules=["q", "k", "v", "o"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 294912 || all params: 109784842 || trainable%: 0.2686272481951561


PREPARE DATA

In [118]:
import pandas as pd

# Import the data I will use
df_train = pd.read_csv('./Genre Classification Dataset/train_data.txt',sep=' ::: ',header=None,engine='python', names=['Title','Genre','Description'])
df_test = pd.read_csv('./Genre Classification Dataset/test_data_solution.txt',sep=' ::: ',header=None,engine='python', names=['Title','Genre','Description'])

# I'm going to merge them and do the splits myself
df = pd.concat([df_train, df_test])

# Create list for the genres I will use to clasiffy
genres = ['drama', 'documentary', 'comedy', 'adventure', 'horror', 'thriller',
       'action', 'western', 'reality-tv', 'family']

# Short the dataframe for the top 10 most used genres
df = df[df["Genre"].isin(genres)]
df = df.reset_index(drop=True)

# Now I will balance the dataset to 1550 samples
# Create a new df
df_balanced= pd.DataFrame()

# Loop through each genre and sample 1550 rows
for genre in df['Genre'].unique():
    genre_subset = df[df['Genre'] == genre]
    samples = genre_subset.sample(n=min(len(genre_subset), 1550), random_state=42)
    df_balanced = pd.concat([df_balanced, samples])

# Reset the index of the new DataFrame
df_balanced = df_balanced.reset_index(drop=True)

# Get the descriptions
input = df_balanced["Description"].values
output = df_balanced["Genre"].values

In [169]:
# I want to only keep movie descriptions between 20 to 60 words
# I will need this variables
inputs = []
labels = []
index = []
i = 0

# Initialize a dictionary to count genres
genre_counts = {}

# Iterate through each description and its corresponding genre
for description, genre in zip(input, output):
    # Check if the length of the description is between ?
    if 5 <= len(description) <= 512:
        # Update genre count and check if less than minimum per genre
        if genre_counts.get(genre, 0) < 740:
            inputs.append(description)
            labels.append(genre)
            genre_counts[genre] = genre_counts.get(genre, 0) + 1
            index.append(i)
    i+=1

In [140]:
import numpy as np

# Now we can do the splits for the data
# Number of classes
num_classes = 10

# Number of samples per class
samples_per_class = 740

# Splits for train, validate, and test
train_split = int(0.5 * samples_per_class)
val_split = int(0.3 * samples_per_class)
test_split = 1550 - train_split - val_split

# Create lists to hold indices for each set
train_indices, val_indices, test_indices = [], [], []

# Set random state
rng = np.random.default_rng(seed=42)

# Split indices for each class
for class_index in range(num_classes):
    start_index = class_index * samples_per_class
    indices = list(range(start_index, start_index + samples_per_class))
    rng.shuffle(indices)
    train_indices.extend(indices[:train_split])
    val_indices.extend(indices[train_split:train_split + val_split])
    test_indices.extend(indices[train_split + val_split:])


DATASET to LOAD

In [196]:
train_inputs =[inputs[i] for i in train_indices]
train_labels =[labels[i] for i in train_indices]

val_inputs =[inputs[i] for i in val_indices]
val_labels =[labels[i] for i in val_indices]

test_inputs =[inputs[i] for i in test_indices]
test_labels =[labels[i] for i in test_indices]

In [197]:
from datasets import Dataset

train_dataset = Dataset.from_dict({'input': train_inputs, 'label': train_labels})
val_dataset = Dataset.from_dict({'input': val_inputs, 'label': val_labels})
test_dataset = Dataset.from_dict({'input': test_inputs, 'label': test_labels})


In [198]:
from datasets import DatasetDict

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})


In [180]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [199]:
# Assuming 'dataset_dict' is your DatasetDict
dataset_dict.push_to_hub("ManuelAlv/test")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/539 [00:00<?, ?B/s]

In [188]:
dataset2 = load_dataset("ManuelAlv/test")

Downloading readme:   0%|          | 0.00/539 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.75M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/714k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3700 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2220 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1480 [00:00<?, ? examples/s]

In [194]:
dataset2['train']['label']

['Karl Achilles has been working at the Bitterfeld chemical plant ever since he helped build it up in 1945. Now he is 65 years old and the time has come for him to retire. The movie depicts his last day at work, and allows for an exemplary glimpse into the working conditions of the former German Democratic Republic as well as into the head of a dedicated worker whose time has come to make room for the next generation.',
 'L.I.T follows the young and naive Nehemiah "Nemo" as he journeys from Augusta, GA to Atlanta with aspirations to become a model. After getting accepted into the modeling agency, Nemo is introduced to Gorgious who runs a local strip club. Nemo gets sucked into the life and starts working there and his life begins crashing all around him.',
 "Harry Munter, a sensitive, kind, appealing man in his twenties, lives with his parents. He's an inventor, a bit of a mystic, maybe a genius, and a good son and grandson. He's offered work in the U.S. But a friend has cancer and the

NO DATASET USAGE

In [133]:
# add prefix to inputs
inputs = ["Classify the following movie: " + inp for inp in inputs]

# finally we can tokenize the inputs and targets
model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True, return_tensors="pt")

# Step 1: Convert string labels to indices
label_to_index = defaultdict(lambda: len(label_to_index))
numerical_labels = [label_to_index[label] for label in labels]
labels = torch.tensor(numerical_labels)

# Create a dataset
dataset = TensorDataset(model_inputs['input_ids'], model_inputs['attention_mask'], model_inputs['token_type_ids'], labels)

# Create a DataLoader
#data_loader = DataLoader(dataset, batch_size=4, shuffle=True)

In [195]:
label_to_index

defaultdict(<function __main__.<lambda>()>,
            {'drama': 0,
             'thriller': 1,
             'documentary': 2,
             'comedy': 3,
             'reality-tv': 4,
             'horror': 5,
             'action': 6,
             'adventure': 7,
             'western': 8,
             'family': 9})

In [135]:
# Define the batch size
bs = 4

# Create Subset datasets for each split
train_dataset = Subset(dataset, train_indices)
validate_dataset = Subset(dataset, val_indices)
test_dataset = Subset(dataset, test_indices)

# Create DataLoader objects for each split, with drop_last to ensure all batches have a size of 64 (Small amount of data loss)
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, drop_last=True)
val_loader = DataLoader(validate_dataset, batch_size=bs, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=True, drop_last=True)

TRAINING MANUAL 

In [109]:
# Assuming 'model' is already defined and is a classification model
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001, weight_decay=5e-4)
num_epochs = 1

# Training loop
for epoch in tqdm(range(num_epochs), desc="Training"):
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for batch in train_loader:
        input_ids, attention_mask, token_type_ids, labels = batch
        optimizer.zero_grad()

        # Forward pass. For classification, the model outputs logits
        outputs = model(input_ids, attention_mask=attention_mask).logits

        # Compute loss - CrossEntropyLoss expects raw logits as the model output
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(outputs, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    train_accuracy = 100 * correct_train / total_train

    # Print training statistics
    tqdm.write(f'Epoch {epoch + 1}/{num_epochs}, '
               f'Training Loss: {running_loss/len(data_loader):.4f}, '
               f'Training Accuracy: {train_accuracy:.2f}%\n'
               )

Training:   0%|          | 0/1 [00:00<?, ?it/s]

Training: 100%|██████████| 1/1 [07:45<00:00, 465.97s/it]

Epoch 1/1, Training Loss: 2.4179, Training Accuracy: 11.00%






TEST