In [1]:
import torch
import torch.nn as nn
from transformers import AutoModel


class FineTuneClassifier(nn.Module):
    def __init__(self, base_model_path: str, num_labels: int) -> None:
        super(FineTuneClassifier, self).__init__()
        self.base_model = AutoModel.from_pretrained(base_model_path)

        for param in self.base_model.parameters():
            param.requires_grad = False

        self.classifier = nn.Linear(self.base_model.config.hidden_size * 2, num_labels)

    @classmethod
    def from_classifier_head(
        cls, base_model_path: str, path: str, num_labels: int
    ) -> nn.Module:
        model = cls(base_model_path, num_labels)
        model.classifier.load_state_dict(torch.load(path))
        return model

    def forward(
        self, input_ids: torch.tensor, attention_mask: torch.tensor
    ) -> torch.tensor:
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        B, T, C = outputs.logits.shape

        all_tokens_hidden = outputs.logits  # (B, T, C)
        last_token_hidden = outputs.logits[:, -1, :]  # (B, C)
        last_token_hidden = last_token_hidden.unsqueeze(1).expand(B, T, C)

        combined_representation = torch.cat(
            (all_tokens_hidden, last_token_hidden), dim=-1
        )
        logits = self.classifier(combined_representation)
        return logits


class BaselineClassifier(nn.Module):
    def __init__(
        self,
        d_model: int,
        num_layers: int,
        nhead: int,
        max_seq_length: int,
        vocab_size: int,
        pad_token_id: int,
        num_labels: int,
    ) -> None:
        super(BaselineClassifier, self).__init__()
        self.pad_token_id = pad_token_id
        self.token_embedding = nn.Embedding(
            vocab_size, d_model, padding_idx=pad_token_id
        )
        self.pos_embedding = nn.Embedding(max_seq_length, d_model)
        decoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(decoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(d_model * 2, num_labels)

    def forward(self, token_ids: torch.tensor) -> torch.tensor:
        batch_size, seq_len = token_ids.shape

        token_emb = self.token_embedding(token_ids)
        pos_ids = torch.arange(seq_len, device=token_ids.device).unsqueeze(0)
        pos_emb = self.pos_embedding(pos_ids)
        embeddings = token_emb + pos_emb

        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, device=token_ids.device, dtype=torch.bool),
            diagonal=1,
        )

        pad_mask = token_ids.eq(self.pad_token_id)  # shape: (batch_size, seq_len)

        output = self.transformer(
            embeddings, mask=causal_mask, src_key_padding_mask=pad_mask
        )

        B, T, C = output.shape
        all_tokens_hidden = output  # (B, T, C)
        last_token_hidden = output[:, -1, :]  # (B, C)
        last_token_hidden = last_token_hidden.unsqueeze(1).expand(B, T, C)

        combined_representation = torch.cat(
            (all_tokens_hidden, last_token_hidden), dim=-1
        )
        logits = self.classifier(combined_representation)
        return logits


In [54]:
from typing import Dict
BASELINE_MODELS: Dict[str, Dict[str, int]] = {
    "mini": {
        "d_model": 64,
        "num_layers": 4,
        "num_heads": 4,
        "max_len": 16_384,
    },
    "small": {
        "d_model": 510,
        "num_layers": 8,
        "num_heads": 6,
        "max_len": 16_384,
    },
    "medium": {
        "d_model": 1344,
        "num_layers": 24,
        "num_heads": 16,
        "max_len": 16_384,
    },
    "large": {
        "d_model": 1824,
        "num_layers": 36,
        "num_heads": 24,
        "max_len": 16_384,
    },
}

In [55]:
tmp()

Model: mini, Total Parameters: 10.49M, % emb parameters: 0.7928761601769944
Active params: 2.17M, emb params: 8.32M
Used VRAM: 3461.45 MB
Model: small, Total Parameters: 99.75M, % emb parameters: 0.6646892012547166
Active params: 33.45M, emb params: 66.30M
Used VRAM: 3802.29 MB
Model: medium, Total Parameters: 502.61M, % emb parameters: 0.34762279350401076
Active params: 327.89M, emb params: 174.72M
Used VRAM: 5349.83 MB
Model: large, Total Parameters: 1015.72M, % emb parameters: 0.23345000116597012
Active params: 778.60M, emb params: 237.12M
Used VRAM: 7296.51 MB


In [20]:
def tmp():
    for name, config in BASELINE_MODELS.items():
        d_model = config["d_model"]
        num_layers = config["num_layers"]
        nhead = config["num_heads"]
        max_seq_length = config["max_len"]
        vocab_size = 130_000
        pad_token_id = 0
        num_labels = 2

        model = BaselineClassifier(
            d_model,
            num_layers,
            nhead,
            max_seq_length,
            vocab_size,
            pad_token_id,
            num_labels,
        )
        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        emb_params = vocab_size * d_model

        model.cuda()
        used_memory = torch.cuda.memory_allocated(device=torch.device("cuda:0"))
        print(f"Model: {name}, Total Parameters: {total_params / 1e6:.2f}M, % emb parameters: {emb_params/total_params}")
        print(f"Active params: {(total_params - emb_params) / 1e6:.2f}M, emb params: {emb_params / 1e6:.2f}M")
        print(f"Used VRAM: {used_memory / (1024 ** 2):.2f} MB")

In [1]:
import os
def get_csv_paths(folder_path, recursive=False):
    if recursive:
        # Walk through all subdirectories
        file_paths = [os.path.join(root, file) 
                      for root, _, files in os.walk(folder_path) 
                      for file in files if file.endswith('.csv')]
    else:
        # Get files in the root folder only
        file_paths = [os.path.join(folder_path, file) 
                      for file in os.listdir(folder_path) 
                      if file.endswith('.csv')]
    
    return file_paths

In [2]:
DATA_HUMAN_PATH = "../data/data_human/"
DATA_AI_PATH = "../data/data_ai/"

In [3]:
paths = get_csv_paths(DATA_AI_PATH, recursive=True) + get_csv_paths(DATA_HUMAN_PATH)

In [4]:
import pandas as pd

In [5]:
for path in paths:
    print(path)
    sample = pd.read_csv(path).sample(1)
    print(sample["text"].tolist()[0])
    print("-" * 20)

../data/data_ai/blogs/blogs_Phi-3-small-128k-instruct.csv
My haircut appointment is set for 3:00 PM on Wednesday. Does that work well for you?
--------------------
../data/data_ai/blogs/blogs_Llama-3.2-3B-Instruct.csv
I've had the pleasure of knowing Nadia for about three years now, and it wasn't until we became classmates in Grade 6 that I truly appreciated her exceptional qualities. As a diligent and kind prefect, Nadia is well-respected by her peers and teachers alike. She possesses a deep sense of sensitivity, which makes her a compassionate and empathetic individual.

What struck me most about Nadia was her unwavering dedication to her duties as a prefect. She took her responsibilities very seriously, often going above and beyond to ensure that everything ran smoothly. Her strict approach to her duties was not meant to be intimidating, but rather to help her classmates improve and reach their full potential.

Despite her tough exterior, Nadia is a kind and caring friend who is alw

In [10]:
def strip_brackets(s: str) -> str:
    """
    Remove a leading '", [' or '[' and a trailing '", ]' or ']' from the given string.
    """
    # Remove leading patterns
    for prefix in ('"', '[', '["'):
        if s.startswith(prefix):
            s = s[len(prefix):]
            break

    # Remove trailing patterns
    for suffix in ('"', ']', '"]'):
        if s.endswith(suffix):
            s = s[:-len(suffix)]
            break

    return s

In [11]:
for path in paths:
    print(path)
    sample = pd.read_csv(path).sample(1)
    sample["text"] = sample["text"].apply(strip_brackets)
    print(sample["text"].tolist()[0])
    print("-" * 20)

../data/data_ai/blogs/blogs_Phi-3-small-128k-instruct.csv
The Toronto Towers are excited to announce that we've selected talented outfielder Gary Thomasson in the twelfth round of the highly anticipated APBL draft. At 25 years old and hailing from San Diego, Thomasson brings a wealth of experience to our team, having spent the last five seasons with the San Francisco Giants. Throughout his major league career thus far, he has amassed 312 hits, 21 home runs, and 130 RBI, boasting a career batting average of .253. Thomasson will proudly don uniform number 12 as he joins the Toronto Towers.
--------------------
../data/data_ai/blogs/blogs_Llama-3.2-3B-Instruct.csv
Azim Premji, the Chairman of Wipro, has been named among the top 25 most powerful business leaders outside the United States by Fortune magazine. This prestigious recognition is a testament to Premji's exceptional leadership and vision in transforming Wipro into the largest publicly traded company in India's dynamic technology s