In [1]:
import torch 
from datasets import load_dataset

In [2]:
dataset = load_dataset('csv', data_files={"train": "data/train.csv", "test": "data/test.csv"})
# dataset = load_dataset("yelp_review_full")

In [3]:
# choose a subset of train data to explore the dataset
train_data = dataset["train"].select(range(5000))

In [4]:
# for the mbti column, if it ends with J, label it as 1, otherwise 0
train_data = train_data.map(lambda x: {"mbti": 1 if x["mbti"].endswith("J") else 0})

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [5]:
# split that into train and validation set using datasets library
train_data = train_data.train_test_split(test_size=0.2)
train_data

DatasetDict({
    train: Dataset({
        features: ['body', 'mbti'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['body', 'mbti'],
        num_rows: 1000
    })
})

In [6]:
# print the first 5 rows of the train set
train_data["train"][0:5]

{'body': ['c9 with sing had more balls than fnatic and picked it hopefully well see it again this year with our one true meepo madoka sakura trick strawberry panic catholic school for lesbians the animation and my favorite yuruyuri and romance animes cough cough dusk maiden of amnesia cough cough and bounty hunter hero has 100 winrate and is being first banned now',
  'the thin attractive ones are waiters now for the girls 11 is me what is the reference? actual question im just getting back into boarding and we have some nice hills in my area that i want to try to go down any recommendations on good helmets pads gloves etc? oh see i thought you were pointing to one of the names of the drinks that was supposed to make a connection  now i get it',
  'you seem to misunderstand pzombies by definition dont have a subjective view ks fan? besides kenji which route is your favorite?  though im guessing you just stopped after you got the manly picnic  nice waifu btw both are from europe majoras

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
def tokenize_function(examples):
    return tokenizer(examples["body"], padding="max_length", truncation=True)
tokenized_datasets = train_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
torch.cuda.empty_cache()

In [10]:
tokenized_datasets = tokenized_datasets.remove_columns(["body"])
tokenized_datasets = tokenized_datasets.rename_column("mbti", "labels")
tokenized_datasets.set_format("torch")

In [11]:
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
small_train_dataset = tokenized_datasets["train"]
small_eval_dataset = tokenized_datasets["test"]

In [12]:
small_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 4000
})

In [13]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=2)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=2)

In [14]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("microsoft/Phi-3-mini-4k-instruct", num_labels=2)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Phi3ForSequenceClassification were not initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [16]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [17]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

Phi3ForSequenceClassification(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (score): Linear(in_features=3072, out_f

In [18]:
from tqdm.auto import tqdm
torch.cuda.empty_cache()
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # batch = {k: v.to(device) for k, v in batch.items()}
        batch = {k: torch.Tensor(v).to('cuda') for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/6000 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
You are not running the flash-attention implementation, expect numerical differences.


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU 

In [None]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.573}