In [1]:
import pandas as pd
import numpy as np
data=pd.read_csv('/content/drive/MyDrive/kaggle api/cleaned_text.csv')
data.head()

Unnamed: 0,Text,Label
0,feel really helpless heavy hearted,4
1,ive enjoyed able slouch relax unwind frankly n...,0
2,gave internship dmrg feeling distraught,4
3,dont know feel lost,0
4,kindergarten teacher thoroughly weary job take...,4


In [2]:
# Check for missing or non-string values
print(data['Text'].isnull().sum())                # Count NaNs
print((data['Text'].apply(lambda x: not isinstance(x, str))).sum())  # Count non-string types

16
16


In [3]:
# Drop rows with missing or non-string Text
data = data.dropna(subset=['Text'])  # Drop rows with NaN in 'Text'
data = data[data['Text'].apply(lambda x: isinstance(x, str))]  # Keep only strings
data = data[data['Text'].str.strip() != ""]  # Remove empty strings

In [4]:
data.value_counts("Label")

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,141064
0,121183
3,57311
4,47709
2,34554
5,14972


In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [6]:
!pip install datasets



In [7]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
train_df, val_df = train_test_split(data, test_size=0.2, random_state=42, stratify=data['Label'])
print(f"Training set size: {len(train_df)}, Validation set size: {len(val_df)}")
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

Training set size: 333434, Validation set size: 83359


In [8]:
def tokenize(example):
    return tokenizer(example["Text"], padding='max_length', truncation=True)

train_dataset = train_dataset.rename_column("Label", "labels")
val_dataset = val_dataset.rename_column("Label", "labels")

tokenized_train = train_dataset.map(tokenize, batched=True)
tokenized_val = val_dataset.map(tokenize, batched=True)

tokenized_train.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/333434 [00:00<?, ? examples/s]

Map:   0%|          | 0/83359 [00:00<?, ? examples/s]

In [9]:
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EvalPrediction,
    set_seed
)

num_labels = data['Label'].nunique()
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=num_labels)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
!pip install weave



In [11]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mkhoidang1209[0m ([33mkhoidang1209-international-university-vnu-hcmc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
import torch
print(torch.cuda.is_available())   # Should print True
print(torch.cuda.get_device_name(0))  # Shows GPU name if available


True
Tesla T4


In [14]:
!pip install accelerate



In [16]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [17]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,  # Enable Mixed Precision
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

def compute_metrics(eval_pred):
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkhoidang1209[0m ([33mkhoidang1209-international-university-vnu-hcmc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.096,0.090183,0.94141,0.941988,0.946566,0.94141
2,0.0834,0.087584,0.941338,0.940317,0.945784,0.941338
3,0.0806,0.088777,0.940114,0.940398,0.941628,0.940114


TrainOutput(global_step=7815, training_loss=0.11111535601942339, metrics={'train_runtime': 11628.1025, 'train_samples_per_second': 86.025, 'train_steps_per_second': 0.672, 'total_flos': 1.3251685613613466e+17, 'train_loss': 0.11111535601942339, 'epoch': 3.0})