In [1]:
import joblib

In [2]:
# Download the ZIP file from the Google Drive link
!gdown --id 1lhwb66fn2iazwWaAqGOFB0ZOD8vm-BDT -O data.zip

# Extract the ZIP file into /content/data and remove the ZIP files
import os
import zipfile

zip_path = 'data.zip'
output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)

# Extract the main ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(output_dir)

# Remove the main ZIP file
os.remove(zip_path)
print(f"Removed zip: {zip_path}")

# Remove any nested ZIP files in the destination folder
for root, _, files in os.walk(output_dir):
    for file in files:
        if file.endswith('.zip'):
            zip_file_path = os.path.join(root, file)
            with zipfile.ZipFile(zip_file_path, 'r') as nested_zip_ref:
                nested_zip_ref.extractall(output_dir)
            os.remove(zip_file_path)
            print(f"Removed nested zip: {zip_file_path}")

# Display the extracted files
print("Extracted files in:", output_dir)
print("Contents of the folder:", os.listdir(output_dir))


Downloading...
From (original): https://drive.google.com/uc?id=1lhwb66fn2iazwWaAqGOFB0ZOD8vm-BDT
From (redirected): https://drive.google.com/uc?id=1lhwb66fn2iazwWaAqGOFB0ZOD8vm-BDT&confirm=t&uuid=0c1296b8-9983-46d6-a25e-e83366285eaa
To: /content/data.zip
100% 55.2M/55.2M [00:00<00:00, 56.0MB/s]
Removed zip: data.zip
Removed nested zip: data/sample_submission.csv.zip
Removed nested zip: data/test_labels.csv.zip
Removed nested zip: data/train.csv.zip
Removed nested zip: data/test.csv.zip
Extracted files in: data
Contents of the folder: ['sample_submission.csv', 'test.csv', 'test_labels.csv', 'train.csv']


In [None]:
# Install gdown if not already installed
!pip install gdown

# Download the ZIP file from Google Drive
!gdown --id 1tWp3zkdMZph8qyNCZg_TTIJXIEuZ1pR9 -O tokenized_data.zip

# Unzip the file
import shutil
shutil.unpack_archive('tokenized_data.zip')

print("File unzipped. Ready to load.")


Downloading...
From (original): https://drive.google.com/uc?id=1tWp3zkdMZph8qyNCZg_TTIJXIEuZ1pR9
From (redirected): https://drive.google.com/uc?id=1tWp3zkdMZph8qyNCZg_TTIJXIEuZ1pR9&confirm=t&uuid=6bf5e6f9-3a17-4318-8496-80a272bede49
To: /content/tokenized_data.zip
100% 20.1M/20.1M [00:00<00:00, 46.6MB/s]
File unzipped. Ready to load.


In [None]:
# Install gdown if not already installed
!pip install gdown

# Download the ZIP file from Google Drive
!gdown --id 1SfK35wgeo8F4XPbSE3spyIxHH4EDs-K3 -O train_processed.zip

# Unzip the file
import shutil
shutil.unpack_archive('train_processed.zip')

print("File unzipped. Ready to load.")


Downloading...
From (original): https://drive.google.com/uc?id=1SfK35wgeo8F4XPbSE3spyIxHH4EDs-K3
From (redirected): https://drive.google.com/uc?id=1SfK35wgeo8F4XPbSE3spyIxHH4EDs-K3&confirm=t&uuid=a1dcb480-26c5-4f84-aa24-5b344846a90e
To: /content/train_processed.zip
100% 36.4M/36.4M [00:01<00:00, 32.1MB/s]
File unzipped. Ready to load.


In [None]:
import pandas as pd
# Raw training table; its label columns are used later to build the targets
train_path = 'data/train.csv'
df_train = pd.read_csv(filepath_or_buffer=train_path)

In [None]:
# Features and targets later passed to the logistic-regression fit.
# NOTE(review): joblib.load unpickles its input, so these files must come from a
# trusted source. Presumably they are unpacked from train_processed.zip — verify.
features_file, labels_file = "X_train.joblib", "y_train.joblib"
X_train = joblib.load(features_file)
y_train = joblib.load(labels_file)

In [None]:
import torch

# Use the GPU if available, otherwise fall back to the CPU.
# The device is chosen before loading so the saved tensors can be mapped onto it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the tokenized data.
# map_location is required for the CPU fallback to actually work: without it,
# torch.load tries to restore each tensor on the device it was saved from and
# fails on a CPU-only runtime if the file was written from a CUDA device.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code.
# If the file is known to contain only a plain dict of tensors, pass
# weights_only=True here (this also silences the FutureWarning torch emits).
file_path = 'tokenized_train_data.pt'
saved_data = torch.load(file_path, map_location=device)

print("STO USANDO " , device)
tokenized_data = {
    'input_ids': saved_data['input_ids'].to(device),
    'attention_mask': saved_data['attention_mask'].to(device),
}

print("Tokenized data loaded and moved to", device)


  saved_data = torch.load(file_path)


STO USANDO  cuda
Tokenized data loaded and moved to cuda


In [None]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class ToxicCommentsDataset(Dataset):
    """Map-style dataset pairing tokenized comments with their label vectors.

    Args:
        input_ids: Tensor of token ids, indexed along its first dimension.
        attention_mask: Tensor of attention masks, one row per ``input_ids`` row.
        labels: Tensor of targets, one row per ``input_ids`` row. It is stored
            converted to float via ``.float()``.

    Raises:
        ValueError: If the three inputs do not have the same number of rows.
    """

    def __init__(self, input_ids, attention_mask, labels):
        # __len__ is driven by input_ids alone, so a shorter mask/label tensor
        # would only fail late (IndexError mid-epoch) and a longer one would be
        # silently truncated. Fail fast instead.
        n_samples = len(input_ids)
        if len(attention_mask) != n_samples or len(labels) != n_samples:
            raise ValueError(
                "input_ids, attention_mask and labels must have the same number of rows, "
                f"got {n_samples}, {len(attention_mask)} and {len(labels)}"
            )
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels.float()  # Convert to float

    def __len__(self):
        """Return the number of samples (rows of ``input_ids``)."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with keys
        ``'input_ids'``, ``'attention_mask'`` and ``'labels'``."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }


# Target columns read from df_train, in the order they appear in the label tensor
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Labels are joined to the tokenized rows purely by position, so both sources
# must contain the same number of rows. Otherwise labels would be silently
# attached to the wrong comments (or .iloc would fail with an out-of-bounds index).
n_samples = len(tokenized_data['input_ids'])
if len(df_train) != n_samples:
    raise ValueError(
        f"df_train has {len(df_train)} rows but the tokenized data has {n_samples}; "
        "labels cannot be aligned by position"
    )

# Split data into training and validation sets (80/20, fixed seed for reproducibility)
train_indices, val_indices = train_test_split(
    range(n_samples), test_size=0.2, random_state=42
)

train_dataset = ToxicCommentsDataset(
    input_ids=tokenized_data['input_ids'][train_indices],
    attention_mask=tokenized_data['attention_mask'][train_indices],
    labels=torch.tensor(df_train.iloc[train_indices][LABEL_COLUMNS].values)
)

val_dataset = ToxicCommentsDataset(
    input_ids=tokenized_data['input_ids'][val_indices],
    attention_mask=tokenized_data['attention_mask'][val_indices],
    labels=torch.tensor(df_train.iloc[val_indices][LABEL_COLUMNS].values)
)

# Create DataLoaders (only the training set is shuffled)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)


In [None]:
from transformers import DistilBertForSequenceClassification

# Target device: GPU when torch can see one, otherwise CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Pretrained DistilBERT with a sequence-classification head configured for
# six labels in multi-label mode
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=6,  # Multilabel task
)

# Place the model's parameters on the selected device
model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

# Loss function for multilabel classification
# NOTE(review): loss_fn is not called below — the loop uses the loss returned by
# the model itself (labels are passed to the forward call).
loss_fn = torch.nn.BCEWithLogitsLoss()

# Optimizer and linearly decaying learning-rate schedule (no warmup)
optimizer = AdamW(model.parameters(), lr=2e-5)  # Lowered the learning rate for a longer training run
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)  # Computed for 3 epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

from tqdm import tqdm


def forward_batch(batch):
    """Move one batch to ``device`` and run it through ``model``.

    Labels are passed along, so the returned output carries a ``loss``.
    """
    return model(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        labels=batch['labels'].to(device)
    )


# Training loop: one optimisation pass followed by one validation pass per epoch
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Training phase
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        optimizer.zero_grad()
        loss = forward_batch(batch).loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per batch

    print(f"Training Loss: {train_loss / len(train_loader):.4f}")

    # Validation phase: no gradients, accumulate the per-batch losses
    model.eval()
    with torch.no_grad():
        val_loss = sum(
            forward_batch(batch).loss.item()
            for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}")
        )

    print(f"Validation Loss: {val_loss / len(val_loader):.4f}")


Epoch 1/3


Training Epoch 1: 100%|██████████| 1995/1995 [20:13<00:00,  1.64it/s]


Training Loss: 0.0584


Validation Epoch 1: 100%|██████████| 499/499 [01:47<00:00,  4.62it/s]


Validation Loss: 0.0398
Epoch 2/3


Training Epoch 2: 100%|██████████| 1995/1995 [20:16<00:00,  1.64it/s]


Training Loss: 0.0354


Validation Epoch 2: 100%|██████████| 499/499 [01:47<00:00,  4.62it/s]


Validation Loss: 0.0374
Epoch 3/3


Training Epoch 3: 100%|██████████| 1995/1995 [20:15<00:00,  1.64it/s]


Training Loss: 0.0298


Validation Epoch 3: 100%|██████████| 499/499 [01:48<00:00,  4.62it/s]

Validation Loss: 0.0382





In [None]:
# Persist the fine-tuned model under a local path
save_dir = 'bert-toxic-comment-classification'
model.save_pretrained(save_dir)
print("Model saved!")


Model saved!


In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

# Base binary classifier; OneVsRestClassifier fits one copy of it per label
base_classifier = LogisticRegression(class_weight="balanced", max_iter=1000)
model_logit = OneVsRestClassifier(base_classifier)

# Fit all per-label classifiers in a single call
model_logit.fit(X_train, y_train)


In [None]:
# Serialize the fitted one-vs-rest model to disk (the call returns the list of files written)
joblib.dump(value=model_logit, filename="logistic_regression_multilabel.joblib")


['logistic_regression_multilabel.joblib']