In [None]:
import joblib

In [None]:
# Install gdown to download from Google Drive
!pip install gdown

# Download the saved model from Google Drive
!gdown --id 1eHZe445OQNc2GE7rdzVc7i0-akJPcLuK -O bert-toxic-comment-classification.zip

# Unzip the ZIP file
import zipfile

# Extract the ZIP file into the specified directory
with zipfile.ZipFile("bert-toxic-comment-classification.zip", 'r') as zip_ref:
    zip_ref.extractall("bert-toxic-comment-classification")  # Extracts into the specified directory


Downloading...
From (original): https://drive.google.com/uc?id=1eHZe445OQNc2GE7rdzVc7i0-akJPcLuK
From (redirected): https://drive.google.com/uc?id=1eHZe445OQNc2GE7rdzVc7i0-akJPcLuK&confirm=t&uuid=6ed96a94-f476-441b-a2f9-e46819f361cd
To: /content/bert-toxic-comment-classification.zip
100% 247M/247M [00:10<00:00, 23.7MB/s]


In [None]:
# Download the saved model from Google Drive
!gdown --id 14j1FohHhUFuJKiX1PGq4P538H4ja_KjO -O logistic_regression_multilabel.zip

# Unzip the ZIP file into the current directory
import zipfile

with zipfile.ZipFile("logistic_regression_multilabel.zip", 'r') as zip_ref:
    zip_ref.extractall()  # Extracts into the current working directory

print("Files successfully extracted into the current directory.")


Downloading...
From: https://drive.google.com/uc?id=14j1FohHhUFuJKiX1PGq4P538H4ja_KjO
To: /content/logistic_regression_multilabel.zip
100% 462k/462k [00:00<00:00, 132MB/s]
Archive:  logistic_regression_multilabel.zip
  inflating: logistic_regression_multilabel.joblib  


In [None]:
# Download the ZIP file from Google Drive TODO
!gdown --id 1e4eT92P-i_QGcBEwHKf66iLn1aSpMr8e -O tokenized_test_data.zip

# Unzip the file
import shutil
shutil.unpack_archive('tokenized_test_data.zip')

print("File unzipped. Ready to load.")


Downloading...
From (original): https://drive.google.com/uc?id=1e4eT92P-i_QGcBEwHKf66iLn1aSpMr8e
From (redirected): https://drive.google.com/uc?id=1e4eT92P-i_QGcBEwHKf66iLn1aSpMr8e&confirm=t&uuid=d9016573-7d6a-4c24-805a-a5415f1dbcda
To: /content/tokenized_test_data.zip
100% 7.85M/7.85M [00:00<00:00, 86.9MB/s]
File unzipped. Ready to load.


In [None]:
# Download the saved model from Google Drive
!gdown --id 1LDmkBpIjEJy3rMghZEF3evM2Imt0wu9g -O test_processed.zip

# Unzip the ZIP file
with zipfile.ZipFile("test_processed.zip", 'r') as zip_ref:
    zip_ref.extractall()  # Extracts into the current working directory


Downloading...
From: https://drive.google.com/uc?id=1LDmkBpIjEJy3rMghZEF3evM2Imt0wu9g
To: /content/test_processed.zip
100% 13.3M/13.3M [00:00<00:00, 31.8MB/s]


In [None]:
from transformers import DistilBertForSequenceClassification

# Folder the downloaded model archive was extracted into. The path is relative,
# so it is resolved against the current working directory.
model_path = "bert-toxic-comment-classification"

# Load the fine-tuned sequence-classification model from that folder.
# `model` is the notebook-level name the inference cell uses later.
model = DistilBertForSequenceClassification.from_pretrained(model_path)


In [None]:
import torch

# Load the pre-tokenized test inputs; later cells index the result with
# 'input_ids' and 'attention_mask'.
#
# weights_only is passed explicitly instead of relying on the library default,
# which changed to True in newer PyTorch releases. With weights_only=True only
# tensors and plain containers are unpickled.
# NOTE(review): assumes the file holds a pickled tokenizer-output object rather
# than a plain dict of tensors — confirm; if it is a plain dict, switch to
# weights_only=True.
# SECURITY: weights_only=False runs the full pickle machinery, which can execute
# arbitrary code. Only load files from a source you trust.
test_encodings = torch.load("tokenized_test_data.pt", weights_only=False)
print("Test encodings loaded successfully!")


Test encodings loaded successfully!


  test_encodings = torch.load("/content/tokenized_test_data.pt")


In [None]:
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np

# Run on the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Pair every row of token ids with its attention mask
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])

# Fixed order (no shuffling) so prediction rows line up with the input rows
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

# Put the model on the chosen device and switch it to evaluation mode
model.to(device)
model.eval()

# One numpy array of per-label sigmoid scores is collected per batch
all_predictions = []
with torch.no_grad():  # inference only: no gradients are tracked
    for batch in tqdm(test_loader, desc="Generating Predictions"):
        # Each batch is (input_ids, attention_mask), matching the dataset above
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Sigmoid maps every logit to an independent score in (0, 1)
        probabilities = torch.sigmoid(outputs.logits)
        all_predictions.append(probabilities.cpu().numpy())

# Stack the per-batch arrays along the row axis into a single array
predictions = np.concatenate(all_predictions, axis=0)


cuda


Generating Predictions: 100%|██████████| 63/63 [03:16<00:00,  3.13s/it]


In [None]:
import pandas as pd

# Column headers for the saved table, one per predicted label.
# Assumes `predictions` has exactly six columns in this order — TODO confirm.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# `predictions` is the numpy array produced by the inference cell
predictions_df = pd.DataFrame(predictions, columns=label_columns)

# Write the table as CSV without the index column, and also save the raw array via torch.save
predictions_df.to_csv("predictions_distillbert.csv", index=False)
torch.save(predictions, "predictions_distillbert.pt")

print("Predictions saved")


Predictions saved


In [None]:
# Load the test features, the test labels and the fitted logistic-regression
# model, in that order, from joblib files in the current working directory.
# NOTE: joblib.load unpickles its input; only load files from a trusted source.
X_test, y_test, model_logit = (
    joblib.load(artifact_path)
    for artifact_path in (
        "X_test.joblib",
        "y_test.joblib",
        "logistic_regression_multilabel.joblib",
    )
)

In [None]:
# Score the test features with the logistic-regression model loaded earlier.
# The recorded output is a 2-D array with six probabilities per row.
y_pred_proba = model_logit.predict_proba(X_test)  # Probabilities for each class
y_pred_proba  # bare last expression -> the notebook displays the array


array([[0.05089526, 0.00291716, 0.0122912 , 0.00109066, 0.04976864,
        0.0121839 ],
       [0.94871337, 0.01377046, 0.31947876, 0.04932912, 0.41739841,
        0.01525318],
       [0.22978899, 0.01510436, 0.09629121, 0.00213213, 0.06137356,
        0.04954296],
       ...,
       [0.95861711, 0.10804377, 0.25103032, 0.00772219, 0.47969861,
        0.96379178],
       [0.9989163 , 0.92975489, 0.99935146, 0.01090201, 0.99657519,
        0.59801708],
       [0.14908288, 0.02588381, 0.02015762, 0.00904053, 0.04776308,
        0.02741077]])

In [None]:
import pandas as pd

# Column headers for the saved table, one per predicted label.
# Assumes the columns of `y_pred_proba` follow this order — TODO confirm.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Wrap the probability array in a labelled table and write it as CSV without the index column
predictions_logit_df = pd.DataFrame(y_pred_proba, columns=label_columns)
predictions_logit_df.to_csv("predictions_logistic_regression.csv", index=False)

print("Predictions saved to predictions_logistic_regression.csv")

Predictions saved to predictions_logistic_regression.csv
