<a href="https://colab.research.google.com/github/HAL22/Kaggle-Competitions/blob/Toxic-Comment-Classification-Challenge/Toxic_Comment_Classification_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Importing libraries

In [1]:
%%capture
!pip install kaggle
import pandas as pd
!pip install datasets transformers[sentencepiece]
!apt install git-lfs
from datasets import load_dataset
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.model_selection import train_test_split

In [2]:
# Install the Kaggle API token (kaggle.json must be in the current directory).
# -p keeps the cell re-runnable when ~/.kaggle already exists.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict permissions on the token file; use ~ consistently instead of hard-coding /root.
!chmod 600 ~/.kaggle/kaggle.json

2. Configuring git, importing Hugging Face libraries and logging in

In [3]:
!git config --global user.email "thethelafaltein@gmail.com"
!git config --global user.name "HAL22"

In [4]:
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [5]:
from datasets import Features, Value, ClassLabel

3. Downloading data from Kaggle

In [6]:
%%capture
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

4. Data processing

In [7]:
# Names of the six toxicity label columns.
class_names = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Feature schema with a string `text` field and a ClassLabel `label` field over class_names.
toxic_features = Features(
    {
        "text": Value("string"),
        "label": ClassLabel(names=class_names),
    }
)

In [8]:
# Assumes /content contains train.csv, test.csv and test_labels.csv from the Kaggle download.
# Load the train split, then draw a small 800/200 train/test subsample from it.
# A fixed seed makes the subsample (and everything trained on it) reproducible across runs.
ds = (load_dataset("jigsaw_toxicity_pred", data_dir="/content", split='train')
        .train_test_split(train_size=800, test_size=200, seed=42))

Downloading builder script:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.57k [00:00<?, ?B/s]



Downloading and preparing dataset jigsaw_toxicity_pred/default (download: Unknown size, generated: 94.91 MiB, post-processed: Unknown size, total: 94.91 MiB) to /root/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-data_dir=%2Fcontent/1.1.0/9cf096ac4341c35839bc8a9f6a19d93e18e5ad3d84cf05f690d2bc6f7384af85...


Generating train split:   0%|          | 0/159571 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/63978 [00:00<?, ? examples/s]

Dataset jigsaw_toxicity_pred downloaded and prepared to /root/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-data_dir=%2Fcontent/1.1.0/9cf096ac4341c35839bc8a9f6a19d93e18e5ad3d84cf05f690d2bc6f7384af85. Subsequent calls will reuse this data.


In [9]:
ds

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 800
    })
    test: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 200
    })
})

In [10]:
ds['train'][0]

{'comment_text': "Not 'involved', please. SV was the un blocking admin. Why the (possibly wrong)? Are we children.",
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0}

In [11]:
ds['test'][0]

{'comment_text': "I'll gladly challenge that allegation. The history of the page speaks for itself.",
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0}

In [12]:
# Make the data ready for the model by creating a new column called "labels":
# a list with one 0/1 entry per label column, in column order.
# "labels" itself is excluded so that re-running this cell does not nest the
# previously built list inside the new one.
cols = [c for c in ds["train"].column_names if c not in ("comment_text", "labels")]
ds = ds.map(lambda x : {"labels": [x[c] for c in cols]})
ds

  0%|          | 0/800 [00:00<?, ?ex/s]

  0%|          | 0/200 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'labels'],
        num_rows: 200
    })
})

In [13]:
ds['train'][0]

{'comment_text': "Not 'involved', please. SV was the un blocking admin. Why the (possibly wrong)? Are we children.",
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0,
 'labels': [0, 0, 0, 0, 0, 0]}

4.1 Tokenize

In [14]:
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer)
import torch

In [22]:
# Name of the pretrained checkpoint; reused later when loading the classification model.
model = "distilbert-base-uncased"
# `problem_type` is a model-configuration option, not a tokenizer option, so it is
# passed only where the model is loaded.
tokenizer = AutoTokenizer.from_pretrained(model)

In [23]:
def tokenize_and_encode(examples):
    """Tokenize the ``comment_text`` field of ``examples`` with the notebook's tokenizer.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    encoded = tokenizer(examples["comment_text"], truncation=True)
    return encoded

In [24]:
# Collect every column except "labels"; these are dropped once the text is tokenized.
train_columns = ds["train"].column_names
cols = list(train_columns)
print(cols)
cols.remove("labels")
print(cols)

# Tokenize in batches, keeping only "labels" plus the tokenizer outputs.
ds_enc = ds.map(
    tokenize_and_encode,
    batched=True,
    remove_columns=cols,
)
ds_enc



['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'labels']
['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [25]:
# Return torch tensors from the dataset, then replace the "labels" column
# with a float-typed copy of itself.
ds_enc.set_format("torch")
ds_enc = ds_enc.map(
    lambda example: {"float_labels": example["labels"].to(torch.float)},
    remove_columns=["labels"],
)
ds_enc = ds_enc.rename_column("float_labels", "labels")



  0%|          | 0/200 [00:00<?, ?ex/s]

In [26]:
ds_enc

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

5. Load the model

In [27]:
# One output per toxicity label.
num_labels=6
# Use the GPU when one is available, otherwise fall back to CPU so the cell
# does not crash on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_bert = AutoModelForSequenceClassification.from_pretrained(model, num_labels=num_labels, problem_type="multi_label_classification").to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [41]:
# Training configuration: three epochs, with /content as the output directory.
args = TrainingArguments(
    output_dir="/content",
    num_train_epochs=3,
)

# Trainer over the encoded train/test subsets, using the model and tokenizer loaded above.
trainer = Trainer(
    model=model_bert,
    args=args,
    train_dataset=ds_enc["train"],
    eval_dataset=ds_enc["test"],
    tokenizer=tokenizer,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [42]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 200
  Batch size = 8


{'eval_loss': 0.05395181477069855,
 'eval_runtime': 1.8361,
 'eval_samples_per_second': 108.929,
 'eval_steps_per_second': 13.616}

In [44]:
# Run training and keep the value returned by trainer.train() (a later cell displays it).
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of the
# session and holds the training output, not an evaluation result; a later cell reads
# this name, so any rename has to be applied in both places.
eval = trainer.train()

***** Running training *****
  Num examples = 800
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 300


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




In [33]:
trainer.save_model(output_dir="/content")

Saving model checkpoint to /content
Configuration saved in /content/config.json
Model weights saved in /content/pytorch_model.bin
tokenizer config file saved in /content/tokenizer_config.json
Special tokens file saved in /content/special_tokens_map.json


In [36]:
val = trainer.evaluate()

***** Running Evaluation *****
  Num examples = 200
  Batch size = 8


In [38]:
val.values()

dict_values([0.05395181477069855, 1.8425, 108.547, 13.568, 3.0])

In [45]:
eval

TrainOutput(global_step=300, training_loss=0.028129644393920898, metrics={'train_runtime': 58.258, 'train_samples_per_second': 41.196, 'train_steps_per_second': 5.15, 'total_flos': 151125034452768.0, 'train_loss': 0.028129644393920898, 'epoch': 3.0})

In [46]:
predictions = trainer.predict(ds_enc['test'])

***** Running Prediction *****
  Num examples = 200
  Batch size = 8


In [47]:
predictions

PredictionOutput(predictions=array([[-6.738483 , -8.012385 , -7.4429736, -8.208766 , -7.135232 ,
        -6.091413 ],
       [-5.058591 , -8.1911745, -7.1381736, -8.415766 , -6.7620625,
        -5.9923944],
       [-6.5505953, -8.134446 , -7.4614654, -8.299474 , -7.1685963,
        -6.071233 ],
       ...,
       [-1.3327954, -7.3502736, -5.4077096, -7.719596 , -5.044203 ,
        -5.103164 ],
       [-3.536598 , -7.8566494, -6.5468326, -8.177834 , -6.1312313,
        -5.683433 ],
       [-6.707237 , -8.087137 , -7.434837 , -8.254186 , -7.112827 ,
        -6.135437 ]], dtype=float32), label_ids=array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]], dtype=float32), metrics={'test_loss': 0.05543757602572441, 'test_runtime': 2.0066, 'test_samples_per_second': 99.673, 'test_steps_per_second': 12.459})

In [48]:
# Save the fine-tuned model together with the tokenizer files so the directory is
# self-contained and can be loaded by path (a later cell builds a pipeline from it).
model_bert.save_pretrained("fine_tuned_model")
# Trailing ';' suppresses the tuple of saved file paths in the cell output.
tokenizer.save_pretrained("fine_tuned_model");



Configuration saved in fine_tuned_model/config.json
Model weights saved in fine_tuned_model/pytorch_model.bin


In [50]:
from transformers import pipeline

# Load from the same relative directory that `save_pretrained("fine_tuned_model")`
# wrote to, so loading does not depend on the working directory being /content.
clf = pipeline("text-classification", "fine_tuned_model")

loading configuration file /content/fine_tuned_model/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/fine_tuned_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "multi_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}

loading configuratio

In [53]:
clf("Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me",return_all_scores=True)

[[{'label': 'LABEL_0', 'score': 0.9765037894248962},
  {'label': 'LABEL_1', 'score': 0.30014175176620483},
  {'label': 'LABEL_2', 'score': 0.9280667901039124},
  {'label': 'LABEL_3', 'score': 0.06726877391338348},
  {'label': 'LABEL_4', 'score': 0.8652555346488953},
  {'label': 'LABEL_5', 'score': 0.15145337581634521}]]

In [54]:
df = pd.read_csv("/content/train.csv")

In [55]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
