<a href="https://colab.research.google.com/github/HAL22/Kaggle-Competitions/blob/Toxic-Comment-Classification-Challenge/Toxic_Comment_Classification_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Importing libraries

In [7]:
%%capture
!pip install kaggle
import pandas as pd
!pip install datasets transformers[sentencepiece]
!apt install git-lfs
from datasets import load_dataset
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer)
import torch
from transformers import pipeline
from datasets import Features, Value, ClassLabel

In [2]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

2. Importing Huggingface libraries and logging in

In [3]:
!git config --global user.email "thethelafaltein@gmail.com"
!git config --global user.name "HAL22"

In [5]:
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


3. Downloading data from Kaggle

In [None]:
%%capture
###!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

4. Data processing

In [155]:
class_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
toxic_features = Features({'text': Value('string'), 'label': ClassLabel(names=class_names)})

In [156]:
df_validation = pd.read_csv('test.csv')
df_validation = df_validation[['comment_text']].copy()
validation = Dataset.from_pandas(df_validation)

In [194]:
# Assume in this directory /content you have test,train and test_labels csv
ds = (load_dataset("jigsaw_toxicity_pred", data_dir="/content", split='train')
        .train_test_split(train_size=800, test_size=200))



In [174]:
ds

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 800
    })
    test: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 200
    })
})

In [175]:
ds['train'][0]

{'comment_text': "While adding a link to an actress's IMDB page is fine, it doesn't do much to establish notability.  Like Wikipedia, it is user contributed information and anyone and everyone in the industry can be listed there.  Wikipedia's guidelines are more stringent.  Significant coverage in 3rd party sources has got to be shown here.",
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0}

In [176]:
ds['test'][0]

{'comment_text': 'Governor of the Bank of Jamaica.Im jamaican and i know for a fact that Mr.latibeaudiere resigned on october 30 2009 please look it up \n\nfine keep your bullshit article because i know you know nothing about jamaica,nobody relies on wikipedia here because its full of impearlist assholes like you.up yours with best regards',
 'toxic': 1,
 'severe_toxic': 0,
 'obscene': 1,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0}

In [177]:
# Making the data  ready for the model by creating a new column called labels
cols = ds["train"].column_names
cols

['comment_text',
 'toxic',
 'severe_toxic',
 'obscene',
 'threat',
 'insult',
 'identity_hate']

In [195]:
ds = ds.map(lambda x : {"labels": [x[c] for c in cols if c != "comment_text"]})


  0%|          | 0/800 [00:00<?, ?ex/s]

  0%|          | 0/200 [00:00<?, ?ex/s]

In [179]:
ds['validation'] = validation

In [185]:
ds['train'][0]

{'comment_text': '...What exactly is your fascination with John Wayne?',
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0}

4.1 Tokenize

In [187]:
model = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model, problem_type="multi_label_classification")

In [188]:
def tokenize_and_encode(examples):
  return tokenizer(examples["comment_text"], truncation=True)

In [196]:
cols = ds["train"].column_names
print(cols)
cols.remove("labels")
print(cols)
ds_enc = ds.map(tokenize_and_encode, batched=True, remove_columns=cols)
ds_enc

ds['validation'] = validation

xc = ds.map(tokenize_and_encode, batched=True)

['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'labels']
['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/154 [00:00<?, ?ba/s]

In [150]:
ds_enc.set_format("torch")
ds_enc = (ds_enc
          .map(lambda x : {"float_labels": x["labels"].to(torch.float)}, remove_columns=["labels"])
          .rename_column("float_labels", "labels"))

  0%|          | 0/800 [00:00<?, ?ex/s]

  0%|          | 0/200 [00:00<?, ?ex/s]

In [151]:
ds_enc

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

5. Load the model

In [85]:
num_labels=6
model_bert = AutoModelForSequenceClassification.from_pretrained(model, num_labels=num_labels, problem_type="multi_label_classification").to('cuda')

In [86]:
args = TrainingArguments(num_train_epochs=3,output_dir="/content")

trainer = Trainer(model=model_bert, args=args, train_dataset=ds_enc["train"], eval_dataset=ds_enc["test"], tokenizer=tokenizer)

In [87]:
trainer.evaluate()

{'eval_loss': 0.7036134600639343,
 'eval_runtime': 1.5437,
 'eval_samples_per_second': 129.558,
 'eval_steps_per_second': 16.195}

In [88]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=300, training_loss=0.10911513010660807, metrics={'train_runtime': 58.0695, 'train_samples_per_second': 41.33, 'train_steps_per_second': 5.166, 'total_flos': 165041313358464.0, 'train_loss': 0.10911513010660807, 'epoch': 3.0})

In [90]:
trainer.save_model(output_dir="/content")

In [91]:
model_bert.save_pretrained("fine_tuned_model")



In [103]:
from transformers import pipeline
clf = pipeline("text-classification", '/content/fine_tuned_model')

In [104]:
tx = ["Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me","I hate you"]

In [105]:
clf(tx,return_all_scores=True)

[[{'label': 'toxic', 'score': 0.8654440641403198},
  {'label': 'severe_toxic', 'score': 0.3417045772075653},
  {'label': 'obscene', 'score': 0.8013592958450317},
  {'label': 'threat', 'score': 0.11946466565132141},
  {'label': 'insult', 'score': 0.7284992933273315},
  {'label': 'identity_hate', 'score': 0.2642216384410858}],
 [{'label': 'toxic', 'score': 0.16032901406288147},
  {'label': 'severe_toxic', 'score': 0.011512413620948792},
  {'label': 'obscene', 'score': 0.03239469602704048},
  {'label': 'threat', 'score': 0.007287683896720409},
  {'label': 'insult', 'score': 0.04961101710796356},
  {'label': 'identity_hate', 'score': 0.015737442299723625}]]

In [None]:
df = pd.read_csv("/content/train.csv")

In [124]:
df_validation = pd.read_csv('test.csv')
df_validation = df_validation[['id']].copy()
validation = Dataset.from_pandas(df_validation)

In [190]:
ds['validation'] = validation

In [197]:
pred_eval = trainer.predict(xc['validation'])

In [198]:
pred_eval

PredictionOutput(predictions=array([[ 1.8825537 , -0.58391505,  1.4357582 , -1.9458671 ,  1.0194995 ,
        -1.0099435 ],
       [-3.9156053 , -5.6584735 , -5.064094  , -5.7464848 , -4.7357693 ,
        -5.204923  ],
       [-3.88495   , -5.653568  , -5.0319605 , -5.7351794 , -4.669344  ,
        -5.1810713 ],
       ...,
       [-4.039108  , -5.6276646 , -5.080306  , -5.6804123 , -4.7692933 ,
        -5.139193  ],
       [-3.9828324 , -5.6483154 , -5.089322  , -5.73453   , -4.721493  ,
        -5.1664257 ],
       [-0.12129276, -3.4935858 , -1.5155396 , -4.0966935 , -1.6263384 ,
        -3.0770195 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 1346.6449, 'test_samples_per_second': 113.737, 'test_steps_per_second': 14.218})

In [205]:
eval_sub = np.argmax(pred_eval.predictions, axis=-1)

In [206]:
len(eval_sub)

153164

In [209]:
eval_sub

array([0, 0, 0, ..., 0, 0, 0])

In [207]:
submission = pd.read_csv('/content/sample_submission.csv')
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [208]:
submission[class_names] = eval_sub

ValueError: ignored

In [204]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,1.882554,-0.583915,1.435758,-1.945867,1.0195,-1.009943
1,0000247867823ef7,-3.915605,-5.658473,-5.064094,-5.746485,-4.735769,-5.204923
2,00013b17ad220c46,-3.88495,-5.653568,-5.03196,-5.735179,-4.669344,-5.181071
3,00017563c3f7919a,-4.029534,-5.596353,-5.067165,-5.687569,-4.746125,-5.155338
4,00017695ad8997eb,-3.736657,-5.681209,-5.056615,-5.802628,-4.65236,-5.182059


In [None]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
