### Importing Necessary libraries

In [5]:
import pandas as pd
import re 
import torch


In [6]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



### Loading Dataset

In [26]:
df = pd.read_csv('hate.csv', encoding='ISO-8859-1')
df.head(10)


Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,N
1,2,gay people are a burden to our society,N
2,3,Arabs are not welcome,N
3,4,I'm not saying we should actually eliminate he...,N
4,5,bananas are for black people,N
5,6,asian people are the shit,P
6,7,women can not reproduce kids but men can in al...,N
7,8,i N wearing black in the summer!,P
8,9,Who cares what Chinese people think?,N
9,10,i try to punch a letter box when i see one,N


# Cleaning the data

In [27]:
# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values
df.dropna(subset=['comment', 'label'], inplace=True)

Unnamed: 0    0
comment       0
label         0
dtype: int64


In [29]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,N
1,2,gay people are a burden to our society,N
2,3,Arabs are not welcome,N
3,4,I'm not saying we should actually eliminate he...,N
4,5,bananas are for black people,N
5,6,asian people are the shit,P
6,7,women can not reproduce kids but men can in al...,N
7,8,i N wearing black in the summer!,P
8,9,Who cares what Chinese people think?,N
9,10,i try to punch a letter box when i see one,N


In [30]:
# checking for duplicates 

df.drop_duplicates(subset=['comment'], inplace=True)


In [31]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,N
1,2,gay people are a burden to our society,N
2,3,Arabs are not welcome,N
3,4,I'm not saying we should actually eliminate he...,N
4,5,bananas are for black people,N
5,6,asian people are the shit,P
6,7,women can not reproduce kids but men can in al...,N
7,8,i N wearing black in the summer!,P
8,9,Who cares what Chinese people think?,N
9,10,i try to punch a letter box when i see one,N


In [32]:
# checking the lavle column 
print(df['label'].unique())
df.head(10)


['N' 'P' 'O']


Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,N
1,2,gay people are a burden to our society,N
2,3,Arabs are not welcome,N
3,4,I'm not saying we should actually eliminate he...,N
4,5,bananas are for black people,N
5,6,asian people are the shit,P
6,7,women can not reproduce kids but men can in al...,N
7,8,i N wearing black in the summer!,P
8,9,Who cares what Chinese people think?,N
9,10,i try to punch a letter box when i see one,N


In [33]:
# converting labels  to a consistent format 


# Assuming 'P' is for positive and 'N' is for negative and 'O' for nuetral or others
df['label'] = df['label'].map({'P': 1, 'N': 0, 'O': 2})
df.head(10)


Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,0
1,2,gay people are a burden to our society,0
2,3,Arabs are not welcome,0
3,4,I'm not saying we should actually eliminate he...,0
4,5,bananas are for black people,0
5,6,asian people are the shit,1
6,7,women can not reproduce kids but men can in al...,0
7,8,i N wearing black in the summer!,1
8,9,Who cares what Chinese people think?,0
9,10,i try to punch a letter box when i see one,0


## Pre-processing the data

In [34]:
# Removing special characters and digits

df['cleaned_text'] = df['comment'].apply(lambda x: re.sub('[^A-Za-z\s]', '', x))


In [35]:
# Converting to lowercase
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: x.lower())
df['cleaned_text'].head(5)


0                                  dalits are lowlives
1               gay people are a burden to our society
2                                arabs are not welcome
3    im not saying we should actually eliminate hee...
4                         bananas are for black people
Name: cleaned_text, dtype: object

# Model selection & Model Training

##### According to the instructions, we'll select the BART (Bidirectional and Auto-Regressive Transformers) model. BART has proven to be effective in various NLP tasks, including sentiment analysis.

In [36]:
from transformers import BartTokenizer, BartForSequenceClassification, Trainer, TrainingArguments
import tensorflow as tf
from sklearn.model_selection import train_test_split
import torch
import transformers
import accelerate

In [52]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

# As the previous run was interrupted I am using checkpoint, I am continuing the training from checkpoit 6000 
trainer.train(resume_from_checkpoint=r"C:\Users\Karan Singh\100 days of Machine Learning\Protfolio Projects\Techdome Task\results\checkpoint-6000")


# Assuming df is already loaded and cleaned
# Splitting data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.1, random_state=30)

# Use DistilBERT's tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(train_data['cleaned_text'].tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(test_data['cleaned_text'].tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt")

# Custom Dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Assuming labels are integers (0, 1, 2)
train_dataset = CustomDataset(train_encodings.data, train_data['label'].tolist())
test_dataset = CustomDataset(test_encodings.data, test_data['label'].tolist())

# Calculate steps per epoch
batch_size = 8  # Adjust this based on your GPU's memory
steps_per_epoch = len(train_data) // batch_size

# Initialize DistilBERT for sequence classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    logging_dir='./logs',
    logging_steps=steps_per_epoch,
    evaluation_strategy="epoch",
    do_train=True,
    do_eval=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()




 65%|██████▍   | 6008/9256 [00:35<00:18, 171.34it/s]
100%|█████████▉| 9254/9256 [3:50:03<00:08,  4.14s/it]  

{'loss': 0.342, 'learning_rate': 1.08038029386344e-08, 'epoch': 2.0}


                                                     
100%|██████████| 9256/9256 [4:02:13<00:00,  1.57s/it]


{'eval_loss': 0.565261960029602, 'eval_runtime': 723.0026, 'eval_samples_per_second': 5.69, 'eval_steps_per_second': 0.712, 'epoch': 2.0}
{'train_runtime': 14533.3383, 'train_samples_per_second': 5.094, 'train_steps_per_second': 0.637, 'train_loss': 0.120278679346384, 'epoch': 2.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 50%|████▉     | 4627/9256 [5:06:55<4:51:46,  3.78s/it] 

{'loss': 0.5224, 'learning_rate': 2.5005401901469318e-05, 'epoch': 1.0}


                                                       
 50%|█████     | 4628/9256 [5:17:53<4:19:53,  3.37s/it]

{'eval_loss': 0.46306777000427246, 'eval_runtime': 655.5534, 'eval_samples_per_second': 6.276, 'eval_steps_per_second': 0.786, 'epoch': 1.0}


100%|█████████▉| 9254/9256 [10:25:43<00:07,  3.68s/it]    

{'loss': 0.3674, 'learning_rate': 1.08038029386344e-08, 'epoch': 2.0}


                                                      
100%|██████████| 9256/9256 [10:36:57<00:00,  4.13s/it]

{'eval_loss': 0.5177285075187683, 'eval_runtime': 667.6662, 'eval_samples_per_second': 6.162, 'eval_steps_per_second': 0.771, 'epoch': 2.0}
{'train_runtime': 38217.5048, 'train_samples_per_second': 1.937, 'train_steps_per_second': 0.242, 'train_loss': 0.44492392251614155, 'epoch': 2.0}





TrainOutput(global_step=9256, training_loss=0.44492392251614155, metrics={'train_runtime': 38217.5048, 'train_samples_per_second': 1.937, 'train_steps_per_second': 0.242, 'train_loss': 0.44492392251614155, 'epoch': 2.0})

# Saving Model and Tokenizer

In [53]:
# Save the trained model
model.save_pretrained("./model")

# Save the tokenizer
tokenizer.save_pretrained("./model")


('./model\\tokenizer_config.json',
 './model\\special_tokens_map.json',
 './model\\vocab.txt',
 './model\\added_tokens.json')

# Evaluating the Model

In [55]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np 
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'precision': precision_recall_fscore_support(p.label_ids, preds, average='weighted')[0],
        'recall': precision_recall_fscore_support(p.label_ids, preds, average='weighted')[1],
        'f1': precision_recall_fscore_support(p.label_ids, preds, average='weighted')[2]
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics  # Add this line
)

# Evaluate the model
results = trainer.evaluate()

print(results)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 515/515 [10:30<00:00,  1.22s/it]

{'eval_loss': 0.5177285075187683, 'eval_accuracy': 0.7802625182304327, 'eval_precision': 0.7788780510518958, 'eval_recall': 0.7802625182304327, 'eval_f1': 0.7795105220319122, 'eval_runtime': 632.1499, 'eval_samples_per_second': 6.508, 'eval_steps_per_second': 0.815}



