In [1]:
import pandas as pd
from pathlib import Path

In [14]:
def select_feature():
    df = pd.read_csv("D:\\end_to_end_sentiment_ml\\data\\raw\\raw_dataset.csv")
    df = df[['comment_text', 'toxicity']]
    return df

In [15]:
df.head()

Unnamed: 0,comment_text,toxicity
201,Make no mistake this pope is a dictator.\n\nHe...,0.5
111703,A simple google search would prove that your p...,0.0
42989,Money talks b.s. Walks!!! This guy doesn't kno...,0.0
17437,Please cite your sources for these things that...,0.0
45448,"The regime change in Syria, unlike most of the...",0.0


1. Structural Cleaning
   
Drop empty or NaN rows.

Remove exact duplicates (strict duplicates only).

Fix encoding (UTF-8).

In [16]:
def structural_cleaning(df):
    df_out = df.copy()

    # Drop na rows
    df_out = df_out.dropna()

    # Remove exact dupliates
    df_out = df_out.drop_duplicates()

    # Fix encoding, ensure str type
    df_out['comment_text'] = df_out['comment_text'].astype(str)

    # Ensure float type for the target column
    df_out['toxicity'] = df['toxicity'].astype(float)

    return df_out

2. Noise Removal / Replacement
   
Replace URLs → [URL].

Replace mentions → [USER].

Process hashtags (#awesome → awesome).

Strip HTML/Markdown.

Remove invisible/non-printable chars.

In [17]:
import re

def replace_mention(text):
    words = text.split()
    for i in range(len(words)):
        if i != 0:
            if words[i][0].isupper():
                words[i] = '[MENTION]'
    return ' '.join(words)

# text = 'Hello my name is Long'
# print(replace_mention(text))

def noise_removal(df):
    df_out = df.copy()

    # remove URLs -> [URL]
    df_out['comment_text'] = df_out['comment_text'].apply(lambda x: re.sub(r'http\S+', '<URL>', x))

    # Replace mentions → [MENTION].
    df_out['comment_text'] = df_out['comment_text'].apply(replace_mention)

    # Process hashtags (#awesome → awesome).
    df_out['comment_text'] = df_out['comment_text'].apply(lambda x: x.replace('#', ''))

    # strip html
    df_out['comment_text'] = df_out['comment_text'].apply(lambda x: re.sub(r'<.*?>', '<HTML>', x))

    # Remove invisible/non-printable chars
    df_out['comment_text'] = df_out['comment_text'].apply(lambda x: re.sub(r'[\x00-\x1f\ufeff\u200b]', '', x))

    # Replace number with [NUMBER]
    df_out['comment_text'] = df_out['comment_text'].apply(lambda x: re.sub(r'\d+', '<NUMBER>', x))

    return df_out
    

3. Contractions & Slang

Expand contractions (don’t → do not).

Normalize slang (if domain requires).

In [18]:
pip install contractions

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
import contractions

def expand_contraction(text):
    words = text.split()
    expanded_words = []
    for word in words:
      expanded_words.append(contractions.fix(word, slang = True))
    return ' '.join(expanded_words)
        

def contraction_slang(df):
    df_out = df.copy()

    df_out['comment_text'] = df_out['comment_text'].apply(expand_contraction)

    return df_out

    

4. Text Normalization

Lowercasing (if using bert-base-uncased).

Normalize whitespace.

Normalize punctuation (!!!! → !).

Normalize repeated characters (soooo → soo).

Unicode normalization (curly quotes → straight quotes).

In [20]:
pip install unicodedata

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement unicodedata (from versions: none)

[notice] A new release of pip is available: 25.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for unicodedata


In [21]:
import unicodedata

def text_normalization(df):
    df_out = df.copy()

    # lowercaseing
    df_out['comment_text'] = df['comment_text'].apply(lambda x: x.lower())

    # normalize whitespace
    df_out['comment_text'] = df['comment_text'].apply(lambda x: ' '.join(x.strip().split()))

    # normalize punctuation
    df_out['comment_text'] = df['comment_text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

    # normalize repeated characters
    df_out['comment_text'] = df['comment_text'].apply(lambda x: re.sub(r'(\w)\1{2,}', r'\1\1', x))

    # Unicode normalization (curly quotes → straight quotes).
    df_out['comment_text'] = df_out['comment_text'].apply(lambda x: unicodedata.normalize('NFKC', x))
    
    return df_out

In [22]:
df = select_feature()

df = df.sample(frac=0.1, random_state=42)

df = structural_cleaning(df)

df = noise_removal(df)

df = contraction_slang(df)

df = text_normalization(df)

In [23]:
df

Unnamed: 0,comment_text,toxicity
201,Make no mistake this pope is a dictator. [MENT...,0.5000
111703,A simple google search would prove that your p...,0.0000
42989,Money talks b.s. [MENTION] [MENTION] guy does ...,0.0000
17437,Please cite your sources for these things that...,0.0000
45448,The regime change in [MENTION] unlike most of ...,0.0000
...,...,...
105946,"This ""article"" should have come with a disclos...",0.0000
114826,OMG [MENTION] [MENTION] [MENTION] [MENTION] [M...,0.4000
26829,You got that? [MENTION] is how a pathetic narc...,0.6125
39217,And [MENTION] [MENTION] forgot to mention [MEN...,0.6000


Load Bert Tokenizer

In [24]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

MAX_LEN = 128

encoded_text = tokenizer(text = list(df['comment_text']),
                         padding = 'max_length',
                         truncation = True,
                         max_length=MAX_LEN,
                         return_tensors='pt')

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
import torch
import numpy as np

input_ids = encoded_text['input_ids']

attention_mask = encoded_text['attention_mask']

labels = torch.from_numpy(df['toxicity'].to_numpy(dtype = np.float32))

In [27]:
print(input_ids.shape)
print(attention_mask.shape)
print(labels.shape)
print(len(input_ids))

torch.Size([11978, 128])
torch.Size([11978, 128])
torch.Size([11978])
11978


Create dataset

In [28]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

In [29]:
dataset = TextDataset(input_ids = input_ids, attention_mask = attention_mask, labels = labels)

In [30]:
from torch.utils.data import DataLoader

dataloader = DataLoader(dataset, batch_size = 64, shuffle = True)

In [31]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=1
)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
epochs = 2

for epoch in range(epochs):
    total_loss = 0

    model.train()

    for batch_idx, data in enumerate(dataloader):
        optimizer.zero_grad()
        
        input_ids = data['input_ids']
        attention_mask = data['attention_mask']
        labels = data['labels']

        outputs = model(input_ids = input_ids, attention_mask = attention_mask, labels = labels)

        loss = outputs.loss

        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        print('Epoch: {}, Batch: {}, Loss: {}'.format(epoch, batch_idx, loss))

    print('Epoch: {}, Total Loss: {}'.format(epoch + 1, total_loss/ len(dataloader)))

Naive Bayes with TFIDF

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

tfidf_model = Pipeline([
    ('tfidf', TfidfVectorizer(        
        lowercase=True,
        stop_words="english",
        ngram_range=(1,2))),
    ('MultinomialNB', MultinomialNB())
])



In [61]:
df.head()

Unnamed: 0,comment_text,toxicity
201,Make no mistake this pope is a dictator. [MENT...,0.5
111703,A simple google search would prove that your p...,0.0
42989,Money talks b.s. [MENTION] [MENTION] guy does ...,0.0
17437,Please cite your sources for these things that...,0.0
45448,The regime change in [MENTION] unlike most of ...,0.0


In [96]:
threshold = 0.1

df_binary_toxicity = (df['toxicity']>threshold).astype(int)

In [97]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['comment_text'], df_binary_toxicity, random_state = 42, stratify = df_binary_toxicity)

In [98]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(8983,)
(8983,)
(2995,)
(2995,)


In [99]:
tfidf_model.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('MultinomialNB', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [100]:
tfidf_model.score(X_test, y_test)

0.7245409015025042

In [101]:
from sklearn.metrics import classification_report

print(classification_report(y_test, tfidf_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.71      0.80      0.75      1546
           1       0.75      0.64      0.69      1449

    accuracy                           0.72      2995
   macro avg       0.73      0.72      0.72      2995
weighted avg       0.73      0.72      0.72      2995



In [102]:
from sklearn.svm import SVC

SVC_model = Pipeline([
    ('tfidf', TfidfVectorizer(
        lowercase = True,
        stop_words = 'english',
        ngram_range = (1, 2)
    )),
    ('svc', SVC())
])

In [103]:
SVC_model.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('svc', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [104]:
print(classification_report(y_test, SVC_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.70      0.81      0.75      1546
           1       0.76      0.62      0.68      1449

    accuracy                           0.72      2995
   macro avg       0.73      0.72      0.72      2995
weighted avg       0.73      0.72      0.72      2995

