In [1]:
import numpy as np
import pandas as pd
import re
from timeit import default_timer as timer
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ever/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
pip install -U sentence-transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [5]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
test_y = pd.read_csv("data/test_labels.csv")

In [6]:
def clean_text(text):
    patterns = [
        r'http\S+',
        r'\bhttps?://[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+\b',
        r'[^\w\s]', 
        r'\d',
        r'[\u4e00-\u9fff]+',
    ]
    for pattern in patterns:
        text = re.sub(pattern, ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [7]:
train['comment_text'] = train['comment_text'].map(clean_text)
train['sentences'] = train['comment_text'].map(sent_tokenize)
test['comment_text'] = test['comment_text'].map(clean_text)
test['sentences'] = test['comment_text'].map(sent_tokenize)

In [8]:
train['sentences'].head()

0    [Explanation Why the edits made under my usern...
1    [D aww He matches this background colour I m s...
2    [Hey man I m really not trying to edit war It ...
3    [More I can t make any real suggestions on imp...
4    [You sir are my hero Any chance you remember w...
Name: sentences, dtype: object

In [9]:
test['sentences'].head()

0    [Yo bitch Ja Rule is more succesful then you l...
1            [From RfC The title is fine as it is IMO]
2                     [Sources Zawe Ashton on Lapland]
3    [If you have a look back at the source the inf...
4           [I don t anonymously edit articles at all]
Name: sentences, dtype: object

In [10]:
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
sentences = train['sentences'].apply(lambda x: (x, '') if x is not None else ([], '')).tolist()

batch_size = 32
embeddings = []
with tqdm(total=len(sentences), desc="Encoding", leave=False, dynamic_ncols=True) as pbar:
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        batch_embeddings = model.encode(batch, convert_to_tensor=True)
        embeddings.append(batch_embeddings)
        pbar.update(len(batch))

embeddings = torch.cat(embeddings, dim=0)

                                                                                                                        

In [11]:
embeddings.shape

torch.Size([159571, 384])

In [12]:
class_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
X = embeddings.cpu().numpy()
y = train[class_labels]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=42)

In [13]:
mlb = MultiLabelBinarizer()
y_train_bin= mlb.fit_transform(y_train)
y_valid_bin = mlb.fit_transform(y_valid)

In [14]:
y_train_binary = y_train[class_labels].values.astype(int)
y_valid_binary = y_valid[class_labels].values.astype(int)

In [28]:
clf = MLPClassifier(
    hidden_layer_sizes=(64, 32, 16, 6),
    activation='relu',
    solver='adam',
    random_state=42,
    verbose=True,
    learning_rate_init=0.001,
    max_iter=50,
    early_stopping=True,
    n_iter_no_change=5,
)

# Train the classifier
clf.fit(X_train, y_train_binary)

Iteration 1, loss = 0.85336734
Validation score: 0.909839
Iteration 2, loss = 0.34842739
Validation score: 0.914225
Iteration 3, loss = 0.33131532
Validation score: 0.913050
Iteration 4, loss = 0.32121517
Validation score: 0.915322
Iteration 5, loss = 0.31263576
Validation score: 0.915557
Iteration 6, loss = 0.30526232
Validation score: 0.913912
Iteration 7, loss = 0.29738693
Validation score: 0.915322
Iteration 8, loss = 0.29201498
Validation score: 0.914539
Iteration 9, loss = 0.28611544
Validation score: 0.915479
Iteration 10, loss = 0.28031981
Validation score: 0.913442
Iteration 11, loss = 0.27498306
Validation score: 0.916967
Iteration 12, loss = 0.27012680
Validation score: 0.911562
Iteration 13, loss = 0.26524471
Validation score: 0.913207
Iteration 14, loss = 0.26060188
Validation score: 0.914147
Iteration 15, loss = 0.25676832
Validation score: 0.913364
Iteration 16, loss = 0.25113369
Validation score: 0.914617
Iteration 17, loss = 0.24608546
Validation score: 0.907645
Valida

MLPClassifier(early_stopping=True, hidden_layer_sizes=(64, 32, 16, 6),
              max_iter=50, n_iter_no_change=5, random_state=42, verbose=True)

In [16]:
sentences = test['sentences'].apply(lambda x: (x, '') if x is not None else ([], '')).tolist()

batch_size = 32
test_embeddings = []
with tqdm(total=len(sentences), desc="Encoding", leave=False, dynamic_ncols=True) as pbar:
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        batch_embeddings = model.encode(batch, convert_to_tensor=True)
        test_embeddings.append(batch_embeddings)
        pbar.update(len(batch))

test_embeddings = torch.cat(test_embeddings, dim=0)

                                                                                                                        

In [17]:
test_embeddings.shape

torch.Size([153164, 384])

In [29]:
y_test_prob = clf.predict_proba(test_embeddings.cpu().numpy())

In [30]:
np.set_printoptions(suppress=True)
print("y_test_prob: ", y_test_prob)

y_test_prob:  [[0.94930726 0.42651013 0.93091196 0.06396484 0.819107   0.38829646]
 [0.00005219 0.00000001 0.00000481 0.00000026 0.00000344 0.00000055]
 [0.00013198 0.00000003 0.00000928 0.00000096 0.00000622 0.00000153]
 ...
 [0.0001021  0.00000002 0.00003498 0.00000009 0.00000922 0.00000067]
 [0.00441968 0.00003071 0.00052268 0.0004029  0.00054803 0.0006298 ]
 [0.9231681  0.00589816 0.7931181  0.00025852 0.40623775 0.00238886]]


In [31]:
id = pd.DataFrame({'id': test.index})  # Use index as 'id' column
submission = pd.concat([id, pd.DataFrame(y_test_prob, columns=class_labels)], axis=1)
submission.to_csv('submission.csv', index=False)

In [32]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0.949307,0.4265101,0.930912,0.06396484,0.819107,0.3882965
1,1,5.2e-05,1.072119e-08,4.805834e-06,2.645486e-07,3.437027e-06,5.519983e-07
2,2,0.000132,3.304054e-08,9.283026e-06,9.581056e-07,6.221908e-06,1.526719e-06
3,3,2e-06,3.89477e-11,4.4917e-08,7.13182e-09,8.381811e-08,5.437568e-09
4,4,0.010063,3.674641e-06,0.0003978441,2.260107e-05,0.001194101,2.516105e-05


the private score is 96.83% on the Kaggle test set