In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
# Register tqdm's progress-aware pandas methods (e.g. `progress_apply`).
tqdm.pandas()
# Raise pandas' display caps to 500 rows / 500 columns so that frames shown
# in the notebook are not truncated at the (smaller) library defaults.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer, util

In [2]:
# Location of the input CSV; the cells below read its `text`, `target`,
# `is_offensive` and `fold` columns.
DATA_PATH = 'data/df_folds.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Character length of every text. The vectorised `.str.len()` yields NaN for
# missing texts instead of raising (a per-row `len(x)` fails on NaN), and
# NaN never satisfies the `>= 3` filter below, so such rows are dropped.
text_len = df['text'].str.len()

# Rows labelled OTHER but flagged as offensive are excluded from the data.
other_but_offensive = (df['target'] == 'OTHER') & (df['is_offensive'] == 1)

# Keep texts of at least 3 characters, drop the excluded rows, and renumber
# the index 0..n-1. Built as one expression from the incoming `df` so that
# re-running the cell gives the same result.
df = (
    df.assign(text_len=text_len)
      .loc[(text_len >= 3) & ~other_but_offensive]
      .astype({'text_len': 'int64'})
      .reset_index(drop=True)
)

  0%|          | 0/12617 [00:00<?, ?it/s]

In [4]:
# Display the filtered frame (bare last expression -> rich notebook output).
df

Unnamed: 0,id,text,is_offensive,target,fold,text_len
0,81c11060-a240-4d54-841b-9e2916039e85,çürük dişli,1,INSULT,2,11
1,be80ebbf-b322-4c3b-afa1-94932ea80731,Bu adamın islama ve müslümanlara verdiği zarar...,1,RACIST,0,67
2,f99e2513-83ed-4076-ac72-b9e2cff3f049,erkekler zora gelmez,1,SEXIST,1,20
3,83ed2b2e-b815-4f36-9fc4-80a9050cf2d0,Utanmazın götüne kazık sokmuşlar bu tıkırtı ne...,1,PROFANITY,3,65
4,d93e05f7-bfdd-4cdb-99d8-3048761b30ff,otomasyon< sistemlerine= doğrudan bağlanabilir,0,OTHER,1,46
...,...,...,...,...,...,...
12406,71eedfa1-8fa6-425c-b982-258c3b29c003,uyuma taklidi yapan tehlikeli bir hayvanın göz...,0,OTHER,0,63
12407,b38eed16-6501-4563-8b33-ff2e634bb8e5,yolda at kavga eden üç oğlan çocuğu görür,0,OTHER,3,41
12408,c8a051a8-94ef-4b64-a48e-54d0fa4f8323,sizin köpeklerinizin burnu bile daha iyi koku ...,0,OTHER,0,66
12409,513a7e6d-4207-4a16-9b47-972f26e23cfe,hayalleri gerçek etmek için birisinin delilik ...,0,OTHER,2,90


In [5]:
# Name of the pretrained sentence-embedding checkpoint to load.
ENCODER_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
model = SentenceTransformer(ENCODER_NAME)

In [6]:
# Encode every text into a sentence embedding, in row order.
# `encode` is documented for a string or a list of strings; handing it a
# pandas Series only works while the Series index is exactly 0..n-1, because
# the library looks sentences up by integer position. Passing a plain list
# removes that hidden dependency on the frame's index.
# NOTE(review): with convert_to_tensor=False the result is expected to be a
# NumPy array of shape (n_rows, embedding_dim) - the training cell indexes it
# with integer arrays; confirm against the installed library version.
embeddings = model.encode(
    df['text'].tolist(),
    convert_to_tensor=False,
    batch_size=16,
    show_progress_bar=True,
)

Batches:   0%|          | 0/776 [00:00<?, ?it/s]

In [7]:
# Class name -> integer code used as the training label.
target_dict = {'OTHER':0,'PROFANITY':1,'SEXIST':2,'RACIST':3,'INSULT':4}

# Encode the labels only while the column still holds class names. Without
# this guard, re-running the cell would map the already-encoded integers
# through `target_dict` again and turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['target']):
    encoded_target = df['target'].map(target_dict)
    # `.map` silently produces NaN for names missing from the dict; fail
    # loudly here instead of passing NaN labels on to training.
    unmapped = df.loc[encoded_target.isna(), 'target'].unique()
    if len(unmapped) > 0:
        raise ValueError(f'Unmapped target labels: {list(unmapped)}')
    df['target'] = encoded_target.astype('int64')

y = df['target']

In [11]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

n_classes = 5
# Hyperparameters for the LightGBM multiclass model.
params = {
    'objective': 'multiclass',
    'num_class': n_classes,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.008,
    'feature_fraction': 0.1,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42
}

# Out-of-fold class probabilities: one row per sample, one column per class.
oof_preds = np.zeros((y.shape[0], n_classes))

# Folds are read from the `fold` column; the loop below assumes the fold ids
# are exactly 0..n_splits-1.
n_splits = df.fold.nunique()
val_scores = []

# Plain NumPy views so every slice below is positional. `embeddings` and
# `oof_preds` are NumPy arrays, so indexing them with a pandas label index
# is only correct while the frame's index happens to be 0..n-1.
fold_ids = df['fold'].to_numpy()
labels = y.to_numpy()

for fold in range(n_splits):
    print(f'Fold {fold+1}/{n_splits}')
    val_idx = np.flatnonzero(fold_ids == fold)
    train_idx = np.flatnonzero(fold_ids != fold)

    # Split embeddings and labels into train / validation parts for this fold.
    X_train, y_train = embeddings[train_idx], labels[train_idx]
    X_val, y_val = embeddings[val_idx], labels[val_idx]

    # The validation Dataset references the training Dataset so both share
    # the same feature binning.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments were
    # removed from `lgb.train` in LightGBM 4.0 (`log_evaluation` needs
    # LightGBM >= 3.3).
    # The result is named `booster` rather than `model` so the sentence
    # encoder bound to `model` earlier in the notebook is not overwritten.
    booster = lgb.train(
        params,
        lgb_train,
        num_boost_round=10_000,
        valid_sets=[lgb_train, lgb_val],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
    )

    # Predict the validation fold once, at the best iteration, and reuse the
    # result for both the out-of-fold matrix and the fold score.
    val_pred = booster.predict(X_val, num_iteration=booster.best_iteration)
    oof_preds[val_idx, :] = val_pred
    val_score = f1_score(y_val, np.argmax(val_pred, axis=1), average='macro')
    val_scores.append(val_score)
    print(f'Fold {fold+1} F1: {val_score:.4f}')
    print('*'*50)

# Macro-F1 over all out-of-fold predictions.
oof_score = f1_score(labels, np.argmax(oof_preds, axis=1), average='macro')

print(f'OOF F1: {oof_score:.4f}')

Fold 1/5
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.929662	valid_1's multi_logloss: 1.03404
[200]	training's multi_logloss: 0.661691	valid_1's multi_logloss: 0.825287
[300]	training's multi_logloss: 0.511279	valid_1's multi_logloss: 0.721933
[400]	training's multi_logloss: 0.410465	valid_1's multi_logloss: 0.66495
[500]	training's multi_logloss: 0.335065	valid_1's multi_logloss: 0.628133
[600]	training's multi_logloss: 0.276237	valid_1's multi_logloss: 0.602806
[700]	training's multi_logloss: 0.22858	valid_1's multi_logloss: 0.583189
[800]	training's multi_logloss: 0.190064	valid_1's multi_logloss: 0.568938
[900]	training's multi_logloss: 0.158476	valid_1's multi_logloss: 0.558436
[1000]	training's multi_logloss: 0.132503	valid_1's multi_logloss: 0.54982
[1100]	training's multi_logloss: 0.111134	valid_1's multi_logloss: 0.543438
[1200]	training's multi_logloss: 0.0934583	valid_1's multi_logloss: 0.538273
[1300]	training's multi_loglo



Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.930104	valid_1's multi_logloss: 1.0316
[200]	training's multi_logloss: 0.661231	valid_1's multi_logloss: 0.823224
[300]	training's multi_logloss: 0.510008	valid_1's multi_logloss: 0.724032
[400]	training's multi_logloss: 0.408704	valid_1's multi_logloss: 0.6688
[500]	training's multi_logloss: 0.333308	valid_1's multi_logloss: 0.634726
[600]	training's multi_logloss: 0.27442	valid_1's multi_logloss: 0.611539
[700]	training's multi_logloss: 0.226965	valid_1's multi_logloss: 0.594598
[800]	training's multi_logloss: 0.188742	valid_1's multi_logloss: 0.582952
[900]	training's multi_logloss: 0.157401	valid_1's multi_logloss: 0.573376
[1000]	training's multi_logloss: 0.131629	valid_1's multi_logloss: 0.566494
[1100]	training's multi_logloss: 0.110332	valid_1's multi_logloss: 0.560905
[1200]	training's multi_logloss: 0.0927284	valid_1's multi_logloss: 0.557073
[1300]	training's multi_logloss: 0.0781



Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.933289	valid_1's multi_logloss: 1.02087
[200]	training's multi_logloss: 0.665744	valid_1's multi_logloss: 0.808531
[300]	training's multi_logloss: 0.515152	valid_1's multi_logloss: 0.705066
[400]	training's multi_logloss: 0.413387	valid_1's multi_logloss: 0.6477
[500]	training's multi_logloss: 0.337619	valid_1's multi_logloss: 0.610901
[600]	training's multi_logloss: 0.278479	valid_1's multi_logloss: 0.585822
[700]	training's multi_logloss: 0.230835	valid_1's multi_logloss: 0.567237
[800]	training's multi_logloss: 0.192092	valid_1's multi_logloss: 0.553602
[900]	training's multi_logloss: 0.160334	valid_1's multi_logloss: 0.54336
[1000]	training's multi_logloss: 0.134243	valid_1's multi_logloss: 0.53509
[1100]	training's multi_logloss: 0.1127	valid_1's multi_logloss: 0.528238
[1200]	training's multi_logloss: 0.0948619	valid_1's multi_logloss: 0.523655
[1300]	training's multi_logloss: 0.080023



Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.929552	valid_1's multi_logloss: 1.03334
[200]	training's multi_logloss: 0.660896	valid_1's multi_logloss: 0.82362
[300]	training's multi_logloss: 0.510684	valid_1's multi_logloss: 0.722941
[400]	training's multi_logloss: 0.409325	valid_1's multi_logloss: 0.666764
[500]	training's multi_logloss: 0.333946	valid_1's multi_logloss: 0.630899
[600]	training's multi_logloss: 0.275065	valid_1's multi_logloss: 0.606814
[700]	training's multi_logloss: 0.227686	valid_1's multi_logloss: 0.588707
[800]	training's multi_logloss: 0.189109	valid_1's multi_logloss: 0.57522
[900]	training's multi_logloss: 0.157636	valid_1's multi_logloss: 0.564744
[1000]	training's multi_logloss: 0.131769	valid_1's multi_logloss: 0.557012
[1100]	training's multi_logloss: 0.110489	valid_1's multi_logloss: 0.551572
[1200]	training's multi_logloss: 0.0928389	valid_1's multi_logloss: 0.547343
[1300]	training's multi_logloss: 0.07



Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.930179	valid_1's multi_logloss: 1.0321
[200]	training's multi_logloss: 0.662297	valid_1's multi_logloss: 0.822556
[300]	training's multi_logloss: 0.51229	valid_1's multi_logloss: 0.720818
[400]	training's multi_logloss: 0.410929	valid_1's multi_logloss: 0.661712
[500]	training's multi_logloss: 0.335588	valid_1's multi_logloss: 0.624356
[600]	training's multi_logloss: 0.276852	valid_1's multi_logloss: 0.598915
[700]	training's multi_logloss: 0.229393	valid_1's multi_logloss: 0.580427
[800]	training's multi_logloss: 0.190796	valid_1's multi_logloss: 0.56647
[900]	training's multi_logloss: 0.159193	valid_1's multi_logloss: 0.55597
[1000]	training's multi_logloss: 0.133104	valid_1's multi_logloss: 0.54711
[1100]	training's multi_logloss: 0.111651	valid_1's multi_logloss: 0.540094
[1200]	training's multi_logloss: 0.0938855	valid_1's multi_logloss: 0.534834
[1300]	training's multi_logloss: 0.07910

In [12]:
# Summarise cross-validation: the pooled out-of-fold macro-F1, then the mean
# and standard deviation of the per-fold scores.
cv_f1_mean = np.mean(val_scores)
cv_f1_std = np.std(val_scores)
print(f'OOF F1: {oof_score:.4f}')
print(f'CV F1 mean: {cv_f1_mean:.4f}')
print(f'CV F1 std: {cv_f1_std:.4f}')

OOF F1: 0.8052
CV F1 mean: 0.8051
CV F1 std: 0.0097


In [15]:
# Per-class precision / recall / F1 on the out-of-fold predictions.
# `labels=` is passed explicitly so each display name is tied to its integer
# code from `target_dict`, rather than relying on the dict's key order
# matching the sorted set of labels that happen to appear in the data.
class_names = list(target_dict.keys())
class_codes = list(target_dict.values())
print(classification_report(
    y,
    np.argmax(oof_preds, axis=1),
    labels=class_codes,
    target_names=class_names,
    digits=3,
))

              precision    recall  f1-score   support

       OTHER      0.847     0.888     0.867      3528
   PROFANITY      0.745     0.753     0.749      2376
      SEXIST      0.896     0.872     0.884      2081
      RACIST      0.854     0.759     0.804      2033
      INSULT      0.710     0.736     0.722      2393

    accuracy                          0.809     12411
   macro avg      0.810     0.801     0.805     12411
weighted avg      0.810     0.809     0.809     12411

