In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import Dataset

In [2]:
# Read the pipe-delimited training file and append a "pred" column holding
# the string placeholder "nan" on every row, then preview the first rows.
df = pd.read_csv("../data/raw/teknofest_train_final.csv", sep="|").assign(pred="nan")
df.head()

Unnamed: 0,id,text,is_offensive,target,pred
0,81c11060-a240-4d54-841b-9e2916039e85,çürük dişli,1,INSULT,
1,be80ebbf-b322-4c3b-afa1-94932ea80731,Bu adamın islama ve müslümanlara verdiği zarar...,1,RACIST,
2,f99e2513-83ed-4076-ac72-b9e2cff3f049,erkekler zora gelmez,1,SEXIST,
3,83ed2b2e-b815-4f36-9fc4-80a9050cf2d0,Utanmazın götüne kazık sokmuşlar bu tıkırtı ne...,1,PROFANITY,
4,d93e05f7-bfdd-4cdb-99d8-3048761b30ff,otomasyon< sistemlerine= doğrudan bağlanabilir,0,OTHER,


In [3]:
# Stratified 5-fold splitter: shuffled, with a fixed seed so the folds are
# the same on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)

# Materialise the (train indices, validation indices) pairs, stratified on
# the "target" column, so the folds can be iterated as a plain list.
splits = [(fold_train, fold_val) for fold_train, fold_val in skf.split(df, df["target"])]

In [4]:
# Out-of-fold baseline: for each fold, fit TF-IDF + LightGBM on the training
# rows and store the predictions for the held-out rows in df["pred"].

# Integer position of the "pred" column, needed for positional assignment.
pred_col = df.columns.get_loc("pred")

for train_idx, val_idx in tqdm(splits):
    # The fold indices are row positions, so rows are selected with .iloc.
    text_train, y_train = df["text"].iloc[train_idx], df["target"].iloc[train_idx]
    # y_val is not used inside this loop; the name is kept so the kernel
    # namespace matches the previous version -- TODO confirm no later cell reads it.
    text_val, y_val = df["text"].iloc[val_idx], df["target"].iloc[val_idx]

    # The vectorizer is fitted on the training rows only and then applied to
    # the validation rows.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(text_train)
    X_val = vectorizer.transform(text_val)

    model = LGBMClassifier()
    model.fit(X_train, y_train)

    preds = model.predict(X_val)
    # Write by position as well: label-based df.loc[val_idx, ...] only hits
    # the right rows while df carries a default 0..n-1 index.
    df.iloc[val_idx, pred_col] = preds

100%|█████████████████████████████████████████████| 5/5 [00:03<00:00,  1.51it/s]


In [5]:
# Per-class precision / recall / F1 comparing the "pred" column with the
# "target" column over the whole frame.
print(
    classification_report(
        y_true=df["target"],
        y_pred=df["pred"],
    )
)

              precision    recall  f1-score   support

      INSULT       0.59      0.55      0.57      2419
       OTHER       0.54      0.81      0.65      3616
   PROFANITY       0.88      0.60      0.71      2398
      RACIST       0.76      0.51      0.61      2072
      SEXIST       0.83      0.77      0.80      2112

    accuracy                           0.66     12617
   macro avg       0.72      0.65      0.67     12617
weighted avg       0.70      0.66      0.66     12617

