In [1]:
import joblib
import numpy as np
import pandas as pd
import re
import string
import torch

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModel, pipeline

In [2]:
# Helper methods
def train_nn(X_train, y_train):
    clf = MLPClassifier(
        hidden_layer_sizes=(256, 64, 16), max_iter=150, activation='relu', 
        solver='adam', alpha=1e-5, early_stopping=True)
    clf.fit(X_train, y_train)
    return clf

def print_results(X_train, X_test, y_train, y_test):
    print("Train acc:\t{0:.3f}".format(clf.score(X_train, y_train)))
    print("Test acc:\t{0:.3f}".format(clf.score(X_test, y_test)))
    print(classification_report(y_train, clf.predict(X_train)))
    print(classification_report(y_test, clf.predict(X_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))

In [3]:
labels = pd.read_csv("sinkaf/data/troff-v1.0.tsv",  sep='\t')['label'] != 'non'
print(labels.value_counts())

False    28439
True      6845
Name: label, dtype: int64


In [4]:
# Onceden egitilmis Turkce BERT modeli kullanilarak deney derlemi vektore cevirilmistir
# https://drive.google.com/file/d/1fq_Vkvg0QFpZaG1EgwdhyXYNSLqdu2tq/view?usp=sharing
# Dosyayi indirip sinkaf/data'ya kopyalayiniz 
bert_data = pd.read_csv("sinkaf/data/bert_data.csv", header=None)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(bert_data, labels)

In [6]:
# Az olan siniftaki degerlerden ornek uretimi
smote = SMOTE(sampling_strategy=1)
X_train, y_train = smote.fit_sample(X_train, y_train)

In [7]:
# basit NN egitimi
clf = train_nn(X_train, y_train)

In [8]:
# Deney setinde %97, test setinde %82'a yakin basari olmasina ragmen
# model kufur iceren cumlelerde dusuk basari gostermektedir. Test setindeki
# veri dengesizligi sebebi ile basarisi yuksek cikmaktadir
print_results(X_train, X_test, y_train, y_test)

Train acc:	0.986
Test acc:	0.815
              precision    recall  f1-score   support

       False       0.99      0.98      0.99     21308
        True       0.98      0.99      0.99     21308

    accuracy                           0.99     42616
   macro avg       0.99      0.99      0.99     42616
weighted avg       0.99      0.99      0.99     42616

              precision    recall  f1-score   support

       False       0.89      0.88      0.88      7131
        True       0.52      0.54      0.53      1690

    accuracy                           0.81      8821
   macro avg       0.70      0.71      0.71      8821
weighted avg       0.82      0.81      0.82      8821

[[6274  857]
 [ 777  913]]


In [9]:
# Kufurlu veriyi aza ornekleme
# Undersampling non offensive data
undersampler = RandomUnderSampler()
bert_under, labels_under = undersampler.fit_resample(bert_data, labels)
print(f"Normal length: {len(bert_data)}, Undersampled length: {len(bert_under)}")

Normal length: 35284, Undersampled length: 13690


In [10]:
# Aza orneklenen model denemesi
X_train, X_test, y_train, y_test = train_test_split(bert_under, labels_under, stratify=labels_under)
clf = train_nn(X_train, y_train)
print_results(X_train, X_test, y_train, y_test)

Train acc:	0.836
Test acc:	0.773
              precision    recall  f1-score   support

       False       0.85      0.81      0.83      5133
        True       0.82      0.86      0.84      5134

    accuracy                           0.84     10267
   macro avg       0.84      0.84      0.84     10267
weighted avg       0.84      0.84      0.84     10267

              precision    recall  f1-score   support

       False       0.79      0.74      0.77      1712
        True       0.76      0.80      0.78      1711

    accuracy                           0.77      3423
   macro avg       0.77      0.77      0.77      3423
weighted avg       0.77      0.77      0.77      3423

[[1271  441]
 [ 335 1376]]


In [11]:
# Final model hazirlanmasi
# Aza orneklenen butun veri kullanilmistir
clf = train_nn(bert_under, labels_under)
print("Train acc:\t{0:.3f}".format(clf.score(bert_under, labels_under)))
print(classification_report(labels_under, clf.predict(bert_under)))

Train acc:	0.784
              precision    recall  f1-score   support

       False       0.75      0.86      0.80      6845
        True       0.83      0.71      0.77      6845

    accuracy                           0.78     13690
   macro avg       0.79      0.78      0.78     13690
weighted avg       0.79      0.78      0.78     13690



In [12]:
# Pre-trained BERT kullanarak vektore cevirme
# Tek seferlik bir yukleme yapicak
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")
bert = AutoModel.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")

In [13]:
# Cumleleri vektore cevirirken eksik kisimlarda maximum uzunluga gore padding yapilmistir
# Max uzunluk sinkaf datasi icin 113
MAX_SENTENCE_TOKEN_LENGTH = 113

In [14]:
# Offensive? - Kufur mu?

test = [
    "guzel karisin ha", 
    "cok guzelsin", 
    "yaz transfer sezonuna lionel messi damga vuracak gibi gözüküyor", 
    "dal sarkar kartal kalkar",
    "amk cocugu",
    "aq bebesindeki havaya bak sen",
    "kral cocuk bizim alper",
    "erol bulut istifa",
    "sen kendini ne saniyorsun kopek"]

In [15]:
tokenized = [tokenizer.encode(s, add_special_tokens=True) for s in test]
padded = np.array([s + [0]*(MAX_SENTENCE_TOKEN_LENGTH-len(s)) for s in tokenized])
input_ids = torch.tensor(np.array(padded)).to(torch.int64)

In [16]:
# Cumleleri vektore cevirme
def sentence_2_vec(input_id):
    with torch.no_grad():
        last_hidden_states = bert(input_id)
        features = last_hidden_states[0][:,0,:].numpy()
    return features

In [17]:
test_vector = sentence_2_vec(input_ids)
clf.predict(test_vector)

array([ True, False, False, False,  True,  True, False, False,  True])

In [18]:
#dump it (higher), dump it (higher!)
joblib.dump(clf, "sinkaf/data/clf_nn.joblib")

['sinkaf/data/clf_nn.joblib']