In [1]:
import numpy as np
import pandas as pd
import re
import string
import torch

from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel, pipeline

In [2]:
labels = pd.read_csv("sinkaf/data/troff-v1.0.tsv",  sep='\t')['label'] != 'non'
print(labels.value_counts())

False    28439
True      6845
Name: label, dtype: int64


In [3]:
# Onceden egitilmis Turkce BERT modeli kullanilarak deney derlemi vektore cevirilmistir
# https://drive.google.com/file/d/1fq_Vkvg0QFpZaG1EgwdhyXYNSLqdu2tq/view?usp=sharing
# Dosyayi indirip sinkaf/data'ya kopyalayiniz 
bert_data = pd.read_csv("sinkaf/data/bert_data.csv", header=None)

In [4]:
# Az olan siniftaki degerlerden ornek uretimi
smote = SMOTE(sampling_strategy=1)
bert_data, labels = smote.fit_sample(bert_data, labels)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(bert_data, labels)

In [6]:
# basit NN egitimi
clf = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32), max_iter=150, activation='relu', 
    solver='adam', alpha=1e-5, verbose=1, early_stopping=True)
clf.fit(X_train, y_train)

Iteration 1, loss = 0.53066189
Validation score: 0.781997
Iteration 2, loss = 0.46357644
Validation score: 0.799812
Iteration 3, loss = 0.44187904
Validation score: 0.802157
Iteration 4, loss = 0.43013017
Validation score: 0.805438
Iteration 5, loss = 0.42080317
Validation score: 0.803797
Iteration 6, loss = 0.40795082
Validation score: 0.808251
Iteration 7, loss = 0.39980172
Validation score: 0.815752
Iteration 8, loss = 0.39064890
Validation score: 0.761369
Iteration 9, loss = 0.38905256
Validation score: 0.816456
Iteration 10, loss = 0.37666436
Validation score: 0.816456
Iteration 11, loss = 0.36632882
Validation score: 0.823254
Iteration 12, loss = 0.35575864
Validation score: 0.832630
Iteration 13, loss = 0.34988470
Validation score: 0.832865
Iteration 14, loss = 0.34865826
Validation score: 0.814580
Iteration 15, loss = 0.33696291
Validation score: 0.827239
Iteration 16, loss = 0.32857582
Validation score: 0.841069
Iteration 17, loss = 0.31994413
Validation score: 0.826770
Iterat

MLPClassifier(alpha=1e-05, early_stopping=True,
              hidden_layer_sizes=(128, 64, 32), max_iter=150, verbose=1)

In [7]:
# Deney setinde %97, test setinde %90'a yakin basari
print("Train acc:\t{0:.3f}".format(clf.score(X_train, y_train)))
print("Test acc:\t{0:.3f}".format(clf.score(X_test, y_test)))

print(classification_report(y_train, clf.predict(X_train)))
print(classification_report(y_test, clf.predict(X_test)))

Train acc:	0.967
Test acc:	0.892
              precision    recall  f1-score   support

       False       0.97      0.96      0.97     21310
        True       0.96      0.97      0.97     21348

    accuracy                           0.97     42658
   macro avg       0.97      0.97      0.97     42658
weighted avg       0.97      0.97      0.97     42658

              precision    recall  f1-score   support

       False       0.92      0.86      0.89      7129
        True       0.87      0.92      0.90      7091

    accuracy                           0.89     14220
   macro avg       0.89      0.89      0.89     14220
weighted avg       0.89      0.89      0.89     14220



In [8]:
# Prepare final model
clf = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32), max_iter=150, activation='relu', 
    solver='adam', alpha=1e-5, early_stopping=True)
clf.fit(bert_data, labels)
print("Train acc:\t{0:.3f}".format(clf.score(bert_data, labels)))
print(classification_report(y_train, clf.predict(X_train)))

Train acc:	0.967
              precision    recall  f1-score   support

       False       0.98      0.96      0.97     21310
        True       0.96      0.98      0.97     21348

    accuracy                           0.97     42658
   macro avg       0.97      0.97      0.97     42658
weighted avg       0.97      0.97      0.97     42658



In [9]:
# Pre-trained BERT kullanarak vektore cevirme
# Tek seferlik bir yukleme yapicak
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")
bert = AutoModel.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")

In [10]:
# Cumleleri vektore cevirirken eksik kisimlarda maximum uzunluga gore padding yapilmistir
# Max uzunluk sinkaf datasi icin 113
MAX_SENTENCE_TOKEN_LENGTH = 113

In [16]:
# Offensive? - Kufur mu?

test = [
    "lanet olasica pislik", 
    "tanisalim mi tatlim", 
    "yaz transfer sezonuna lionel messi damga vuracak gibi gözüküyor", 
    "dal sarkar kartal kalkar",
    "amk cocugu",
    "aq bebesindeki havaya bak sen",
    "kralsin cocuk bizim alper",
    "erol bulut istifa",
    "sen kendini ne saniyorsun kopek"]

In [17]:
tokenized = [tokenizer.encode(s, add_special_tokens=True) for s in test]
padded = np.array([s + [0]*(MAX_SENTENCE_TOKEN_LENGTH-len(s)) for s in tokenized])
input_ids = torch.tensor(np.array(padded)).to(torch.int64)

In [18]:
# Cumleleri vektore cevirme
def sentence_2_vec(input_id):
    with torch.no_grad():
        last_hidden_states = bert(input_id)
        features = last_hidden_states[0][:,0,:].numpy()
    return features

In [19]:
test_vector = sentence_2_vec(input_ids)
clf.predict(test_vector)

array([ True, False, False, False,  True,  True, False, False,  True])

In [15]:
import joblib

#dump it (higher), dump it (higher!)
joblib.dump(clf, "sinkaf/data/clf_nn.joblib")

['sinkaf/data/clf_nn.joblib']