In [1]:
import numpy as np
import pandas as pd
import re
import string
import torch

from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel, pipeline

In [2]:
labels = pd.read_csv("sinkaf/data/troff-v1.0.tsv",  sep='\t')['label'] != 'non'
print(labels.value_counts())

False    28439
True      6845
Name: label, dtype: int64


In [3]:
# Onceden egitilmis Turkce BERT modeli kullanilarak deney derlemi vektore cevirilmistir
# https://drive.google.com/file/d/1fq_Vkvg0QFpZaG1EgwdhyXYNSLqdu2tq/view?usp=sharing
# Dosyayi indirip sinkaf/data'ya kopyalayiniz 
bert_data = pd.read_csv("sinkaf/data/data_bert.csv", header=None)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(bert_data, labels)

In [5]:
# Az olan siniftaki degerlerden ornek uretimi
smote = SMOTE(sampling_strategy=1)
X_train, y_train = smote.fit_sample(X_train, y_train)

In [6]:
# basit NN egitimi
clf = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32), max_iter=150, activation='relu', 
    solver='adam', alpha=1e-5, verbose=1, early_stopping=True)
clf.fit(X_train, y_train)

Iteration 1, loss = 0.53874775
Validation score: 0.777908
Iteration 2, loss = 0.46866646
Validation score: 0.795065
Iteration 3, loss = 0.44252566
Validation score: 0.774853
Iteration 4, loss = 0.42749733
Validation score: 0.772503
Iteration 5, loss = 0.41610360
Validation score: 0.813161
Iteration 6, loss = 0.41124527
Validation score: 0.819506
Iteration 7, loss = 0.39319176
Validation score: 0.822797
Iteration 8, loss = 0.38711474
Validation score: 0.827497
Iteration 9, loss = 0.37291859
Validation score: 0.808696
Iteration 10, loss = 0.36831250
Validation score: 0.810106
Iteration 11, loss = 0.35941692
Validation score: 0.795770
Iteration 12, loss = 0.34912841
Validation score: 0.840658
Iteration 13, loss = 0.34058314
Validation score: 0.839248
Iteration 14, loss = 0.33321302
Validation score: 0.841833
Iteration 15, loss = 0.32220876
Validation score: 0.824912
Iteration 16, loss = 0.31229393
Validation score: 0.847709
Iteration 17, loss = 0.31007318
Validation score: 0.840188
Iterat

MLPClassifier(alpha=1e-05, early_stopping=True,
              hidden_layer_sizes=(128, 64, 32), max_iter=150, verbose=1)

In [7]:
# Deney setinde %97, test setinde %90'a yakin basari
print("Train acc:\t{0:.3f}".format(clf.score(X_train, y_train)))
print("Test acc:\t{0:.3f}".format(clf.score(X_test, y_test)))

print(classification_report(y_train, clf.predict(X_train)))
print(classification_report(y_test, clf.predict(X_test)))

Train acc:	0.976
Test acc:	0.818
              precision    recall  f1-score   support

       False       0.97      0.98      0.98     21272
        True       0.98      0.97      0.98     21272

    accuracy                           0.98     42544
   macro avg       0.98      0.98      0.98     42544
weighted avg       0.98      0.98      0.98     42544

              precision    recall  f1-score   support

       False       0.89      0.89      0.89      7167
        True       0.51      0.51      0.51      1654

    accuracy                           0.82      8821
   macro avg       0.70      0.70      0.70      8821
weighted avg       0.82      0.82      0.82      8821



In [8]:
# Prepare final model
clf = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32), max_iter=150, activation='relu', 
    solver='adam', alpha=1e-5, early_stopping=True)
clf.fit(bert_data, labels)
print("Train acc:\t{0:.3f}".format(clf.score(bert_data, labels)))
print(classification_report(y_train, clf.predict(X_train)))

Train acc:	0.857
              precision    recall  f1-score   support

       False       0.60      0.98      0.75     21272
        True       0.94      0.36      0.52     21272

    accuracy                           0.67     42544
   macro avg       0.77      0.67      0.63     42544
weighted avg       0.77      0.67      0.63     42544



In [9]:
# Pre-trained BERT kullanarak vektore cevirme
# Tek seferlik bir yukleme yapicak
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")
bert = AutoModel.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")

Downloading: 100%|██████████| 386/386 [00:00<00:00, 137kB/s]
Downloading: 100%|██████████| 1.23M/1.23M [00:01<00:00, 1.18MB/s]
Downloading: 100%|██████████| 59.0/59.0 [00:00<00:00, 8.80kB/s]
Downloading: 100%|██████████| 740M/740M [04:22<00:00, 2.82MB/s]


In [10]:
# Cumleleri vektore cevirirken eksik kisimlarda maximum uzunluga gore padding yapilmistir
# Max uzunluk sinkaf datasi icin 113
MAX_SENTENCE_TOKEN_LENGTH = 113

In [11]:
# Offensive? - Kufur mu?

test = [
    "lanet olasica pislik", 
    "tanisalim mi tatlim", 
    "yaz transfer sezonuna lionel messi damga vuracak gibi gözüküyor", 
    "dal sarkar kartal kalkar",
    "amk cocugu",
    "aq bebesindeki havaya bak sen",
    "kralsin cocuk bizim alper",
    "erol bulut istifa",
    "sen kendini ne saniyorsun kopek"]

In [12]:
tokenized = [tokenizer.encode(s, add_special_tokens=True) for s in test]
padded = np.array([s + [0]*(MAX_SENTENCE_TOKEN_LENGTH-len(s)) for s in tokenized])
input_ids = torch.tensor(np.array(padded)).to(torch.int64)

In [13]:
# Cumleleri vektore cevirme
def sentence_2_vec(input_id):
    with torch.no_grad():
        last_hidden_states = bert(input_id)
        features = last_hidden_states[0][:,0,:].numpy()
    return features

In [14]:
test_vector = sentence_2_vec(input_ids)
clf.predict(test_vector)

array([ True, False, False, False,  True,  True, False, False, False])

In [15]:
import joblib

#dump it (higher), dump it (higher!)
joblib.dump(clf, "sinkaf/data/clf_nn.joblib")

['sinkaf/data/clf_nn.joblib']