In [1]:
import numpy as np
import pandas as pd
import re
import string
import torch

from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel, pipeline

In [2]:
labels = pd.read_csv("sinkaf/data/troff-v1.0.tsv",  sep='\t')['label'] != 'non'
print(labels.value_counts())

False    28439
True      6845
Name: label, dtype: int64


In [3]:
# Onceden egitilmis Turkce BERT modeli kullanilarak deney derlemi vektore cevirilmistir
# https://drive.google.com/file/d/1fq_Vkvg0QFpZaG1EgwdhyXYNSLqdu2tq/view?usp=sharing
# Dosyayi indirip sinkaf/data'ya kopyalayiniz 
bert_data = pd.read_csv("sinkaf/data/bert_data.csv", header=None)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(bert_data, labels)

In [5]:
# Az olan siniftaki degerlerden ornek uretimi
smote = SMOTE(sampling_strategy=1)
X_train, y_train = smote.fit_sample(bert_data, labels)

In [6]:
# basit NN egitimi
clf = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32), max_iter=150, activation='relu', 
    solver='adam', alpha=1e-5, verbose=1, early_stopping=True)
clf.fit(X_train, y_train)

Iteration 1, loss = 0.52592618
Validation score: 0.777426
Iteration 2, loss = 0.45636663
Validation score: 0.799930
Iteration 3, loss = 0.43379402
Validation score: 0.806435
Iteration 4, loss = 0.41607332
Validation score: 0.789557
Iteration 5, loss = 0.40778712
Validation score: 0.814170
Iteration 6, loss = 0.39313461
Validation score: 0.827532
Iteration 7, loss = 0.38391137
Validation score: 0.829114
Iteration 8, loss = 0.37975719
Validation score: 0.826653
Iteration 9, loss = 0.36511844
Validation score: 0.823840
Iteration 10, loss = 0.36128970
Validation score: 0.826477
Iteration 11, loss = 0.35013359
Validation score: 0.839838
Iteration 12, loss = 0.33610951
Validation score: 0.838080
Iteration 13, loss = 0.33081634
Validation score: 0.841772
Iteration 14, loss = 0.32964849
Validation score: 0.835091
Iteration 15, loss = 0.32088277
Validation score: 0.819972
Iteration 16, loss = 0.31065749
Validation score: 0.845640
Iteration 17, loss = 0.29909437
Validation score: 0.848980
Iterat

MLPClassifier(alpha=1e-05, early_stopping=True,
              hidden_layer_sizes=(128, 64, 32), max_iter=150, verbose=1)

In [7]:
# Deney setinde %97, test setinde %95'e yakin basari
print("Train acc:\t{0:.3f}".format(clf.score(X_train, y_train)))
print("Test acc:\t{0:.3f}".format(clf.score(X_test, y_test)))

print(classification_report(y_train, clf.predict(X_train)))
print(classification_report(y_test, clf.predict(X_test)))

Train acc:	0.967
Test acc:	0.949
              precision    recall  f1-score   support

       False       0.98      0.95      0.97     28439
        True       0.95      0.98      0.97     28439

    accuracy                           0.97     56878
   macro avg       0.97      0.97      0.97     56878
weighted avg       0.97      0.97      0.97     56878

              precision    recall  f1-score   support

       False       0.98      0.95      0.97      7106
        True       0.82      0.94      0.88      1715

    accuracy                           0.95      8821
   macro avg       0.90      0.95      0.92      8821
weighted avg       0.95      0.95      0.95      8821



In [8]:
# Pre-trained BERT kullanarak vektore cevirme
# Tek seferlik bir yukleme yapicak
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")
bert = AutoModel.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")

In [9]:
# Cumleleri vektore cevirirken eksik kisimlarda maximum uzunluga gore padding yapilmistir
# Max uzunluk sinkaf datasi icin 113
MAX_SENTENCE_TOKEN_LENGTH = 113

In [10]:
# Offensive? - Kufur mu?

test = [
    "lanet olasica pislik", 
    "tanisalim mi tatlim", 
    "yaz transfer sezonuna lionel messi damga vuracak gibi gözüküyor", 
    "dal sarkar kartal kalkar",
    "amk cocugu",
    "aq bebesindeki havaya bak sen",
    "kral cocuk bizim alper",
    "erol bulut istifa",
    "sen kendini ne saniyorsun kopek"]

In [11]:
tokenized = [tokenizer.encode(s, add_special_tokens=True) for s in test]
padded = np.array([s + [0]*(MAX_SENTENCE_TOKEN_LENGTH-len(s)) for s in tokenized])
input_ids = torch.tensor(np.array(padded)).to(torch.int64)

In [12]:
# Cumleleri vektore cevirme
def sentence_2_vec(input_id):
    with torch.no_grad():
        last_hidden_states = bert(input_id)
        features = last_hidden_states[0][:,0,:].numpy()
    return features

In [13]:
test_vector = sentence_2_vec(input_ids)
clf.predict(test_vector)

array([ True, False, False, False,  True,  True, False, False,  True])

In [14]:
import joblib

#dump it (higher), dump it (higher!)
joblib.dump(clf, "sinkaf/data/clf_nn.joblib")

['sinkaf/data/clf_nn.joblib']