In [1]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt

In [2]:
# Load the tab-separated dataset into a DataFrame; later cells read its
# 'text' and 'label' columns.
data = pd.read_csv("sinkaf/data/troff-v1.0.tsv",  sep='\t')

# Translation table that deletes every character in string.punctuation
# (ASCII punctuation only) when passed to str.translate.
table = str.maketrans(dict.fromkeys(string.punctuation))

def lower(s):
    """Return ``s`` converted to lowercase via ``str.lower``."""
    lowered = s.lower()
    return lowered

def removeUserNames(s):
    """Strip @-mentions from ``s``.

    Every ``@`` together with the run of non-whitespace characters that
    follows it is deleted; surrounding whitespace is left untouched.

    Args:
        s: Input text.

    Returns:
        The text with all ``@...`` tokens removed.
    """
    # Raw string: '\s' / '\S' in a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    return re.sub(r'@\S+', '', s)

def removeNumbers(s):
    """Delete every ASCII digit (0-9) from ``s`` and return the result."""
    digit_pattern = re.compile('[0-9]')
    return digit_pattern.sub('', s)

def removePunctuation(s):
    """Remove punctuation from each whitespace-separated token of ``s``.

    Tokens are translated with the module-level ``table`` and re-joined with
    single spaces. A token made up solely of punctuation becomes an empty
    string, so it leaves two adjacent spaces in the result.
    """
    stripped_tokens = (token.translate(table) for token in s.split())
    return " ".join(stripped_tokens)

def nStemmer(s, n):
    """Truncate every whitespace-separated token of ``s`` to ``n`` characters.

    This is a crude "first-n-characters" stemmer: tokens shorter than ``n``
    are kept whole, and the tokens are re-joined with single spaces.

    Args:
        s: Input text.
        n: Maximum number of leading characters to keep per token; must be
            greater than zero.

    Returns:
        The space-joined truncated tokens.

    Raises:
        ValueError: If ``n`` is not greater than zero.
    """
    # Validate up front; ValueError is a subclass of Exception, so callers
    # that catch Exception keep working.
    if not n > 0:
        raise ValueError("n must be a positive integer!")
    return " ".join(token[:n] for token in s.split())


def preProcess(s, n=5):
    """Run the full text-cleaning pipeline on ``s``.

    Steps, in order: lowercase, drop @-mentions, drop digits, drop
    punctuation, then truncate every token to its first ``n`` characters.
    """
    cleaned = lower(s)
    cleaned = removeUserNames(cleaned)
    cleaned = removeNumbers(cleaned)
    cleaned = removePunctuation(cleaned)
    return nStemmer(cleaned, n)

# Clean every comment in place, keeping the first 10 characters of each token.
data['text'] = data['text'].apply(preProcess, n=10)

In [3]:
# Build the binary training target: True for every row whose 'label' is
# anything other than 'non', False for the 'non' rows.
data['label_sinkaf'] = data['label'].ne('non')

In [4]:
# Class balance of the binary target:
# ~6.8K offensive comments (True) vs. ~28K non-offensive comments (False)
data['label_sinkaf'].value_counts()

False    28439
True      6845
Name: label_sinkaf, dtype: int64

In [5]:
# Fetch the Turkish stop-word list (header=None: the first line is treated as
# data, not a header) and keep its first column as a NumPy array.
stop_words_tr = pd.read_csv(
    "https://raw.githubusercontent.com/ahmetax/trstop/master/dosyalar/turkce-stop-words",
    header=None,
)[0].to_numpy()

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


In [7]:
# TF-IDF features over the cleaned text; Turkish stop words are dropped and
# terms that appear in fewer than 2 documents are ignored (min_df=2).
vectorizer = TfidfVectorizer(stop_words=frozenset(stop_words_tr), min_df=2)

X = data['text']
y = data['label_sinkaf']

# Fixed random_state so the train/test split (and every metric computed from
# it) is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit the vocabulary on the training split only, then reuse it for the test
# split so no information from the test set leaks into the features.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Number of features. vocabulary_ is available on every scikit-learn version,
# whereas get_feature_names() was removed in scikit-learn 1.2.
print(len(vectorizer.vocabulary_))

26740


In [8]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with synthetic samples until both classes are
# the same size (ratio 1). random_state makes the resampled set reproducible.
smote = SMOTE(sampling_strategy=1, random_state=42)
# fit_resample is the supported method name; the older fit_sample alias was
# removed in imbalanced-learn 0.8.
X_sm, y_sm = smote.fit_resample(X_train, y_train)

y_sm.value_counts()

False    21354
True     21354
Name: label_sinkaf, dtype: int64

In [9]:
# Base classifier: multinomial Naive Bayes. (A LinearSVC used to be assigned
# here and was immediately overwritten, so that dead assignment is removed.)
model = MultinomialNB()
# Wrap the base classifier so predict_proba returns calibrated probabilities.
# The estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2; the positional form
# works with both spellings.
cclf = CalibratedClassifierCV(model)
cclf.fit(X_sm, y_sm)

CalibratedClassifierCV(base_estimator=MultinomialNB())

In [10]:

print(cclf.get_params())

# Accuracy on the original (non-oversampled) training split and on the
# held-out test split.
train_accuracy = cclf.score(X_train, y_train)
test_accuracy = cclf.score(X_test, y_test)
print("Train acc:\t{0:.3f}".format(train_accuracy))
print("Test acc:\t{0:.3f}".format(test_accuracy))

# Per-class precision / recall / F1: training split first, then test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(classification_report(labels, cclf.predict(features)))


{'base_estimator__alpha': 1.0, 'base_estimator__class_prior': None, 'base_estimator__fit_prior': True, 'base_estimator': MultinomialNB(), 'cv': None, 'ensemble': True, 'method': 'sigmoid', 'n_jobs': None}
Train acc:	0.906
Test acc:	0.790
              precision    recall  f1-score   support

       False       0.98      0.91      0.94     21354
        True       0.70      0.91      0.79      5109

    accuracy                           0.91     26463
   macro avg       0.84      0.91      0.86     26463
weighted avg       0.92      0.91      0.91     26463

              precision    recall  f1-score   support

       False       0.90      0.83      0.86      7085
        True       0.47      0.61      0.53      1736

    accuracy                           0.79      8821
   macro avg       0.69      0.72      0.70      8821
weighted avg       0.81      0.79      0.80      8821



In [28]:
# Sanity check: is each of these phrases classified as offensive?
test = [
    "cok iyi",
    "bi git",
    "bi siktir git",
    "bi defol",
    "mukemmel bir insansin",
]

# Apply the same cleaning (10-character stems) that was applied to the
# training text before projecting the phrases onto the fitted vocabulary.
test_processed = vectorizer.transform([preProcess(phrase, 10) for phrase in test])
cclf.predict(test_processed)

array([ True,  True])

In [12]:
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

# Second column of predict_proba, i.e. one probability per test phrase
# (presumably the probability of the True/offensive class - confirm against cclf.classes_).
cclf.predict_proba(test_processed)[:,1]

array([0.0353452 , 0.94855021, 0.99565792, 0.98383419, 0.06143363])

In [13]:
import joblib

# Persist the fitted vectorizer and the calibrated classifier to disk so they
# can be reloaded without retraining.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(vectorizer, "sinkaf/data/vectorizer.joblib")
joblib.dump(cclf, "sinkaf/data/model.joblib")


['sinkaf/data/model.joblib']