In [26]:
# -*-coding:utf-8-*-
import os

import numpy as np
from sklearn.cross_validation import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import confusion_matrix

import bald_latin

### Load the original and stemmed comments, and their labels.
Remove the cyrillic comments and remove accents from ć,č,ž,š,đ.

In [2]:
originals = open('dataset/lns_comments.txt', 'r').readlines()
corpus = open('dataset/lns_stemmed.txt', 'r').readlines()
labels = open('dataset/lns_labels.txt', 'r').readlines()

# remove cyrillic and balden text
corpus, labels = bald_latin.remove_cyrillic_comments(corpus, labels)
originals, _ = bald_latin.remove_cyrillic_comments(originals, range(len(originals)))
corpus = bald_latin.remove_serbian_accents(corpus)

# labels as a numpy array
labels = np.array([int(float(x)) for x in labels])

Cyrillic comments make up 1.24624624625 percent
Cyrillic comments make up 1.24643339841 percent


## Vectorize each comment.
Remove stop words and perform TF-IDF on the comments. Convert the sparse matrix to a dense one.

In [3]:
croatian_stop_words = set([u"a",u"ako",u"ali",u"bi",u"bih",u"bila",u"bili",u"bilo",u"bio",u"bismo",u"biste",u"biti",u"bumo",u"da",u"do",u"duž",u"ga",u"hoće",u"hoćemo",u"hoćete",u"hoćeš",u"hoću",u"i",u"iako",u"ih",u"ili",u"iz",u"ja",u"je",u"jedna",u"jedne",u"jedno",u"jer",u"jesam",u"jesi",u"jesmo",u"jest",u"jeste",u"jesu",u"jim",u"joj",u"još",u"ju",u"kada",u"kako",u"kao",u"koja",u"koje",u"koji",u"kojima",u"koju",u"kroz",u"li",u"me",u"mene",u"meni",u"mi",u"mimo",u"moj",u"moja",u"moje",u"mu",u"na",u"nad",u"nakon",u"nam",u"nama",u"nas",u"naš",u"naša",u"naše",u"našeg",u"ne",u"nego",u"neka",u"neki",u"nekog",u"neku",u"nema",u"netko",u"neće",u"nećemo",u"nećete",u"nećeš",u"neću",u"nešto",u"ni",u"nije",u"nikoga",u"nikoje",u"nikoju",u"nisam",u"nisi",u"nismo",u"niste",u"nisu",u"njega",u"njegov",u"njegova",u"njegovo",u"njemu",u"njezin",u"njezina",u"njezino",u"njih",u"njihov",u"njihova",u"njihovo",u"njim",u"njima",u"njoj",u"nju",u"no",u"o",u"od",u"odmah",u"on",u"ona",u"oni",u"ono",u"ova",u"pa",u"pak",u"po",u"pod",u"pored",u"prije",u"s",u"sa",u"sam",u"samo",u"se",u"sebe",u"sebi",u"si",u"smo",u"ste",u"su",u"sve",u"svi",u"svog",u"svoj",u"svoja",u"svoje",u"svom",u"ta",u"tada",u"taj",u"tako",u"te",u"tebe",u"tebi",u"ti",u"to",u"toj",u"tome",u"tu",u"tvoj",u"tvoja",u"tvoje",u"u",u"uz",u"vam",u"vama",u"vas",u"vaš",u"vaša",u"vaše",u"već",u"vi",u"vrlo",u"za",u"zar",u"će",u"ćemo",u"ćete",u"ćeš",u"ću",u"što"])

# build tf-idf vectorizer which uses unigrams and bigrams.
# uses words with 2+ occurances as features
vectorizer = TfidfVectorizer(
    strip_accents="unicode",
    lowercase=True,
    ngram_range=(1, 2),
    min_df=10,
    norm='l2',
    smooth_idf=True,
    use_idf=True,
    stop_words=croatian_stop_words)

# vectorize the text, convert to dense matrix
X = vectorizer.fit_transform(corpus).todense()
y = labels

# Classify comments in the test set.
### Only classify comments with a high classification certainty. (greater than _cutoff_)

In [4]:
cutoff = 0.8
lower = 0.0
upper = 0.9

k_fold = KFold(X.shape[0], n_folds=10, shuffle=True)
for fold_count, (train, test) in enumerate(k_fold):
    X_train = X[train]
    y_train = labels[train]
    X_test = X[test] 
    y_test = labels[test]
    # original comments for the test set
    comments_test = [originals[x] for x in test]
    
    # create and fit the classifier
    clf = MultinomialNB().fit(X_train, y_train)
    
    # predict probabilities on the training set
    proba = clf.predict_proba(X_train)[:, 1]
    training_error = np.mean(np.round(proba) == np.round(y_train)) * 100
    # predict probabilities on the test set
    proba = clf.predict_proba(X_test)[:, 1]
    
    # get the classification error
#     indices = np.argwhere(np.logical_or(proba >= cutoff, proba <= 1 - cutoff))
    indices = np.argwhere(np.logical_or(proba >= upper, proba <= lower))
    y_pred = np.round(proba)
    count = np.sum(indices)
    print "\nFold #{}, classifying {} comments".format(fold_count, len(y_test))
    
#     print confusion_matrix(y_test, y_pred)
    print confusion_matrix(y_test[indices], y_pred[indices])
    
    
#     print "Classification accuracy: {:.2f}%, comments classified: {} ({:.2f}% overall)\n".format(accuracy, count, 100.0*count/len(y_test))
    
    # print out misclassified comments
#     for prob, label, text in zip(proba, y_test, originals):
#         # only take into account comments with high classification certainty.
#         # probability must fall in range [0, 1-cutoff] or [cutoff, 1] for the classification to be taken into account.
#         if prob >= cutoff or prob <= 1 - cutoff:
#             # if the comment is misclassified, print it
#             if np.round(prob) != label:
#                 cls = "bot" if prob > 0.5 else "not"
#                 real = "bot" if label > 0.5 else "not"
                
#                 print "Classified as '%s', label is '%s', comment: \n%s" % (cls, real, text)

    



Fold #0, classifying 658 comments
[[0 1]
 [0 5]]

Fold #1, classifying 658 comments
[[0 1]
 [0 7]]

Fold #2, classifying 658 comments
[[10]]

Fold #3, classifying 658 comments
[[6]]

Fold #4, classifying 658 comments
[[10]]

Fold #5, classifying 658 comments
[[10]]

Fold #6, classifying 657 comments
[[3]]

Fold #7, classifying 657 comments
[[4]]

Fold #8, classifying 657 comments
[[5]]

Fold #9, classifying 657 comments
[[13]]


In [5]:
max_scraped = 10**4
original_scraped = open('dataset/comments.txt', 'r').readlines()
scraped = open('dataset/stemmed.txt', 'r').readlines()
scraped, original_scraped = bald_latin.remove_cyrillic_comments(scraped, original_scraped)
scraped = bald_latin.remove_serbian_accents(scraped)

Cyrillic comments make up 2.22219306384 percent


In [31]:
clf = MultinomialNB().fit(X, labels)

import codecs
output = codecs.open('bot_clf.txt', 'w', 'utf-8')

threshold = 0.85

assert len(scraped) == len(original_scraped)
for comment, org in zip(scraped, original_scraped):
    X_next = vectorizer.transform([comment])
    y_next = clf.predict_proba(X_next)[0][1]
    
    if y_next > threshold:
        org = org.decode('utf-8')
        org = org.replace(u'\n', u' ')
        output.write(org + u'\n')
        
output.close()

In [29]:
output.close()