In [None]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

In [3]:
read_json = lambda x : pd.read_json(x, lines = True)

In [4]:
train_X = read_json("train_X_languages_homework.json.txt").text
train_y = read_json("train_y_languages_homework.json.txt").classification
test_X = read_json("test_X_languages_homework.json.txt").text
labels = train_y.unique().tolist()
pd_data = pd.concat([train_X, train_y], axis = 1)
print pd_data.size, pd_data.size
pd_data = pd_data.drop_duplicates()
train_X = pd_data.text.reset_index(drop=True)
train_y = pd_data.classification.reset_index(drop=True)

250782 250782


In [None]:
train_X.iloc[122765]

In [None]:
pd_data = pd.concat([train_X, train_y], axis = 1)
groupby_class = pd_data["text"].groupby(pd_data['classification'])

In [None]:
desc = groupby_class.describe()

In [None]:
desc[["count", "unique"]].plot.bar(figsize = (20, 20), fontsize = 20)

In [None]:
dup_counts = (desc["count"] - 1.0 * desc["unique"])
frac_dup = dup_counts/ desc["count"]
frac_dup.plot.bar(figsize = (20, 20), fontsize = 20)
# Overall data duplication percentage
print dup_counts.sum()/ desc["count"].sum()

In [None]:
avg_tokens_per_sent = groupby_class.agg({'text' : lambda x : sum(len(word_tokenize(y))*1.0/len(x) for y in x)})

In [None]:
avg_tokens_per_sent.plot.bar(figsize = (20, 20), fontsize = 20)

In [5]:
class TextTokenizer(BaseEstimator, TransformerMixin):
    def fit(self, x, y = None):
        return self
    
    def transform(self, sentences):
        features = []
        tokenizer = RegexpTokenizer('\w+')
        for sentence in sentences:
            features.append(" ".join(tokenizer.tokenize(sentence)))
        return features

class TextStats(BaseEstimator, TransformerMixin):
    def fit(self, x, y = None):
        return self
    def transform(self, sent_tokens):
        return [{'num_words': sent_toks.count(' ') + 1 } 
                for sent_toks in sent_tokens]

In [25]:
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose = 0)
kappa_scorer = make_scorer(cohen_kappa_score)
# detection_pipeline = Pipeline([
#     ('tokenizer', TextTokenizer()),
#     ('vect', TfidfVectorizer(ngram_range=(1,6), analyzer = 'char_wb')),
#     ('features', SelectKBest(score_func = chi2)),
#     ('clf', LogisticRegression(C = 21.5443469, 
#                                  solver = 'sag', 
#                                  multi_class = 'multinomial',
#                                  verbose = 1))
# ], memory = memory)

detection_pipeline = Pipeline([
    ('tokens', TextTokenizer()),
    
    ('features', FeatureUnion([
        ('vect', Pipeline([
            ('algo', CountVectorizer(ngram_range = (4, 6), analyzer='char_wb', token_pattern=r'\s*(.*?)\s',  max_features= 40000)),
            ('scale', Normalizer())
        ])),
        ('text_stats', Pipeline([
            ('stats', TextStats()),
            ('vect', DictVectorizer()),
            ('scale', Normalizer())
        ])),
    ])),
    ('clf', None),
], memory = memory)

In [None]:
kappa_scorer = make_scorer(cohen_kappa_score)
clf = GridSearchCV(estimator=detection_pipeline,
                  param_grid = {
#                       'features__vect__algo)' : (
# #                                    CountVectorizer(ngram_range = (4, 5), analyzer='char_wb'),
#                                    CountVectorizer(ngram_range = (1, 6), analyzer='char_wb', token_pattern=r'\s*(.*?)\s'),
#                                ),
                      'clf' : (
                                   SGDClassifier(loss = 'log',
#                                       multi_class= 'multinomial',
                                      tol = 1e-4,
                                      max_iter = 100,
                                      verbose = 1),
                              ),
                      'clf__class_weight': ('balanced',),
#                       'clf__solver': ('sag',),
#                       'clf__C': (1.0,),
#                       'features__vect__algo__max_features': (40000,),
                  },
                  cv = 5,
                  scoring = kappa_scorer,
                  return_train_score = True,
                  n_jobs = -1)
kf = StratifiedKFold(n_splits=2, shuffle = True, random_state = 5)
all_clfs = []
all_predicts = []
all_test = []
num = 1
for train_index, test_index in kf.split(train_X, train_y):
    X_train, X_test = train_X[train_index], train_X[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    print X_train.hasnans, y_train.hasnans
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print "=========== %d ==========" % num
    print "Cohen Kappa:", cohen_kappa_score(y_pred, y_test)
    print "Accuracy:", accuracy_score(y_test, y_pred)
    print classification_report(y_test, y_pred)
    print clf.best_params_
    print "========================="
    all_predicts.append(y_pred)
    all_test.append(y_test)
    all_clfs.append(clf)
    num += 1
    break
rmtree(cachedir)

False False
-- Epoch 1
Norm: 16.65, NNZs: 40001, Bias: -0.203309, T: 49097, Avg. loss: 0.084594
Total training time: 0.06 seconds.
-- Epoch 2
Norm: 16.62, NNZs: 40001, Bias: -0.231641, T: 98194, Avg. loss: 0.080215
-- Epoch 1
Total training time: 0.15 seconds.
-- Epoch 3
Norm: 16.71, NNZs: 40001, Bias: -0.201585, T: 49096, Avg. loss: 0.085473
Total training time: 0.04 seconds.
-- Epoch 2
Norm: 16.63, NNZs: 40001, Bias: -0.246690, T: 147291, Avg. loss: 0.080101
Total training time: 0.23 seconds.
-- Epoch 4
Norm: 16.69, NNZs: 40001, Bias: -0.228583, T: 98192, Avg. loss: 0.081503
Total training time: 0.12 seconds.
Norm: 16.62, NNZs: 40001, Bias: -0.256337, T: 196388, Avg. loss: 0.080172
-- Epoch 3
Total training time: 0.28 seconds.
Convergence after 4 epochs took 0.30 seconds
-- Epoch 1
Norm: 16.62, NNZs: 40001, Bias: -0.242740, T: 147288, Avg. loss: 0.081255
Total training time: 0.20 seconds.
Norm: 16.41, NNZs: 40001, Bias: -0.215766, T: 49097, Avg. loss: 0.057761
Total training time: 0.

Total training time: 0.37 seconds.
Convergence after 6 epochs took 0.37 seconds
-- Epoch 1
Norm: 16.13, NNZs: 40001, Bias: -0.211363, T: 49097, Avg. loss: 0.069349
Total training time: 0.05 seconds.
Norm: 18.45, NNZs: 40001, Bias: -0.303960, T: 98194, Avg. loss: 0.016330
Total training time: 0.13 seconds.
-- Epoch 2
-- Epoch 3
Norm: 17.78, NNZs: 40001, Bias: -0.252658, T: 49096, Avg. loss: 0.037225
Total training time: 0.06 seconds.
Norm: 15.95, NNZs: 40001, Bias: -0.237721, T: 98194, Avg. loss: 0.064285
Total training time: 0.10 seconds.
-- Epoch 2
Norm: 18.65, NNZs: 40001, Bias: -0.331834, T: 147291, Avg. loss: 0.016552
-- Epoch 3
Total training time: 0.18 seconds.
Convergence after 3 epochs took 0.18 seconds
-- Epoch 1
Norm: 16.92, NNZs: 40001, Bias: -0.276230, T: 98192, Avg. loss: 0.024586
Total training time: 0.12 seconds.
Norm: 15.93, NNZs: 40001, Bias: -0.252840, T: 147291, Avg. loss: 0.063943
Total training time: 0.16 seconds.
-- Epoch 4
Norm: 15.09, NNZs: 40001, Bias: -0.21527

Norm: 14.88, NNZs: 40001, Bias: -0.240410, T: 98194, Avg. loss: 0.066579
Total training time: 0.07 seconds.
-- Epoch 3
Total training time: 0.12 seconds.
-- Epoch 3
Norm: 14.09, NNZs: 40001, Bias: -0.212721, T: 49096, Avg. loss: 0.088965
Total training time: 0.08 seconds.
-- Epoch 2
Norm: 16.83, NNZs: 40001, Bias: -0.273802, T: 245485, Avg. loss: 0.053666
Total training time: 0.22 seconds.
Convergence after 5 epochs took 0.22 seconds
Norm: 15.89, NNZs: 40001, Bias: -0.255835, T: 147291, Avg. loss: 0.065360
Total training time: 0.11 seconds.
-- Epoch 1
-- Epoch 4
Norm: 14.89, NNZs: 40001, Bias: -0.256529, T: 147291, Avg. loss: 0.066385
Total training time: 0.17 seconds.
-- Epoch 4
Norm: 15.85, NNZs: 40001, Bias: -0.265593, T: 196388, Avg. loss: 0.065097
Norm: 13.96, NNZs: 40001, Bias: -0.237953, T: 98192, Avg. loss: 0.083055
Norm: 13.16, NNZs: 40001, Bias: -0.213422, T: 49097, Avg. loss: 0.114475
Total training time: 0.05 seconds.
Total training time: 0.16 seconds.
-- Epoch 2
Total trai

-- Epoch 5
-- Epoch 5
Total training time: 0.25 seconds.
-- Epoch 4
Norm: 17.01, NNZs: 40001, Bias: -0.257419, T: 147291, Avg. loss: 0.032667
Total training time: 0.33 seconds.
-- Epoch 4
Norm: 16.66, NNZs: 40001, Bias: -0.274381, T: 245485, Avg. loss: 0.055259
Total training time: 0.32 seconds.
Norm: 18.23, NNZs: 40001, Bias: -0.353356, T: 245485, Avg. loss: 0.016155
Total training time: 0.47 seconds.
-- Epoch 6
Convergence after 5 epochs took 0.47 seconds
Norm: 13.18, NNZs: 40001, Bias: -0.268327, T: 196384, Avg. loss: 0.109843
Total training time: 0.36 seconds.
Convergence after 4 epochs took 0.36 seconds
-- Epoch 1
Norm: 17.03, NNZs: 40001, Bias: -0.272130, T: 196388, Avg. loss: 0.032581
Total training time: 0.41 seconds.
-- Epoch 1
Convergence after 4 epochs took 0.41 seconds
-- Epoch 1
Norm: 16.62, NNZs: 40001, Bias: -0.280100, T: 294582, Avg. loss: 0.055315
Total training time: 0.39 seconds.
Norm: 15.00, NNZs: 40001, Bias: -0.221813, T: 49096, Avg. loss: 0.059343
Convergence aft

-- Epoch 1
-- Epoch 2
Convergence after 5 epochs took 0.27 seconds
-- Epoch 1
Norm: 14.49, NNZs: 40001, Bias: -0.253954, T: 147291, Avg. loss: 0.121852
Total training time: 0.20 seconds.
-- Epoch 4
Norm: 16.98, NNZs: 40001, Bias: -0.210637, T: 49097, Avg. loss: 0.059806
Norm: 17.31, NNZs: 40001, Bias: -0.221216, T: 49096, Avg. loss: 0.050001
Total training time: 0.05 seconds.
Norm: 15.22, NNZs: 40001, Bias: -0.236935, T: 98194, Avg. loss: 0.096532
Total training time: 0.04 seconds.
Total training time: 0.11 seconds.
-- Epoch 2
-- Epoch 3
-- Epoch 2
Norm: 14.47, NNZs: 40001, Bias: -0.264570, T: 196388, Avg. loss: 0.121909
Total training time: 0.24 seconds.
Convergence after 4 epochs took 0.25 seconds
Norm: 17.13, NNZs: 40001, Bias: -0.247655, T: 98192, Avg. loss: 0.044929
-- Epoch 1
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 16.70, NNZs: 40001, Bias: -0.236249, T: 98194, Avg. loss: 0.055704
Total training time: 0.10 seconds.
-- Epoch 3
Norm: 15.21, NNZs: 40001, Bias: -0.253040,

Norm: 13.58, NNZs: 40001, Bias: -0.219265, T: 49096, Avg. loss: 0.069731
Convergence after 3 epochs took 0.20 seconds
Total training time: 0.04 seconds.
Total training time: 0.08 seconds.
-- Epoch 2
-- Epoch 1
-- Epoch 3
Norm: 13.74, NNZs: 40001, Bias: -0.245914, T: 98192, Avg. loss: 0.065023
Total training time: 0.09 seconds.
-- Epoch 3
Norm: 13.83, NNZs: 40001, Bias: -0.219355, T: 49097, Avg. loss: 0.066466
Norm: 16.07, NNZs: 40001, Bias: -0.243742, T: 98194, Avg. loss: 0.045229
Total training time: 0.16 seconds.
-- Epoch 3
Total training time: 0.04 seconds.
Norm: 14.43, NNZs: 40001, Bias: -0.255474, T: 147291, Avg. loss: 0.120818
-- Epoch 2
Total training time: 0.17 seconds.
Norm: 13.78, NNZs: 40001, Bias: -0.262833, T: 147288, Avg. loss: 0.064319
-- Epoch 4
Total training time: 0.15 seconds.
-- Epoch 4
Norm: 13.85, NNZs: 40001, Bias: -0.246113, T: 98194, Avg. loss: 0.060060
Norm: 16.02, NNZs: 40001, Bias: -0.256851, T: 147291, Avg. loss: 0.044772
Total training time: 0.24 seconds.


Total training time: 0.20 seconds.
-- Epoch 4
-- Epoch 6
Norm: 16.36, NNZs: 40001, Bias: -0.268039, T: 196388, Avg. loss: 0.047445
Total training time: 0.29 seconds.
Norm: 16.01, NNZs: 40001, Bias: -0.240770, T: 98194, Avg. loss: 0.045302
Total training time: 0.15 seconds.
-- Epoch 3
Norm: 13.79, NNZs: 40001, Bias: -0.286830, T: 294582, Avg. loss: 0.065423
-- Epoch 5
Norm: 13.79, NNZs: 40001, Bias: -0.270911, T: 196384, Avg. loss: 0.058909
Total training time: 0.27 seconds.
Convergence after 6 epochs took 0.30 seconds
Total training time: 0.28 seconds.
Convergence after 4 epochs took 0.28 seconds
-- Epoch 1
Norm: 16.00, NNZs: 40001, Bias: -0.257261, T: 147291, Avg. loss: 0.044890
-- Epoch 1
Total training time: 0.21 seconds.
-- Epoch 4
Norm: 16.40, NNZs: 40001, Bias: -0.278963, T: 245485, Avg. loss: 0.047513
Total training time: 0.38 seconds.
Convergence after 5 epochs took 0.39 seconds
Norm: 15.69, NNZs: 40001, Bias: -0.210239, T: 49096, Avg. loss: 0.070934
Norm: 16.49, NNZs: 40001, B

Convergence after 7 epochs took 0.63 seconds
Norm: 13.20, NNZs: 40001, Bias: -0.268292, T: 245485, Avg. loss: 0.054986
Total training time: 0.12 seconds.
Norm: 13.92, NNZs: 40001, Bias: -0.285727, T: 245485, Avg. loss: 0.064604
Total training time: 0.38 seconds.
-- Epoch 6
Total training time: 0.34 seconds.
-- Epoch 2
-- Epoch 6
-- Epoch 1
Norm: 13.85, NNZs: 40001, Bias: -0.290726, T: 294582, Avg. loss: 0.064509
Norm: 15.48, NNZs: 40001, Bias: -0.231021, T: 98192, Avg. loss: 0.116036
Total training time: 0.44 seconds.
Total training time: 0.20 seconds.
Convergence after 6 epochs took 0.45 seconds
-- Epoch 3
Norm: 13.24, NNZs: 40001, Bias: -0.276852, T: 294582, Avg. loss: 0.055022
Total training time: 0.43 seconds.
Convergence after 6 epochs took 0.44 seconds
Norm: 15.95, NNZs: 40001, Bias: -0.215740, T: 49097, Avg. loss: 0.070202
-- Epoch 1
Total training time: 0.10 seconds.
-- Epoch 2
-- Epoch 1
Norm: 15.75, NNZs: 40001, Bias: -0.241934, T: 98194, Avg. loss: 0.065457
Norm: 15.54, NNZs

-- Epoch 3
Norm: 12.17, NNZs: 40001, Bias: -0.216599, T: 49096, Avg. loss: 0.050697
Norm: 13.80, NNZs: 40001, Bias: -0.283762, T: 294582, Avg. loss: 0.057177
Total training time: 0.05 seconds.
Norm: 12.64, NNZs: 40001, Bias: -0.273425, T: 245485, Avg. loss: 0.048068
-- Epoch 2
-- Epoch 7
Total training time: 0.35 seconds.
Convergence after 5 epochs took 0.35 seconds
Norm: 15.53, NNZs: 40001, Bias: -0.249700, T: 147291, Avg. loss: 0.112882
Total training time: 0.42 seconds.
-- Epoch 1
Total training time: 0.21 seconds.
-- Epoch 4
Norm: 12.14, NNZs: 40001, Bias: -0.241366, T: 98192, Avg. loss: 0.046794
Total training time: 0.11 seconds.
-- Epoch 3
Norm: 13.81, NNZs: 40001, Bias: -0.290235, T: 343679, Avg. loss: 0.057164
Norm: 15.21, NNZs: 40001, Bias: -0.226177, T: 49097, Avg. loss: 0.051897
Total training time: 0.06 seconds.
Total training time: 0.47 seconds.
-- Epoch 2
Norm: 12.13, NNZs: 40001, Bias: -0.256813, T: 147288, Avg. loss: 0.046416
Norm: 15.46, NNZs: 40001, Bias: -0.258416, T

Total training time: 0.39 seconds.
Convergence after 5 epochs took 0.39 seconds
Total training time: 0.32 seconds.
-- Epoch 1
Convergence after 4 epochs took 0.34 seconds
Norm: 12.87, NNZs: 40001, Bias: -0.237628, T: 98194, Avg. loss: 0.048145
Total training time: 0.08 seconds.
-- Epoch 3
-- Epoch 1
Norm: 15.26, NNZs: 40001, Bias: -0.226946, T: 49097, Avg. loss: 0.045931
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 12.90, NNZs: 40001, Bias: -0.251925, T: 147291, Avg. loss: 0.047955
Total training time: 0.12 seconds.
-- Epoch 4
Norm: 15.50, NNZs: 40001, Bias: -0.222098, T: 49096, Avg. loss: 0.044160
Norm: 15.58, NNZs: 40001, Bias: -0.206140, T: 49097, Avg. loss: 0.118320
Total training time: 0.06 seconds.
-- Epoch 2
Total training time: 0.07 seconds.
-- Epoch 2
Norm: 15.34, NNZs: 40001, Bias: -0.254454, T: 98194, Avg. loss: 0.040850
Total training time: 0.10 seconds.
-- Epoch 3
Norm: 12.84, NNZs: 40001, Bias: -0.261481, T: 196388, Avg. loss: 0.047731
Total training time: 0.17 sec

Total training time: 0.09 seconds.
-- Epoch 3
Norm: 13.40, NNZs: 40001, Bias: -0.259654, T: 147288, Avg. loss: 0.071421
Norm: 13.56, NNZs: 40001, Bias: -0.244906, T: 98194, Avg. loss: 0.071431
Total training time: 0.09 seconds.
-- Epoch 3
Norm: 16.66, NNZs: 40001, Bias: -0.256288, T: 147291, Avg. loss: 0.046866
Total training time: 0.13 seconds.
-- Epoch 4
Total training time: 0.15 seconds.
Norm: 17.15, NNZs: 40001, Bias: -0.271062, T: 196388, Avg. loss: 0.033583
-- Epoch 4
Total training time: 0.22 seconds.
Norm: 16.59, NNZs: 40001, Bias: -0.266596, T: 196388, Avg. loss: 0.046649
Norm: 13.52, NNZs: 40001, Bias: -0.259576, T: 147291, Avg. loss: 0.070702
Total training time: 0.17 seconds.
-- Epoch 5
Convergence after 4 epochs took 0.24 seconds
Total training time: 0.14 seconds.
Norm: 13.45, NNZs: 40001, Bias: -0.272868, T: 196384, Avg. loss: 0.071026
Total training time: 0.19 seconds.
-- Epoch 4
-- Epoch 5
-- Epoch 1
Norm: 16.61, NNZs: 40001, Bias: -0.276459, T: 245485, Avg. loss: 0.046

-- Epoch 3
Norm: 14.67, NNZs: 40001, Bias: -0.215949, T: 49097, Avg. loss: 0.088311
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 14.63, NNZs: 40001, Bias: -0.251904, T: 147288, Avg. loss: 0.082648
Total training time: 0.13 seconds.
-- Epoch 4
Norm: 16.96, NNZs: 40001, Bias: -0.216686, T: 49097, Avg. loss: 0.051228
Total training time: 0.06 seconds.
-- Epoch 2
Norm: 14.56, NNZs: 40001, Bias: -0.240224, T: 98194, Avg. loss: 0.082872
Total training time: 0.10 seconds.
-- Epoch 3
Norm: 14.69, NNZs: 40001, Bias: -0.265103, T: 196384, Avg. loss: 0.082553
Total training time: 0.16 seconds.
Convergence after 4 epochs took 0.17 seconds
-- Epoch 1
Norm: 13.53, NNZs: 40001, Bias: -0.259286, T: 147291, Avg. loss: 0.071124
Total training time: 0.22 seconds.
Norm: 14.56, NNZs: 40001, Bias: -0.256029, T: 147291, Avg. loss: 0.082264
Norm: 14.73, NNZs: 40001, Bias: -0.202143, T: 49096, Avg. loss: 0.107448
-- Epoch 4
Total training time: 0.04 seconds.
-- Epoch 2
Norm: 16.85, NNZs: 40001, Bias: -0

Total training time: 0.21 seconds.
Convergence after 5 epochs took 0.21 seconds
Norm: 15.64, NNZs: 40001, Bias: -0.276118, T: 196384, Avg. loss: 0.037873
-- Epoch 1
Total training time: 0.32 seconds.
-- Epoch 5
Norm: 14.57, NNZs: 40001, Bias: -0.269439, T: 196388, Avg. loss: 0.082039
Total training time: 0.23 seconds.
Convergence after 4 epochs took 0.23 seconds
Norm: 15.43, NNZs: 40001, Bias: -0.222934, T: 49097, Avg. loss: 0.056216
Norm: 15.95, NNZs: 40001, Bias: -0.284707, T: 245485, Avg. loss: 0.058459
Total training time: 0.04 seconds.
-- Epoch 1
Total training time: 0.37 seconds.
-- Epoch 2
Convergence after 5 epochs took 0.38 seconds
Norm: 15.55, NNZs: 40001, Bias: -0.281039, T: 245480, Avg. loss: 0.037719
Total training time: 0.38 seconds.
-- Epoch 6
-- Epoch 1
Norm: 14.64, NNZs: 40001, Bias: -0.200569, T: 49097, Avg. loss: 0.107271
Total training time: 0.04 seconds.
-- Epoch 2
Norm: 15.18, NNZs: 40001, Bias: -0.241057, T: 98194, Avg. loss: 0.050127
Total training time: 0.11 se

Total training time: 0.35 seconds.
Convergence after 6 epochs took 0.36 seconds
-- Epoch 4
Norm: 15.47, NNZs: 40001, Bias: -0.241354, T: 98194, Avg. loss: 0.058949
-- Epoch 1
Norm: 16.25, NNZs: 40001, Bias: -0.228003, T: 49097, Avg. loss: 0.045421
Total training time: 0.10 seconds.
Total training time: 0.04 seconds.
-- Epoch 3
-- Epoch 2
Norm: 14.33, NNZs: 40001, Bias: -0.215410, T: 49096, Avg. loss: 0.050421
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 15.49, NNZs: 40001, Bias: -0.257477, T: 147291, Avg. loss: 0.058376
Total training time: 0.15 seconds.
-- Epoch 4
Norm: 13.45, NNZs: 40001, Bias: -0.262243, T: 196388, Avg. loss: 0.048630
Norm: 16.08, NNZs: 40001, Bias: -0.251323, T: 98194, Avg. loss: 0.038320
Total training time: 0.10 seconds.
Total training time: 0.29 seconds.
Norm: 14.00, NNZs: 40001, Bias: -0.244970, T: 98192, Avg. loss: 0.046230
-- Epoch 3
Convergence after 4 epochs took 0.30 seconds
Total training time: 0.09 seconds.
-- Epoch 3
Norm: 15.43, NNZs: 40001, Bia

-- Epoch 3
Norm: 17.84, NNZs: 40001, Bias: -0.271756, T: 147291, Avg. loss: 0.029655
Total training time: 0.19 seconds.
Convergence after 3 epochs took 0.19 seconds
Norm: 16.03, NNZs: 40001, Bias: -0.274359, T: 245485, Avg. loss: 0.052960
-- Epoch 1
Norm: 13.76, NNZs: 40001, Bias: -0.214050, T: 49097, Avg. loss: 0.083807
Total training time: 0.07 seconds.
Total training time: 0.36 seconds.
Convergence after 5 epochs took 0.37 seconds
-- Epoch 2
-- Epoch 1
Norm: 16.79, NNZs: 40001, Bias: -0.275941, T: 147288, Avg. loss: 0.024741
Total training time: 0.13 seconds.
Convergence after 3 epochs took 0.13 seconds
-- Epoch 1
Norm: 16.56, NNZs: 40001, Bias: -0.205478, T: 49097, Avg. loss: 0.079379
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 13.74, NNZs: 40001, Bias: -0.238746, T: 98194, Avg. loss: 0.077589
Norm: 18.39, NNZs: 40001, Bias: -0.224453, T: 49096, Avg. loss: 0.036021
Total training time: 0.13 seconds.
Total training time: 0.04 seconds.
-- Epoch 3
-- Epoch 2
Norm: 15.21, NNZs:

-- Epoch 1
Norm: 15.35, NNZs: 40001, Bias: -0.275097, T: 294582, Avg. loss: 0.068320
Total training time: 0.35 seconds.
Convergence after 6 epochs took 0.35 seconds
Norm: 17.02, NNZs: 40001, Bias: -0.328272, T: 294582, Avg. loss: 0.018530
Total training time: 0.31 seconds.
-- Epoch 1
-- Epoch 7
Norm: 15.03, NNZs: 40001, Bias: -0.278391, T: 245485, Avg. loss: 0.049209
Total training time: 0.36 seconds.
-- Epoch 6
Norm: 16.95, NNZs: 40001, Bias: -0.330699, T: 343679, Avg. loss: 0.018455
Total training time: 0.34 seconds.
Convergence after 7 epochs took 0.35 seconds
Norm: 17.95, NNZs: 40001, Bias: -0.249828, T: 49096, Avg. loss: 0.026134
Total training time: 0.06 seconds.
-- Epoch 2
-- Epoch 1
Norm: 15.79, NNZs: 40001, Bias: -0.202627, T: 49097, Avg. loss: 0.091353
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 17.43, NNZs: 40001, Bias: -0.281629, T: 98192, Avg. loss: 0.019836
Total training time: 0.10 seconds.
-- Epoch 3
Norm: 8.51, NNZs: 40001, Bias: -0.202449, T: 49097, Avg. loss:

-- Epoch 3
Total training time: 0.16 seconds.
-- Epoch 4
Norm: 16.73, NNZs: 40001, Bias: -0.284667, T: 147291, Avg. loss: 0.024513
Norm: 17.35, NNZs: 40001, Bias: -0.307401, T: 196388, Avg. loss: 0.023447
Total training time: 0.22 seconds.
Convergence after 4 epochs took 0.22 seconds
Total training time: 0.19 seconds.
-- Epoch 1
-- Epoch 4
Norm: 17.75, NNZs: 40001, Bias: -0.271007, T: 49097, Avg. loss: 0.030169
Norm: 16.59, NNZs: 40001, Bias: -0.293238, T: 196388, Avg. loss: 0.024347
Total training time: 0.05 seconds.
-- Epoch 2
Total training time: 0.25 seconds.
-- Epoch 5
Norm: 17.19, NNZs: 40001, Bias: -0.290027, T: 98194, Avg. loss: 0.019666
Total training time: 0.09 seconds.
-- Epoch 3
Norm: 16.54, NNZs: 40001, Bias: -0.301647, T: 245485, Avg. loss: 0.024358
Total training time: 0.29 seconds.
Convergence after 5 epochs took 0.30 seconds
-- Epoch 1
Norm: 17.14, NNZs: 40001, Bias: -0.309960, T: 147291, Avg. loss: 0.019346
Total training time: 0.13 seconds.
-- Epoch 4
Norm: 18.48, NN

-- Epoch 5
Norm: 15.90, NNZs: 40001, Bias: -0.273463, T: 245485, Avg. loss: 0.064247
Total training time: 0.14 seconds.
Convergence after 5 epochs took 0.14 seconds
-- Epoch 1
Norm: 14.22, NNZs: 40001, Bias: -0.221180, T: 49097, Avg. loss: 0.088308
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 14.14, NNZs: 40001, Bias: -0.246897, T: 98194, Avg. loss: 0.082784
Total training time: 0.05 seconds.
-- Epoch 3
Norm: 14.11, NNZs: 40001, Bias: -0.263014, T: 147291, Avg. loss: 0.082012
Total training time: 0.07 seconds.
-- Epoch 4
Norm: 14.05, NNZs: 40001, Bias: -0.273374, T: 196388, Avg. loss: 0.081668
Total training time: 0.10 seconds.
-- Epoch 5
Norm: 14.10, NNZs: 40001, Bias: -0.283586, T: 245485, Avg. loss: 0.081694
Total training time: 0.12 seconds.
Convergence after 5 epochs took 0.12 seconds
-- Epoch 1
Norm: 17.02, NNZs: 40001, Bias: -0.245163, T: 49097, Avg. loss: 0.035533
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 16.56, NNZs: 40001, Bias: -0.269391, T: 98194, Avg. loss

Total training time: 0.05 seconds.
-- Epoch 3
Norm: 17.05, NNZs: 40001, Bias: -0.256725, T: 147291, Avg. loss: 0.043688
Total training time: 0.08 seconds.
-- Epoch 4
Norm: 17.07, NNZs: 40001, Bias: -0.270805, T: 196388, Avg. loss: 0.043630
Total training time: 0.10 seconds.
Convergence after 4 epochs took 0.11 seconds
-- Epoch 1
Norm: 15.51, NNZs: 40001, Bias: -0.213332, T: 49097, Avg. loss: 0.102804
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 15.24, NNZs: 40001, Bias: -0.235146, T: 98194, Avg. loss: 0.096973
Total training time: 0.05 seconds.
-- Epoch 3
Norm: 15.20, NNZs: 40001, Bias: -0.250715, T: 147291, Avg. loss: 0.096549
Total training time: 0.08 seconds.
-- Epoch 4
Norm: 15.19, NNZs: 40001, Bias: -0.262227, T: 196388, Avg. loss: 0.096337
Total training time: 0.10 seconds.
-- Epoch 5
Norm: 15.23, NNZs: 40001, Bias: -0.272824, T: 245485, Avg. loss: 0.096128
Total training time: 0.12 seconds.
-- Epoch 6
Norm: 15.19, NNZs: 40001, Bias: -0.278400, T: 294582, Avg. loss: 0.0962

Total training time: 0.11 seconds.
Convergence after 4 epochs took 0.11 seconds
-- Epoch 1
Norm: 15.62, NNZs: 40001, Bias: -0.229619, T: 49097, Avg. loss: 0.049475
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 15.41, NNZs: 40001, Bias: -0.251304, T: 98194, Avg. loss: 0.044092
Total training time: 0.06 seconds.
-- Epoch 3
Norm: 15.26, NNZs: 40001, Bias: -0.265428, T: 147291, Avg. loss: 0.043250
Total training time: 0.08 seconds.
-- Epoch 4
Norm: 15.28, NNZs: 40001, Bias: -0.279234, T: 196388, Avg. loss: 0.043230
Total training time: 0.11 seconds.
Convergence after 4 epochs took 0.11 seconds
-- Epoch 1
Norm: 18.78, NNZs: 40001, Bias: -0.219013, T: 49097, Avg. loss: 0.051242
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 18.66, NNZs: 40001, Bias: -0.245819, T: 98194, Avg. loss: 0.047561
Total training time: 0.05 seconds.
-- Epoch 3
Norm: 18.60, NNZs: 40001, Bias: -0.260769, T: 147291, Avg. loss: 0.047357
Total training time: 0.08 seconds.
-- Epoch 4
Norm: 18.60, NNZs: 40001, Bi

Total training time: 0.21 seconds.
-- Epoch 6
Norm: 15.87, NNZs: 40001, Bias: -0.296019, T: 294582, Avg. loss: 0.038181
Total training time: 0.24 seconds.
Convergence after 6 epochs took 0.24 seconds
-- Epoch 1
Norm: 15.57, NNZs: 40001, Bias: -0.219365, T: 49097, Avg. loss: 0.055692
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 15.16, NNZs: 40001, Bias: -0.239998, T: 98194, Avg. loss: 0.049598
Total training time: 0.06 seconds.
-- Epoch 3
Norm: 15.11, NNZs: 40001, Bias: -0.257982, T: 147291, Avg. loss: 0.049113
Total training time: 0.10 seconds.
-- Epoch 4
Norm: 15.17, NNZs: 40001, Bias: -0.273185, T: 196388, Avg. loss: 0.048933
Total training time: 0.14 seconds.
-- Epoch 5
Norm: 15.11, NNZs: 40001, Bias: -0.279781, T: 245485, Avg. loss: 0.048950
Total training time: 0.18 seconds.
Convergence after 5 epochs took 0.19 seconds
-- Epoch 1
Norm: 16.21, NNZs: 40001, Bias: -0.208725, T: 49097, Avg. loss: 0.056390
Total training time: 0.04 seconds.
-- Epoch 2
Norm: 16.20, NNZs: 40001, B

Total training time: 0.08 seconds.
-- Epoch 4
Norm: 7.86, NNZs: 40001, Bias: -0.256498, T: 196388, Avg. loss: 0.068544
Total training time: 0.10 seconds.
-- Epoch 5
Norm: 7.81, NNZs: 40001, Bias: -0.263385, T: 245485, Avg. loss: 0.068460
Total training time: 0.13 seconds.
Convergence after 5 epochs took 0.13 seconds


In [None]:
clf.cv_results_

CountVectorizer, MultinominalNb, (4,5) , char_wb


=========== 1 ==========

Cohen Kappa: 0.753482789861

Accuracy: 0.759855201888


             precision    recall  f1-score   support

         ar       0.91      0.93      0.92      2072
         az       0.93      0.85      0.88       960
         be       0.98      0.83      0.90      1008
         bg       0.90      0.85      0.87      1387
         ca       0.88      0.70      0.78      1507
         ce       0.99      0.42      0.59       288
        ceb       1.00      0.23      0.37       154
         cs       0.87      0.82      0.84      1166
         da       0.65      0.77      0.70      1491
         de       0.65      0.92      0.76      2349
         el       0.96      0.88      0.92      1383
         en       0.42      0.92      0.58      2266
         eo       0.86      0.76      0.81      1005
         es       0.42      0.92      0.58      2599
         et       0.98      0.81      0.88       462
         eu       0.95      0.84      0.89       887
         fa       0.96      0.83      0.89       992
         fi       0.96      0.84      0.90       799
         fr       0.67      0.86      0.75      1990
         gl       0.84      0.32      0.46      1071
         he       1.00      0.91      0.96      1153
         hi       0.99      0.82      0.90      1073
         hr       0.73      0.11      0.19       934
         hu       0.97      0.88      0.92      1367
         hy       0.99      0.88      0.94      1132
         id       0.57      0.77      0.66       838
         it       0.66      0.90      0.76      2418
         ja       0.97      0.46      0.62       864
         ka       0.99      0.92      0.96       584
         kk       0.97      0.75      0.85       560
         ko       1.00      0.30      0.46       617
         la       0.90      0.42      0.57       628
      lorem       0.98      1.00      0.99      1683
         lt       0.99      0.81      0.89       785
         ms       0.86      0.18      0.30       619
         nl       0.93      0.81      0.86      1233
         nn       0.86      0.51      0.64       697
         no       0.74      0.50      0.60      1192
         pl       0.85      0.80      0.82      1027
         pt       0.81      0.61      0.70      1356
         ro       0.85      0.78      0.81      1507
         ru       0.73      0.89      0.80      2023
         sh       0.47      0.82      0.60      1326
         sk       0.99      0.47      0.64       540
         sl       0.94      0.36      0.52       785
         sr       0.96      0.78      0.86      1014
         sv       0.87      0.70      0.78      1300
         th       1.00      0.33      0.49       789
         tr       0.91      0.80      0.85      1255
         uk       0.90      0.85      0.87      1715
         ur       1.00      0.79      0.88       457
         uz       0.98      0.74      0.84       445
         vi       0.88      0.87      0.87      1631
         vo       0.97      0.29      0.45       206
        war       0.85      0.50      0.63       274
         zh       1.00      0.04      0.09       845

avg / total       0.83      0.76      0.75     62708

