1. Import modules

In [1]:
import pandas as pd
import numpy as np
import os.path
import math
import random
import statistics
import datetime
import collections
import os
import nltk
from matplotlib import pyplot as plt

from sklearn import cluster, linear_model, svm, naive_bayes, neighbors, ensemble
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.externals import joblib
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid

2. Load trainingdata

In [8]:
df = pd.read_csv("training_konle.tsv", sep="\t", encoding="utf-8", names=["text", "speech"])

3. Prepare trainingdata

In [9]:
X_raw = df.text.values.astype('U')
Y_all = np.array(df.speech)
count_vect = CountVectorizer(min_df=1, analyzer="word", token_pattern="\w+|[.,-;:*+!?]", encoding="utf-8")
X_all = count_vect.fit_transform(X_raw)
joblib.dump(count_vect, 'count_vect.pkl')

['count_vect.pkl']

4. Sample Training data (func)

In [10]:
def get_sample_data(n):
    indx = random.sample(range(len(Y_all)), n)
    X_ = X_all[indx]
    Y_ = Y_all[indx]
    return X_, Y_

5. Call sample function

In [11]:
X, Y = get_sample_data(3000)
print(np.shape(X))

(3000, 7741)


6. Multinominal Naive Baiyes 

In [18]:
clf = naive_bayes.MultinomialNB()

#parameter grid
alpha = [0.01, .1, 1, 1.5, 1.75, 2, 2.5, 3, 4, 5, 6]
fit_prior = [True, False]

clf = GridSearchCV(estimator=clf, param_grid=dict(alpha=alpha, fit_prior=fit_prior), n_jobs=-1)
clf.fit(X_all, Y_all)
print("Best score: {}".format(clf.best_score_))
print("Best value for alpha: {}".format(clf.best_estimator_.alpha))
print("Best value for fit_prior: {}".format(clf.best_estimator_.fit_prior))

Best score: 0.8366489935719983
Best value for alpha: 1.75
Best value for fit_prior: True


7. Save the classifier

In [27]:
clf = naive_bayes.MultinomialNB(alpha=0.01, fit_prior=True)
scores = cross_val_score(clf, X_all, Y_all, cv=3)
clf.fit(X_all, Y_all)
joblib.dump(clf, 'ds_konle_clf.pkl')
print("Acc (mean): {} (std): {}".format(scores.mean(), scores.std()))

Acc (mean): 0.8330359256159748 (std): 0.004211117518812063
