In [78]:
import json
import numpy as np

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier

from sklearn import metrics

In [79]:
file = '../data_resources/topics/vng_training_cleaned_no_news.json'
data_a = json.load(open(file))

additional = '../data_resources/topics/vng_additional_data.json'
data_b = json.load(open(additional))

data = []

for obj in data_a:
    data.append(obj)
for obj in data_b:
    if len(obj['content']) > 1:    
        data.append(obj)

In [80]:
transformer = TfidfVectorizer(smooth_idf=True, max_df=0.3)
# transformer = CountVectorizer() # - descreases performance

corpus = []
y = []

for obj in data:
    content = obj['content']
    corpus.append(content)
    
    y.append(obj['sub_topic'])

In [81]:
X_train_corpus, X_test_corpus, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=42)

X_train = transformer.fit_transform(X_train_corpus)
X_test = transformer.transform(X_test_corpus)

print(np.shape(X_train))

(1908, 22308)


In [116]:
# clf = SVC(kernel="linear", C=3.75, probability=True) # 0.703

clf = SGDClassifier(loss='log', penalty='l1', alpha=1e-6, random_state=42, max_iter=50, tol=None)
%time clf = OneVsRestClassifier(clf).fit(X_train, y_train)

%time y_pred = clf.predict(X_test)

score = accuracy_score(y_test, y_pred)

print(score)

CPU times: user 14.1 s, sys: 0 ns, total: 14.1 s
Wall time: 14.1 s
CPU times: user 40.7 ms, sys: 0 ns, total: 40.7 ms
Wall time: 40.5 ms
0.6677140612725845


In [77]:
print(metrics.classification_report(y_test, y_pred))

                                                     precision    recall  f1-score   support

                                        aanbesteden       0.75      0.75      0.75         4
                              aanpak-radicalisering       0.50      1.00      0.67         1
                 aansluiting-onderwijs-arbeidsmarkt       0.86      0.86      0.86         7
       accountantscontrole-begroting-verantwoording       0.00      0.00      0.00         3
                                              afval       0.62      0.83      0.71         6
                                                apv       0.00      0.00      0.00         0
                                 arbeidsmarktbeleid       0.00      0.00      0.00         2
                              arbeidsomstandigheden       0.50      0.33      0.40         3
                                 arbeidsvoorwaarden       0.70      0.44      0.54        16
                                          archieven       0.85      0

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [109]:
"""
Parameters for tuning:
- tansformer max_df/min_df
- svm C value
- svm kernel
"""

clf1 = SVC(kernel="linear", C=3.75, probability=True) # 0.682
%time clf1 = OneVsRestClassifier(clf1).fit(X_train, y_train)

clf2 = SGDClassifier(loss='log', penalty='l1', alpha=1e-6, random_state=42, max_iter=100, tol=None)

# clf2 = RandomForestClassifier(max_depth=20,n_estimators=30,max_features=5000,n_jobs=-1) # 0.61
%time clf2 = OneVsRestClassifier(clf2).fit(X_train, y_train)

CPU times: user 4min 37s, sys: 80 ms, total: 4min 37s
Wall time: 4min 37s
CPU times: user 27.1 s, sys: 12 ms, total: 27.1 s
Wall time: 27.1 s


In [110]:
clf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)], voting='soft')
%time clf.fit(X_train, y_train)
%time y_pred = clf.predict(X_test)

score = accuracy_score(y_test, y_pred)

print(score)

CPU times: user 4min 59s, sys: 108 ms, total: 5min
Wall time: 5min
CPU times: user 30 s, sys: 3.99 ms, total: 30 s
Wall time: 30 s
0.6818538884524745


  if diff:


In [114]:
clf2.predict(X_test)
score = accuracy_score(y_test, y_pred)

print(score)

0.6818538884524745


In [116]:

# Classifiers experimented with and score achieved:
classifiers = {
    'svm1': SVC(kernel="linear", C=3.75), # 0.682
    'randomf': RandomForestClassifier(max_depth=20,n_estimators=30,max_features='auto',n_jobs=-1), # 0.61
}

In [None]:
for clf in classifiers:
    print(clf)
    classifier = classifiers[clf]

    %time clf = OneVsRestClassifier(classifier).fit(X_train, y_train)
    %time y_pred = clf.predict(X_test)

    score = accuracy_score(y_test, y_pred)
   
    print(score)

In [None]:
%time clf = OneVsRestClassifier(classifiers['randomf']).fit(X_train, y_train)

y_pred = clf.predict(X_test)

score = accuracy_score(y_test, y_pred)
print(score)