In [5]:
import json
import numpy as np

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [6]:
# Load the primary training records and the supplementary records, then merge
# them into a single list `data`.
# Each file is opened in a `with` block so the handle is closed as soon as the
# JSON has been parsed (the previous `json.load(open(...))` never closed it).
file = '../data_resources/topics/vng_training_cleaned_no_news.json'
with open(file) as fh:
    data_a = json.load(fh)

additional = '../data_resources/topics/vng_additional_data.json'
with open(additional) as fh:
    data_b = json.load(fh)

# Every primary record is kept as-is.
data = list(data_a)
# Supplementary records are kept only when their 'content' has length > 1,
# which drops entries whose content is empty or a single element.
data.extend(obj for obj in data_b if len(obj['content']) > 1)

In [7]:
# TF-IDF vectorizer; max_df=0.3 makes it ignore terms that occur in more than
# 30% of the documents.
transformer = TfidfVectorizer(max_df=0.3, smooth_idf=True)
# Alternative tried: transformer = CountVectorizer()  -- decreases performance

# Document texts and their sub-topic labels, kept index-aligned
# (corpus[i] is labelled y[i]).
corpus = [record['content'] for record in data]
y = [record['sub_topic'] for record in data]

In [8]:
# Hold out 20% of the documents for evaluation; random_state=42 fixes the split.
docs_train, docs_test, y_train, y_test = train_test_split(
    corpus, y, test_size=0.2, random_state=42
)

# Fit the vectorizer on the training documents only, then reuse the fitted
# vectorizer to transform the held-out documents.
X_train = transformer.fit_transform(docs_train)
X_test = transformer.transform(docs_test)

In [10]:
clf = SVC(kernel="linear", C=3.75, probability=True) # 0.682
%time clf = OneVsRestClassifier(clf).fit(X_train, y_train)

%time y_pred = clf.predict(X_test)

score = accuracy_score(y_test, y_pred)

print(score)

CPU times: user 6min 40s, sys: 27.5 ms, total: 6min 40s
Wall time: 6min 40s
CPU times: user 17.2 s, sys: 8 ms, total: 17.2 s
Wall time: 17.3 s
0.7032967032967034


In [None]:
"""
Parameters for tuning:
- transformer max_df/min_df
- svm C value
- svm kernel
"""

clf1 = SVC(kernel="linear", C=3.75, probability=True) # 0.682
%time clf1 = OneVsRestClassifier(clf1).fit(X_train, y_train)


clf2 = RandomForestClassifier(max_depth=20,n_estimators=30,max_features=5000,n_jobs=-1) # 0.61
%time clf2 = OneVsRestClassifier(clf2).fit(X_train, y_train)

In [None]:
clf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)], voting='soft')
%time clf.fit(X_train, y_train)
%time y_pred = clf.predict(X_test)

score = accuracy_score(y_test, y_pred)

print(score)

In [116]:

# Classifiers experimented with and score achieved (trailing comment on each entry).
# Keys are the short names printed by the evaluation loop.
classifiers = {
    'svm1': SVC(kernel="linear", C=3.75), # 0.682
    # max_features='sqrt' is what 'auto' resolved to for classifiers; 'auto' was
    # deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
    'randomf': RandomForestClassifier(max_depth=20,n_estimators=30,max_features='sqrt',n_jobs=-1), # 0.61
}

In [None]:
for clf in classifiers:
    print(clf)
    classifier = classifiers[clf]

    %time clf = OneVsRestClassifier(classifier).fit(X_train, y_train)
    %time y_pred = clf.predict(X_test)

    score = accuracy_score(y_test, y_pred)
   
    print(score)

In [None]:
%time clf = OneVsRestClassifier(classifiers['randomf']).fit(X_train, y_train)

y_pred = clf.predict(X_test)

score = accuracy_score(y_test, y_pred)
print(score)