In [19]:
# ovde treba da izvrsimo klasifikaciju na Ebart skupu podataka, koristicemo naivni bajes

import os

import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Build the raw material for the document-term matrix from the training set.
# Layout read from disk: one sub-directory per class under root_path, and inside
# it one file per document holding a "<word> <count>" pair on each line.

root_path = '../Ebart/Trening'

# one {word: count} dict per document
corpus = []
# class label of each document (the name of its directory), parallel to corpus
classes = []

# os.listdir returns entries in arbitrary order; sorting makes the document
# order identical on every machine and every run
for class_name in sorted(os.listdir(root_path)):
    class_dir = os.path.join(root_path, class_name)
    if not os.path.isdir(class_dir):
        # a stray file next to the class directories is not a class
        continue
    for file_name in sorted(os.listdir(class_dir)):
        word_counts = {}
        with open(os.path.join(class_dir, file_name)) as f:
            for line in f:
                # split() with no argument tolerates repeated spaces and the
                # trailing newline, and yields [] for a blank line
                fields = line.split()
                if not fields:
                    continue
                word, count = fields
                word_counts[word] = int(count)  # split returns strings, so cast
        corpus.append(word_counts)
        classes.append(class_name)

In [3]:
# corpus and classes are filled in lockstep, so the two numbers should match
print(len(corpus), len(classes), sep='\n')

3492
3492


In [4]:
# The DictVectorizer class can build the term matrix for us from a list of dicts
# mapping each word to its number of occurrences, which is exactly what `corpus` is
dv = DictVectorizer()

In [5]:
# Learn the vocabulary from corpus and turn it into a matrix: one row per
# document, one column per word.
X = dv.fit_transform(corpus)
# The result is a sparse matrix; .toarray() converts it to a dense array so it
# can be displayed nicely.
# NOTE(review): this densifies the whole matrix only to print an abbreviated
# view - printing X.shape may be a cheaper sanity check; confirm with the author.
print(X.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 2. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [6]:
# An even nicer view: put the matrix into a DataFrame.
# feature_names_ gives all the words that appear in the matrix; they are used
# here as the column labels.
features = dv.feature_names_

# The DataFrame is built from the dense form of X; head() shows the first rows.
df = pd.DataFrame(X.toarray(), columns=features)
df.head()

Unnamed: 0,ab,abasu,abati,abc,abdul,abdulah,abe,aberdin,abhaziji,abida,...,zxurno,zxustel,zxustrine,zxustro,zxuticx,zxutih,zxutilovine,zxuto,zxutra,zxuzxa
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Hold out 30% of the documents for testing. stratify=classes keeps the class
# proportions the same in both parts. random_state pins the shuffle so that,
# for the same input order, the split is identical on every run and the
# scores reported below can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    df, classes, test_size=0.3, stratify=classes, random_state=42
)

In [8]:
# (rows, columns) of the training part, then of the test part
print(X_train.shape, X_test.shape, sep='\n')

(2444, 36830)
(1048, 36830)


In [9]:
# Multinomial Naive Bayes classifier, created with the library's default parameters
model = MultinomialNB()

In [12]:
# Train on the training part of the split; as the last expression of the cell,
# the fitted estimator is echoed as the cell output
model.fit(X_train, y_train)

MultinomialNB()

In [21]:
# Predict the held-out documents, then print accuracy, the confusion matrix
# and the per-class report, one after another
y_predicted = model.predict(X_test)
print(
    accuracy_score(y_test, y_predicted),
    confusion_matrix(y_test, y_predicted),
    classification_report(y_test, y_predicted),
    sep='\n',
)

0.8883587786259542
[[ 83   2   2  13   0]
 [  5 136   1  43   1]
 [  1   3 175   6   3]
 [  3  19   2 255   2]
 [  0   2   3   6 282]]
                 precision    recall  f1-score   support

      Ekonomija       0.90      0.83      0.86       100
HronikaKriminal       0.84      0.73      0.78       186
  KulturaZabava       0.96      0.93      0.94       188
       Politika       0.79      0.91      0.84       281
          Sport       0.98      0.96      0.97       293

       accuracy                           0.89      1048
      macro avg       0.89      0.87      0.88      1048
   weighted avg       0.89      0.89      0.89      1048



In [22]:
# For comparison, try a different classifier on the same split: a decision tree
from sklearn.tree import DecisionTreeClassifier

# Training the decision tree was observed to take noticeably longer than
# training Naive Bayes.
# random_state is fixed because the tree considers features in a random order
# and ties between equally good splits are therefore broken at random; without
# it, repeated runs on the same data can produce different trees and scores.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [24]:
# Score the decision tree on the held-out documents: accuracy, confusion
# matrix and per-class report, printed in that order
y_predicted = model.predict(X_test)
print(
    accuracy_score(y_test, y_predicted),
    confusion_matrix(y_test, y_predicted),
    classification_report(y_test, y_predicted),
    sep='\n',
)

0.7251908396946565
[[ 50  11   3  33   3]
 [  9  99   4  59  15]
 [  7   7 137  15  22]
 [ 17  34   7 211  12]
 [  6   8   5  11 263]]
                 precision    recall  f1-score   support

      Ekonomija       0.56      0.50      0.53       100
HronikaKriminal       0.62      0.53      0.57       186
  KulturaZabava       0.88      0.73      0.80       188
       Politika       0.64      0.75      0.69       281
          Sport       0.83      0.90      0.87       293

       accuracy                           0.73      1048
      macro avg       0.71      0.68      0.69      1048
   weighted avg       0.73      0.73      0.72      1048



In [26]:
# Also try k-nearest neighbours (KNN)
from sklearn.neighbors import KNeighborsClassifier

# fit is relatively fast here, while predict will be slower
# (KNN does not really build a model: all the training data is used when deciding)
model = KNeighborsClassifier()
model.fit(X_train, y_train)

KNeighborsClassifier()

In [27]:
# Score KNN on the held-out documents: accuracy, confusion matrix and
# per-class report, printed in that order
y_predicted = model.predict(X_test)
print(
    accuracy_score(y_test, y_predicted),
    confusion_matrix(y_test, y_predicted),
    classification_report(y_test, y_predicted),
    sep='\n',
)

0.5896946564885496
[[ 28   1  11  50  10]
 [  2  26  11 100  47]
 [  1   2  72  46  67]
 [  1   8   4 241  27]
 [  0   0   4  38 251]]
                 precision    recall  f1-score   support

      Ekonomija       0.88      0.28      0.42       100
HronikaKriminal       0.70      0.14      0.23       186
  KulturaZabava       0.71      0.38      0.50       188
       Politika       0.51      0.86      0.64       281
          Sport       0.62      0.86      0.72       293

       accuracy                           0.59      1048
      macro avg       0.68      0.50      0.50      1048
   weighted avg       0.65      0.59      0.54      1048

