In [11]:
# 12th

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import GaussianNB

In [3]:
# clf prob is to clf text data into topics(14 classes)

dbpedia_df = pd.read_csv("./datasets/dbpedia_csv/train.csv", skiprows = 1, names = ["Label", "Name", "Text"])

dbpedia_df.sample(6)

Unnamed: 0,Label,Name,Text
368645,10,Setia maculata,Setia maculata is a species of minute sea sna...
432096,11,Chaenactis macrantha,Chaenactis macrantha is a species of flowerin...
69414,2,Rose Mary School,Rose Mary Matriculation Higher Secondary Scho...
492631,13,Komtessen,Komtessen is a 1961 Danish family film direct...
95587,3,Susanne Abbuehl,Susanne Abbuehl (born July 30 1970 in Bern ca...
55745,2,Ash Green School,Ash Green School is an academy school situate...


In [4]:
dbpedia_df.shape

(559999, 3)

In [6]:
# dbpedia_df["Label"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
      dtype=int64)

In [5]:
# sample just 10,000 rows of 10, close to 600k is to big to work with at once
dbpedia_df = dbpedia_df.sample(10000, replace = False)

In [6]:
dbpedia_df.shape

(10000, 3)

In [8]:
X = dbpedia_df["Text"]
Y = dbpedia_df["Label"]

In [9]:
X.head()

422092     Coriaria pottsiana commonly called the Hikura...
394633     Moirainpa amazona is a species of beetle in t...
351053     Burfjord (Kven: Puruvuono; Northern Sami: Buv...
282264     Lake Trahlyta is named for Princess Trahlyta ...
104722     Matthew Savoca (born June 16 1982) is an Amer...
Name: Text, dtype: object

In [30]:
# helper to check model performance
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred, normalize = True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average = "weighted")
    recall = recall_score(y_test, y_pred, average = "weighted")
    
    print("Len test data: ", len(y_test))
    print("accuracy count: ", num_acc)
    print("accuracy_score: ", acc)
    print("precision_score: ", prec)
    print("recall_score: ", recall)

In [12]:
vectorizer = HashingVectorizer(n_features = 2**10, norm="l2")

feature_vector = vectorizer.transform(X)

feature_vector.shape

(10000, 1024)

In [13]:
# conv sparse matrix of festure vect to dense vector needed by naiveBaiyers clf
X_dense = feature_vector.todense()

X_dense

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.21952852, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2)

In [17]:
clf = GaussianNB().fit(x_train, y_train)

In [18]:
y_pred = clf.predict(x_test)

y_pred

array([14, 10,  6, ...,  7,  1, 13], dtype=int64)

In [31]:
summarize_classification(y_test, y_pred)

Len test data:  2000
accuracy count:  1124
accuracy_score:  0.562
precision_score:  0.5742284798144919
recall_score:  0.562


In [32]:
# compare side by side pred vs actual

y_test = np.array(y_test)

pred_results = pd.DataFrame({"y_pred": y_pred,"y_test": y_test})

pred_results.sample(10)

Unnamed: 0,y_pred,y_test
1205,13,13
320,12,12
1681,13,13
407,13,13
1579,5,5
1675,13,14
212,11,11
1243,13,13
1212,1,1
1809,12,12
