In [1]:
# 14th

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

In [2]:
# clf prob is to clf text data into topics(14 classes)

dbpedia_df = pd.read_csv("./datasets/dbpedia_csv/train.csv", skiprows = 1, names = ["Label", "Name", "Text"])

dbpedia_df.sample(6)

Unnamed: 0,Label,Name,Text
418320,11,Teliostachya alopecuroidea,Pata de gallina (syn. Lepidagathis alopecuroi...
491240,13,Ju Dou,Ju Dou (Chinese: 菊豆; pinyin: Jú Dòu) is a 199...
496062,13,Front of the Class (film),Front of the Class is a 2008 American drama f...
257851,7,Fort Tombecbe,Fort Tombecbe (Fort de Tombecbé) also spelled...
31980,1,Arriva Southern Counties,Arriva Southern Counties is a bus operator in...
428174,11,Saxifraga rivularis,Saxifraga rivularis is a species of saxifrage...


In [3]:
dbpedia_df.shape

(559999, 3)

In [6]:
# dbpedia_df["Label"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
      dtype=int64)

In [4]:
# sample just 10,000 rows of 10, close to 600k is to big to work with at once
dbpedia_df = dbpedia_df.sample(10000, replace = False)

In [5]:
dbpedia_df.shape

(10000, 3)

In [6]:
X = dbpedia_df["Text"]
Y = dbpedia_df["Label"]

In [7]:
X.head()

494271     Voices Within: The Lives of Truddi Chase is a...
415867     Diplazium sibiricum is a species of fern. It ...
319537     The Pipp Brook is a tributary of the River Mo...
508432     The Lost String (also known as La Corde Perdu...
362815     The White-winged Warbler (Xenoligea montana) ...
Name: Text, dtype: object

In [8]:
# helper to check model performance
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred, normalize = True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average = "weighted")
    recall = recall_score(y_test, y_pred, average = "weighted")
    
    print("Len test data: ", len(y_test))
    print("accuracy count: ", num_acc)
    print("accuracy_score: ", acc)
    print("precision_score: ", prec)
    print("recall_score: ", recall)

In [9]:
count_vectorizer = CountVectorizer()

feature_vector = count_vectorizer.fit_transform(X)

feature_vector.shape

(10000, 48176)

In [10]:
# conv sparse matrix of festure vect to dense vector needed by naiveBaiyers clf
X_dense = feature_vector.todense()

X_dense

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2)

In [12]:
clf = GaussianNB().fit(x_train, y_train)

In [13]:
y_pred = clf.predict(x_test)

y_pred

array([10, 12, 10, ..., 14, 10,  1], dtype=int64)

In [14]:
summarize_classification(y_test, y_pred)

Len test data:  2000
accuracy count:  1480
accuracy_score:  0.74
precision_score:  0.7463945891895961
recall_score:  0.74


In [36]:
# compare side by side pred vs actual

y_test = np.array(y_test)

pred_results = pd.DataFrame({"y_pred": y_pred,"y_test": y_test})

pred_results.sample(10)

Unnamed: 0,y_pred,y_test
1905,6,6
1157,10,10
1120,2,2
1892,14,10
1946,13,13
762,13,3
1528,8,8
1794,1,1
604,6,6
704,9,9
