In [17]:
# 13th

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import GaussianNB

In [18]:
# clf prob is to clf text data into topics(14 classes)

dbpedia_df = pd.read_csv("./datasets/dbpedia_csv/train.csv", skiprows = 1, names = ["Label", "Name", "Text"])

dbpedia_df.sample(6)

Unnamed: 0,Label,Name,Text
428815,11,Uapaca,Uapaca is a genus of plant in the family Phyl...
268009,7,Belvoir (plantation),Belvoir was the historic plantation and estat...
7227,1,Bristol Technology Inc.,Bristol Technology Inc. was a software develo...
19580,1,Jobandtalent,jobandtalent is a recruiting platform based o...
57860,2,Osaka Junior College of Music,Osaka Junior College of Music (大阪音楽大学短期大学部 Ōs...
165683,5,Jagannath Pahadia,Jagannath Pahadia (born January 15 1932) is a...


In [19]:
dbpedia_df.shape

(559999, 3)

In [6]:
# dbpedia_df["Label"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
      dtype=int64)

In [21]:
# sample just 10,000 rows of 10, close to 600k is to big to work with at once
dbpedia_df = dbpedia_df.sample(10000, replace = False)

In [22]:
dbpedia_df.shape

(10000, 3)

In [23]:
X = dbpedia_df["Text"]
Y = dbpedia_df["Label"]

In [24]:
X.head()

217657     USS Sylph (PY-5) was a steam yacht that serve...
423218     Teyleria is a genus of flowering plants in th...
556612     The Barbecue Bible by Steven Raichlen (1998 W...
543476     Nature is a prominent interdisciplinary scien...
480633     Aaj Ki Awaaz (Hindi: आज की आवाज़ Urdu: آج کی ...
Name: Text, dtype: object

In [26]:
# helper to check model performance
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred, normalize = True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average = "weighted")
    recall = recall_score(y_test, y_pred, average = "weighted")
    
    print("Len test data: ", len(y_test))
    print("accuracy count: ", num_acc)
    print("accuracy_score: ", acc)
    print("precision_score: ", prec)
    print("recall_score: ", recall)

In [29]:
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")
analyzer = HashingVectorizer().build_analyzer()  # analyzer to get word tokens from sen before stemming

def stemmed_words(doc):
    return (stemmer.stem(word) for  word in analyzer(doc))

In [30]:
vectorizer = HashingVectorizer(n_features = 2**10, norm="l2", analyzer=stemmed_words)

feature_vector = vectorizer.transform(X)

feature_vector.shape

(10000, 1024)

In [31]:
# conv sparse matrix of festure vect to dense vector needed by naiveBaiyers clf
X_dense = feature_vector.todense()

X_dense

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.18257419, 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [32]:
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2)

In [33]:
clf = GaussianNB().fit(x_train, y_train)

In [34]:
y_pred = clf.predict(x_test)

y_pred

array([11, 11,  3, ...,  7,  4, 13], dtype=int64)

In [35]:
summarize_classification(y_test, y_pred)

Len test data:  2000
accuracy count:  1090
accuracy_score:  0.545
precision_score:  0.5624306384721938
recall_score:  0.545


In [36]:
# compare side by side pred vs actual

y_test = np.array(y_test)

pred_results = pd.DataFrame({"y_pred": y_pred,"y_test": y_test})

pred_results.sample(10)

Unnamed: 0,y_pred,y_test
1905,6,6
1157,10,10
1120,2,2
1892,14,10
1946,13,13
762,13,3
1528,8,8
1794,1,1
604,6,6
704,9,9
