In [1]:
# 16th

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import GaussianNB
from nltk.probability import FreqDist

In [2]:
# clf prob is to clf text data into topics(14 classes)

dbpedia_df = pd.read_csv("./datasets/dbpedia_csv/train.csv", skiprows = 1, names = ["Label", "Name", "Text"])

dbpedia_df.sample(6)

Unnamed: 0,Label,Name,Text
77496,2,Southwest Middle School (Gastonia North Carolina),Southwest Middle School is a public middle sc...
212248,6,Lekiu-class frigate,The Lekiu-class frigates are presently the mo...
63001,2,North Caroline High School,North Caroline High School is located in Ridg...
446914,12,Viva Terlingua,¡Viva Terlingua! is a live progressive countr...
308333,8,Dobreanu River (Neamț),The Dobreanu River is a tributary of the Neam...
398749,10,Orthotylus stratensis,Orthotylus stratensis is a species of bug fro...


In [3]:
dbpedia_df.shape

(559999, 3)

In [6]:
# dbpedia_df["Label"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
      dtype=int64)

In [4]:
# sample just 10,000 rows of 10, close to 600k is to big to work with at once
dbpedia_df = dbpedia_df.sample(10000, replace = False)

In [5]:
dbpedia_df.shape

(10000, 3)

In [6]:
X = dbpedia_df["Text"]
Y = dbpedia_df["Label"]

In [7]:
X.head()

184389     Brian S. Loughmiller is the current mayor of ...
103027     Giorgio van Straten (born 1955) is an Italian...
295629          The Feilebach is a river of Saxony Germany.
300265     The Willberg River (also spelt Wilberg River)...
59814      The Britton-Macon Area School District often ...
Name: Text, dtype: object

In [8]:
# helper to check model performance
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred, normalize = True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average = "weighted")
    recall = recall_score(y_test, y_pred, average = "weighted")
    
    print("Len test data: ", len(y_test))
    print("accuracy count: ", num_acc)
    print("accuracy_score: ", acc)
    print("precision_score: ", prec)
    print("recall_score: ", recall)

In [9]:
# tokenize words
from nltk.tokenize import word_tokenize

tokens = word_tokenize("\n".join(X.values))

len(tokens)

507174

In [10]:
freq = FreqDist(tokens)

freq

FreqDist({'the': 23663, '.': 23531, 'in': 16312, 'of': 15663, 'is': 13555, 'a': 13181, 'and': 12908, '(': 7122, ')': 7104, 'was': 6930, ...})

In [11]:
# freq filtering that removes words with freq > 100
frequent_words = []

for key, value in freq.items():
    if value >= 100:
        frequent_words.append(key.lower())
        
len(frequent_words)

503

In [12]:
frequent_words[:25]

['is',
 'the',
 'current',
 'of',
 'texas',
 'as',
 'well',
 'a',
 'was',
 'elected',
 'in',
 '2009',
 'after',
 'two',
 'member',
 'district',
 'currently',
 '.',
 'based',
 'family',
 '(',
 'born',
 ')',
 'an',
 'italian']

In [13]:
from nltk.corpus import stopwords
from sklearn.feature_extraction import text

stop_words = text.ENGLISH_STOP_WORDS.union(frequent_words)  # adds our most freq words based on thresh to stopwords list

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(stop_words = stop_words)

feature_vector = count_vectorizer.fit_transform(X)

feature_vector.shape



(10000, 47024)

In [15]:
X_dense = feature_vector.todense()

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2)

In [20]:
clf = GaussianNB().fit(x_train, y_train)

y_pred = clf.predict(x_test)

y_pred

array([11,  4,  6, ...,  7,  6, 14], dtype=int64)

In [23]:
summarize_classification(y_test, y_pred)

Len test data:  2000
accuracy count:  1385
accuracy_score:  0.6925
precision_score:  0.6926660343756493
recall_score:  0.6925
