In [18]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from pathlib import Path

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import spacy
import string


In [4]:
dataset_path = Path.cwd() / Path('dataset/News_Category_Dataset_v2.json')
df = pd.read_json(dataset_path, lines = True)

reduced_df = df.drop(df[df['headline'].str.len() == 0].index, axis=0)

reduced_df = reduced_df[['headline', 'category']]

cleaned_df = pd.DataFrame(reduced_df)

def clean_headline(original_headline):
    # remove special characters
    cleaned_headline = re.sub(r'\W', ' ', original_headline)
    cleaned_headline = re.sub(r'\s+', ' ', original_headline)

    cleaned_headline = cleaned_headline.lower()
    return cleaned_headline

cleaned_df['headline'] = reduced_df['headline'].apply(clean_headline)

cleaned_df.head()

Unnamed: 0,headline,category
0,there were 2 mass shootings in texas last week...,CRIME
1,will smith joins diplo and nicky jam for the 2...,ENTERTAINMENT
2,hugh grant marries for the first time at age 57,ENTERTAINMENT
3,jim carrey blasts 'castrato' adam schiff and d...,ENTERTAINMENT
4,julianna margulies uses donald trump poop bags...,ENTERTAINMENT


In [5]:
# pull the data into vectors
encoder = LabelEncoder()

x = cleaned_df['headline']
y = encoder.fit_transform(cleaned_df['category'])

In [8]:
sample = cleaned_df


from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    np.array(x), 
    np.array(y), 
    test_size=0.05, # seperate 5 % test
    random_state=42
)

x_train, x_val, y_train, y_val = train_test_split(
    np.array(x_train), 
    np.array(y_train), 
    test_size=3/19, # this evens out to 80% train 15% validation
    random_state=42
)

print('train:', len(x_train))
print('val:', len(x_val))
print('test:', len(x_test))

train: 160677
val: 30127
test: 10043


In [12]:
vectorizer = Pipeline([
    ('count', CountVectorizer(min_df=5, binary=False, ngram_range=(1,5), stop_words='english')),
    ('tfid', TfidfTransformer())]).fit(x_train)


x_train_vec = vectorizer.transform(x_train)
x_val_vec = vectorizer.transform(x_val)

In [28]:
nb = MultinomialNB(alpha=0.1)
nb.fit(x_train_vec, y_train)

predict = nb.predict(x_val_vec)

print('validation accuracy:', accuracy_score(y_val, predict))

validation accuracy: 0.5481793739834699


In [29]:
x_test_vec = vectorizer.transform(x_test)

predict = nb.predict(x_test_vec)

print('test accuracy:', accuracy_score(y_test, predict))

test accuracy: 0.5579010255899631
