In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from pathlib import Path

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

In [2]:
# Cleaned CSV file for each corpus, keyed by the short name used in the rest of the notebook.
dataset_files = (
    ('uci', 'uci-news-aggregator.csv'),
    ('news_v2', 'News_Category_Dataset_v2.csv'),
)

# Load every corpus into a DataFrame; insertion order (uci, then news_v2) is the processing order.
datasets = {}
for dataset_name, csv_name in dataset_files:
    # Files live under dataset/cleaned/ relative to the current working directory.
    dataset_path = Path.cwd() / 'dataset' / 'cleaned' / csv_name
    datasets[dataset_name] = pd.read_csv(dataset_path)

In [3]:
# Separate a dataset into its text inputs and integer class targets.
def encode_labels(dataset):
    """Return the headline column and the integer-encoded category column.

    A fresh ``LabelEncoder`` is fitted on ``dataset['category']`` on every
    call. The fitted encoder is not returned, so the integer ids cannot be
    mapped back to category names by the caller.

    Parameters
    ----------
    dataset : mapping-like with ``'headline'`` and ``'category'`` entries
        In this notebook, a DataFrame loaded from one of the cleaned CSVs.

    Returns
    -------
    tuple
        ``(headlines, category_ids)``: the ``'headline'`` column as stored
        in ``dataset`` and the encoded ``'category'`` values.
    """
    headlines = dataset['headline']
    category_ids = LabelEncoder().fit_transform(dataset['category'])
    return headlines, category_ids

In [4]:
def train_val_test_split(features, labels, test_size=0.05, val_size=3/19, random_state=42):
    """Split features and labels into train, validation and test subsets.

    The split is done in two passes of ``train_test_split``: the test subset
    is carved off the full data first, then the validation subset is carved
    off whatever remains. Both passes use the same ``random_state``.

    Parameters
    ----------
    features, labels : array-like
        Inputs and targets; both are converted with ``np.array`` before
        splitting.
    test_size : float or int, default 0.05
        Passed as ``test_size`` to the first split (share of the FULL data
        held out for testing).
    val_size : float or int, default 3/19
        Passed as ``test_size`` to the second split, i.e. it is relative to
        the data REMAINING after the test subset is removed. With the
        defaults, 3/19 of the remaining 95% is 15% of the full data, which
        leaves 80% for training.
    random_state : int, default 42
        Seed used for both shuffles.

    Returns
    -------
    tuple
        ``(x_train, x_test, x_val, y_val, y_train, y_test)``.
        NOTE: this order is unusual (x_test precedes x_val, and y_val
        precedes y_train). It is kept because existing callers unpack it
        positionally.

    Side effects
    ------------
    Prints the size of each subset.
    """
    # Pass 1: hold out the test subset from the full data.
    x_rest, x_test, y_rest, y_test = train_test_split(
        np.array(features),
        np.array(labels),
        test_size=test_size,
        random_state=random_state,
    )

    # Pass 2: split the remainder into train and validation. The outputs of
    # pass 1 are already arrays, so they are passed through without re-wrapping.
    x_train, x_val, y_train, y_val = train_test_split(
        x_rest,
        y_rest,
        test_size=val_size,
        random_state=random_state,
    )

    print('  train:', len(x_train))
    print('  val:', len(x_val))
    print('  test:', len(x_test))

    # Order intentionally matches the original contract; see the docstring.
    return x_train, x_test, x_val, y_val, y_train, y_test

In [5]:
def fit_vectorizer(training_data):
    """Build the text-vectorization pipeline and fit it on the training texts.

    The pipeline has two steps: a ``CountVectorizer`` named ``'count'``
    (n-grams of length 1 to 5, English stop-word list, ``min_df=5``,
    non-binary counts) followed by a ``TfidfTransformer`` named ``'tfid'``.

    Parameters
    ----------
    training_data : iterable of str
        Documents used to fit the pipeline.

    Returns
    -------
    Pipeline
        The fitted pipeline.
    """
    counter = CountVectorizer(
        stop_words='english',
        ngram_range=(1, 5),
        min_df=5,
        binary=False,
    )
    # Step names are part of the pipeline's public surface; keep them as-is.
    steps = [('count', counter), ('tfid', TfidfTransformer())]
    vectorizer = Pipeline(steps)
    vectorizer.fit(training_data)
    return vectorizer

def transform_input(vectorizer, x_train, x_val, x_test):
    """Vectorize the train, validation and test inputs with one vectorizer.

    Only ``vectorizer.transform`` is called here, once per split and in the
    order train, validation, test.

    Returns
    -------
    tuple
        ``(x_train_vec, x_val_vec, x_test_vec)``, in the same order as the
        inputs.
    """
    splits = (x_train, x_val, x_test)
    return tuple(vectorizer.transform(split) for split in splits)

In [6]:
def train_model(x_train_vec, y_train):
    """Fit a multinomial Naive Bayes classifier on the vectorized training data.

    The classifier is created with ``alpha=0.1``.

    Parameters
    ----------
    x_train_vec : training feature matrix
    y_train : training labels

    Returns
    -------
    MultinomialNB
        The fitted classifier.
    """
    # ``fit`` returns the estimator itself, so the fitted model can be returned directly.
    return MultinomialNB(alpha=0.1).fit(x_train_vec, y_train)

In [7]:
# Run the split -> vectorize -> train -> evaluate workflow once per loaded dataset.
for dataset_name in datasets:
    dataset = datasets[dataset_name]

    # Stage 1: encode the labels and carve out the three subsets.
    print('Train Val Test Split', dataset_name)
    features, labels = encode_labels(dataset)
    # The helper returns its six arrays in a non-standard order; unpack in exactly that order.
    (x_train, x_test, x_val,
     y_val, y_train, y_test) = train_val_test_split(features, labels)

    # Stage 2: fit the vectorizer on the training texts only, then train the classifier.
    print('Train on', dataset_name)
    vectorizer = fit_vectorizer(x_train)
    x_train_vec, x_val_vec, x_test_vec = transform_input(
        vectorizer, x_train, x_val, x_test
    )
    nb = train_model(x_train_vec, y_train)

    # Stage 3: report accuracy on the validation subset, then on the test subset.
    print('Evaluate Model')
    for split_label, split_vec, split_targets in (
        ('  Validation Accuracy:', x_val_vec, y_val),
        ('  Test Accuracy:', x_test_vec, y_test),
    ):
        predict = nb.predict(split_vec)
        print(split_label, accuracy_score(split_targets, predict))
    print('')

Train Val Test Split uci
  train: 337935
  val: 63363
  test: 21121
Train on uci
Evaluate Model
  Validation Accuracy: 0.9456780771112479
  Test Accuracy: 0.943752663226173

Train Val Test Split news_v2
  train: 160677
  val: 30127
  test: 10043
Train on news_v2
Evaluate Model
  Validation Accuracy: 0.5481793739834699
  Test Accuracy: 0.5579010255899631

