In [1]:
import os
import numpy as np
import pandas as pd
import re

from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

# Load cleaned datasets

In [2]:
datasets = {}

# Load uci
dataset_path = Path.cwd() / Path('dataset/cleaned/uci-news-aggregator.csv')
datasets['uci'] = pd.read_csv(dataset_path)

# Load news_v2
dataset_path = Path.cwd() / Path('dataset/cleaned/News_Category_Dataset_v2.csv')
datasets['news_v2'] = pd.read_csv(dataset_path)

Convert categorical labels to integer labels

In [3]:
def transform_labels(dataset):
    unique_categories = dataset['category'].unique()
    numbered_categories = {category_name: category_id for category_id, category_name in enumerate(unique_categories)}

    dataset['category'] = dataset['category'].apply(lambda category_name: numbered_categories[category_name])
    return dataset

# train, val, test split
Split the dataset as specified in the task (80/15/5)

In [4]:
def train_val_test_split(features, labels):
    x_train, x_test, y_train, y_test = train_test_split(
        np.array(features), 
        np.array(labels), 
        test_size=0.05, # 5 % test
        random_state=42
    )

    x_train, x_val, y_train, y_val = train_test_split(
        np.array(x_train), 
        np.array(y_train), 
        test_size=3/19, # this evens out to 80% train 15% validation
        random_state=42
    )

    print('  train:', len(x_train))
    print('  val:', len(x_val))
    print('  test:', len(x_test))
    
    return x_train, x_test, x_val, y_val, y_train, y_test

Fit a vectorizer to see if a simple tfidf solution is enough to get a good result already

In [5]:
def fit_vectorizer(x_train, y_train):
    tfidf = TfidfVectorizer(
        sublinear_tf=True, 
        min_df=5,
        ngram_range=(1, 3),
        stop_words='english'
    )

    features = tfidf.fit_transform(x_train).toarray()
    labels = y_train
    
    return features, labels, tfidf

Test some classifiers on the validation set

In [6]:
def train_model(model_type, features, labels):
    clf = model_type.fit(features, labels)
    return clf

Also additionally reduce the number of samples to 70000 as the 16GB I have availble on my machine are not enough for more data

In [9]:
for dataset_name, dataset in datasets.items(): 
    print('Using dataset', dataset_name)
    
    # Had to limit the data due to my limited memory
    dataset = transform_labels(dataset)
    dataset = dataset.sample(70000, random_state=42)
    
    features = dataset['headline'] 
    labels = dataset['category']
    
    x_train, x_test, x_val, y_val, y_train, y_test = train_val_test_split(features, labels)
    
    x_train, y_train, tfidf = fit_vectorizer(x_train, y_train)
    
    models = [
        RandomForestClassifier(n_estimators=10, max_depth=10, random_state=0),
        LinearSVC(),
        MultinomialNB(),
        LogisticRegression(),
    ]

    for model_type in models:
        model_name = model_type.__class__.__name__
        
        print('Training', model_name)
        model = train_model(model_type, x_train, y_train)

        print('  Predicting...')
        y_pred = model.predict(tfidf.transform(x_val))
        model_accuracy = accuracy_score(y_val, y_pred)
        print('    Validation Accuracy', model_accuracy)

        y_pred = model.predict(tfidf.transform(x_test))
        model_accuracy = accuracy_score(y_test, y_pred)
        print('    Test Accuracy', model_accuracy)
    print('')
    

Using dataset uci
  train: 56000
  val: 10500
  test: 3500
Training RandomForestClassifier
  Predicting...
    Validation Accuracy 0.5144761904761905
    Test Accuracy 0.5031428571428571
Training LinearSVC
  Predicting...
    Validation Accuracy 0.9244761904761905
    Test Accuracy 0.9251428571428572
Training MultinomialNB
  Predicting...
    Validation Accuracy 0.91
    Test Accuracy 0.9137142857142857
Training LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


  Predicting...
    Validation Accuracy 0.917047619047619
    Test Accuracy 0.9202857142857143

Using dataset news_v2
  train: 56000
  val: 10500
  test: 3500
Training RandomForestClassifier
  Predicting...
    Validation Accuracy 0.1921904761904762
    Test Accuracy 0.19428571428571428
Training LinearSVC
  Predicting...
    Validation Accuracy 0.5414285714285715
    Test Accuracy 0.538
Training MultinomialNB
  Predicting...
    Validation Accuracy 0.4389523809523809
    Test Accuracy 0.43714285714285717
Training LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


  Predicting...
    Validation Accuracy 0.5402857142857143
    Test Accuracy 0.5345714285714286

