In [10]:
import pandas as pd
import os
from pathlib import Path

import numpy as np
import re
import nltk
from sklearn.datasets import load_files
import pickle

from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split

import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

punctuations = string.punctuation

# Load spacy stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

In [11]:
datasets = {}

# Load uci
dataset_path = Path.cwd() / Path('dataset/cleaned/uci-news-aggregator.csv')
datasets['uci'] = pd.read_csv(dataset_path)

# Load news_v2
dataset_path = Path.cwd() / Path('dataset/cleaned/News_Category_Dataset_v2.csv')
datasets['news_v2'] = pd.read_csv(dataset_path)

# train, val, test split
Split the dataset as specified in the task (80/15/5) 

In [12]:
def train_val_test_split(features, labels):
    x_train, x_test, y_train, y_test = train_test_split(
        np.array(features), 
        np.array(labels), 
        test_size=0.05, # 5 % test
        random_state=42
    )

    x_train, x_val, y_train, y_val = train_test_split(
        np.array(x_train), 
        np.array(y_train), 
        test_size=3/19, # this evens out to 80% train 15% validation
        random_state=42
    )

    print('  train:', len(x_train))
    print('  val:', len(x_val))
    print('  test:', len(x_test))
    
    return x_train, x_test, x_val, y_val, y_train, y_test

In [13]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)

    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    return mytokens

In [14]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, x, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in x]

    def fit(self, x, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

I tested both simple vectorizers for spacy
tfidf performed much better

In [15]:
def train_model(vectorizer, classifier, x_train,y_train):
    # Create spacy pipeline
    pipe = Pipeline([
        ('cleaner', predictors()),
        ('vectorizer', vectorizer),
        ('classifier', classifier)
    ])

    # model generation
    pipe.fit(x_train,y_train)
    
    return pipe

# Select classifier & vectorizer

In [16]:
# testing simple bow vectorizer
bow_vector = CountVectorizer(
    tokenizer=spacy_tokenizer, 
    ngram_range=(1,1)
)

# testing tfidf vectorizer
tfidf_vector = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    min_df=3,
    ngram_range=(1,5),
)

#classifier = LogisticRegression()
classifier = LinearSVC()
# classifier = SVC() # takes too long to train (more than 8 hours for news_v2)

In [17]:
for dataset_name, dataset in datasets.items():  
    features = dataset['headline'] 
    labels = dataset['category']
    
    print('Train Val Test Split', dataset_name)   
    x_train, x_test, x_val, y_val, y_train, y_test = train_val_test_split(features, labels)
    
    print('Train on', dataset_name)
    model = train_model(tfidf_vector, classifier, x_train, y_train)
    
    print('Evaluate Model')
    # Predicting with a val dataset
    predicted = model.predict(x_val)
    print('  Validation Accuracy:', metrics.accuracy_score(y_val, predicted))
    
    print(metrics.classification_report(y_val, predicted))
    
    # Predicting with a test dataset
    predicted = model.predict(x_test)
    print('  Test Accuracy:', metrics.accuracy_score(y_test, predicted))
    
    print(metrics.classification_report(y_test, predicted))
    print('')
    
    

Train Val Test Split uci
  train: 337935
  val: 63363
  test: 21121
Train on uci
Evaluate Model
  Validation Accuracy: 0.9577671511765541
              precision    recall  f1-score   support

           b       0.94      0.94      0.94     17368
           e       0.98      0.98      0.98     22845
           m       0.97      0.94      0.95      6932
           t       0.94      0.94      0.94     16218

    accuracy                           0.96     63363
   macro avg       0.96      0.95      0.96     63363
weighted avg       0.96      0.96      0.96     63363

  Test Accuracy: 0.9569149188011932
              precision    recall  f1-score   support

           b       0.94      0.94      0.94      5857
           e       0.98      0.98      0.98      7647
           m       0.96      0.95      0.95      2207
           t       0.94      0.95      0.95      5410

    accuracy                           0.96     21121
   macro avg       0.96      0.95      0.95     21121
weighted av