In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from io import StringIO
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
#import warnings

# Basic data cleaning
Only keep headlines and category for now
Remove special characters from headlines and convert to lowercase

In [4]:
dataset_path = Path.cwd() / Path('dataset/News_Category_Dataset_v2.json')
df = pd.read_json(dataset_path, lines = True)

In [5]:
reduced_df = df.drop(df[df['headline'].str.len() == 0].index, axis=0)

reduced_df = reduced_df[['headline', 'category']]

In [6]:
cleaned_df = pd.DataFrame(reduced_df)

def clean_headline(original_headline):
    # remove special characters
    cleaned_headline = re.sub(r'\W', ' ', original_headline)
    cleaned_headline = re.sub(r'\s+', ' ', original_headline)

    cleaned_headline = cleaned_headline.lower()
    return cleaned_headline

cleaned_df['headline'] = reduced_df['headline'].apply(clean_headline)

cleaned_df.head()

Unnamed: 0,headline,category
0,there were 2 mass shootings in texas last week...,CRIME
1,will smith joins diplo and nicky jam for the 2...,ENTERTAINMENT
2,hugh grant marries for the first time at age 57,ENTERTAINMENT
3,jim carrey blasts 'castrato' adam schiff and d...,ENTERTAINMENT
4,julianna margulies uses donald trump poop bags...,ENTERTAINMENT


Convert categorical labels to integer labels

In [8]:
unique_categories = cleaned_df['category'].unique()
numbered_categories = {category_name: category_id for category_id, category_name in enumerate(unique_categories)}

cleaned_df['category_id'] = cleaned_df['category'].apply(lambda category_name: numbered_categories[category_name])
cleaned_df.head()

Unnamed: 0,headline,category,category_id
0,there were 2 mass shootings in texas last week...,CRIME,0
1,will smith joins diplo and nicky jam for the 2...,ENTERTAINMENT,1
2,hugh grant marries for the first time at age 57,ENTERTAINMENT,1
3,jim carrey blasts 'castrato' adam schiff and d...,ENTERTAINMENT,1
4,julianna margulies uses donald trump poop bags...,ENTERTAINMENT,1


# train, val, test split
Split the dataset as specified in the task (80/15/5)
Also additionally reduce the number of samples to 70000 as the 16GB I have availble on my machine are not enough for more data

In [9]:
# Had to limit the data due to my limited memory
sample = cleaned_df.sample(70000, random_state=42)


from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    np.array(sample['headline']), 
    np.array(sample['category_id']), 
    test_size=0.05, # seperate 5 % test
    random_state=42
)

x_train, x_val, y_train, y_val = train_test_split(
    np.array(x_train), 
    np.array(y_train), 
    test_size=3/19, # this evens out to 80% train 15% validation
    random_state=42
)

print('train:', len(x_train))
print('val:', len(x_val))
print('test:', len(x_test))

train: 56000
val: 10500
test: 3500


Fit a vectorizer to see if a simple tfidf solution is enough to get a good result already

In [10]:
tfidf = TfidfVectorizer(
    sublinear_tf=True, 
    min_df=5,
    ngram_range=(1, 3),
    stop_words='english'
)

features = tfidf.fit_transform(x_train).toarray()
labels = y_train

print(features.shape)

(56000, 14027)


Test some classifiers on the validation set

In [11]:
models = [
    RandomForestClassifier(n_estimators=10, max_depth=10, random_state=0),
    LinearSVC(),
    MultinomialNB(),
]

entries = []
for model in models:
    model_name = model.__class__.__name__
    
    print('Training', model_name)
    clf = model.fit(features, labels)

    print('Predicting', model_name)
    y_pred = clf.predict(tfidf.transform(x_val))

    model_accuracy = accuracy_score(y_val, y_pred)
    print(model_name, 'Accuracy', model_accuracy)
    entries.append((model_name, model_accuracy))
    print('')
    
results= pd.DataFrame(entries, columns=['model_name', 'accuracy'])

Training RandomForestClassifier
Predicting RandomForestClassifier
RandomForestClassifier Accuracy 0.1921904761904762

Training LinearSVC
Predicting LinearSVC
LinearSVC Accuracy 0.5414285714285715

Training MultinomialNB
Predicting MultinomialNB
MultinomialNB Accuracy 0.4389523809523809



Compute accuracy for all three classifiers

In [12]:
accuracy = results.groupby('model_name')['accuracy'].mean()

acc = pd.concat([accuracy], axis= 1, ignore_index=True)
acc.columns = ['Accuracy']
acc

Unnamed: 0_level_0,Accuracy
model_name,Unnamed: 1_level_1
LinearSVC,0.541429
MultinomialNB,0.438952
RandomForestClassifier,0.19219
