## Imports and NLTK package downloads

In [1]:
import nltk
import subprocess
from typing import List
import numpy as np

In [2]:
# Fetch the WordNet corpus; WordNetLemmatizer is used in clean_text further down.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/modzyuba1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Fetch the 'punkt' tokenizer data; nltk.word_tokenize is called in clean_text.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/modzyuba1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Fetch the stop-word lists read later via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/modzyuba1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import string
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score

## Split the training data into train and validation sets

In [6]:
# Load the BBC news training CSV that is split into train/validation below.
train_data = pd.read_csv("./../data/bbc_news_train.csv")

In [8]:
# Number of rows in the loaded frame.
len(train_data)

1192

In [9]:
# Hold out the first 298 rows for validation. The training split takes rows
# 298 onward, so the two slices must be disjoint; slicing both with [298:]
# would make the validation set a copy of the training set.
val_data_csv = train_data.iloc[:298]

In [10]:
# Training split: every row from position 298 to the end of the frame.
train_data_csv = train_data.iloc[298:]

In [60]:
# NOTE(review): this path resolves to the same file the section's read_csv
# loaded ("./../data/bbc_news_train.csv"), so the source CSV is overwritten by
# the reduced split. Re-running the section would slice the already-sliced file.
train_data_csv.to_csv("../data/bbc_news_train.csv", index=False)

In [61]:
# Persist the validation split as its own CSV, without the DataFrame index.
val_data_csv.to_csv("../data/bbc_news_val.csv", index=False)

## Load and preprocess data

In [36]:
# Read the three splits in one pass; in every file the first CSV column is
# used as the DataFrame index (index_col=[0]).
train_data, val_data, test_data = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in (
        "./../data/bbc_news_train.csv",
        "./../data/bbc_news_val.csv",
        "./../data/bbc_news_test.csv",
    )
)

In [37]:
# NOTE(review): train_data was read from this same path with index_col=[0] and
# is written back here without its index, so the file loses its first column.
# Loading it again with index_col=[0] would then use the next column as the
# index; this cell is not safe to re-run together with the load cell.
train_data.to_csv("./../data/bbc_news_train.csv", index=False)

In [38]:
# NOTE(review): val_data was read from this same path with index_col=[0] and
# is written back here without its index, so the file loses its first column.
# Loading it again with index_col=[0] would then use the next column as the
# index; this cell is not safe to re-run together with the load cell.
val_data.to_csv("./../data/bbc_news_val.csv", index=False)

In [13]:
# Keep the English stop words in a set: clean_text tests every token for
# membership, which is a hash lookup on a set but a linear scan on the list
# that stopwords.words() returns.
stop_words = set(stopwords.words('english'))

In [14]:
def clean_text(doc):
    """Normalize a raw document into a space-separated string of lemmas.

    Every character found in ``string.punctuation`` and every digit is
    removed, the remainder is lowercased and tokenized with NLTK, tokens
    present in the module-level ``stop_words`` are discarded, and the
    survivors are lemmatized with ``WordNetLemmatizer``.

    Args:
        doc: Text to clean; it is iterated character by character.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    kept_chars = []
    for ch in doc:
        # Characters are deleted, not replaced by a space, so text on both
        # sides of a removed character is joined together.
        if ch in string.punctuation or ch.isdigit():
            continue
        kept_chars.append(ch)
    stripped = "".join(kept_chars)

    words = nltk.word_tokenize(stripped.lower())

    wnl = WordNetLemmatizer()
    lemmas = (wnl.lemmatize(word) for word in words if word not in stop_words)
    return ' '.join(lemmas)

In [15]:
# TF-IDF vectorizer configured with max_df=0.8 and min_df=2.
# NOTE(review): clean_text already filters tokens against the NLTK stop-word
# list, and stop_words='english' here adds the vectorizer's own English list on
# top of that; confirm the double filtering is intended.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')

In [20]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,ArticleId,Text,Category
298,25,mobile audio enters new dimension as mobile ph...,tech
299,292,freeze on anti-spam campaign a campaign by lyc...,tech
300,1916,no seasonal lift for house market a swathe of ...,business
301,428,microsoft takes on desktop search microsoft ha...,tech
302,435,s korean lender faces liquidation creditors of...,business


In [21]:
# Clean the training texts, then fit the TF-IDF vectorizer on them; this is
# the only split on which the vectorizer is fitted.
train_data['Clean_text'] = train_data['Text'].apply(clean_text)
y_train = train_data['Category'].to_list()
X_train = train_data['Clean_text'].tolist()
train_features = tfidf_vectorizer.fit_transform(X_train)

In [28]:
# Validation texts get the same cleaning and are projected with transform(),
# so the vectorizer is not refitted on them.
val_data['Clean_text'] = val_data['Text'].apply(clean_text)
y_val = val_data['Category'].to_list()
X_val = val_data['Clean_text'].tolist()
val_features = tfidf_vectorizer.transform(X_val)

In [None]:
# Test texts: same cleaning, then transform() with the existing vectorizer.
# No label list is built for this split.
test_data['Clean_text'] = test_data['Text'].apply(clean_text)
cleaned_texts = test_data['Clean_text'].tolist()
test_features = tfidf_vectorizer.transform(cleaned_texts)

In [22]:
# Sanity check: X_train is the plain Python list produced by .tolist().
type(X_train)

list

## Train a classifier and print the metrics on a validation set

In [24]:
# Fit a multinomial naive Bayes classifier on the TF-IDF training matrix and
# the training category labels.
clf = MultinomialNB().fit(train_features, y_train)

In [30]:
# Predicted categories for the validation features; compared with y_val below.
predicted = clf.predict(val_features)

In [34]:
# Micro-averaged F1 of the validation predictions.
f1_score(y_val, predicted, average='micro')

0.9697986577181208

In [35]:
# Macro-averaged F1 of the validation predictions.
f1_score(y_val, predicted, average='macro')

0.9679329681550314

In [85]:
# Report name -> (scoring function, value for its `average` keyword).
# An empty string marks a metric that the evaluation loop calls without
# passing `average` at all.
metrics = {
    "f1_macro" : (f1_score, "macro"),
    "accuracy" : (accuracy_score, ""),
    "precision_macro" : (precision_score, "macro"),
    "recall_macro" : (recall_score, "macro"),
}

In [86]:
# Two independent, initially empty accumulators: metric names and their values.
metrics_list, values_list = [], []

In [87]:
# Start from empty accumulators on every run, so that re-executing this cell
# does not append duplicate entries to results collected by an earlier run.
metrics_list = []
values_list = []

# Each entry of `metrics` is (scoring function, averaging mode).
for metric_name, (metric_fn, average) in metrics.items():
    metrics_list.append(metric_name)
    print(metric_name)
    if average == "":
        # Empty mode: call the metric without the `average` keyword.
        values_list.append(metric_fn(y_val, predicted))
    else:
        print(average)
        values_list.append(metric_fn(y_val, predicted, average=average))

f1_macro
macro
accuracy
precision_macro
macro
recall_macro
macro


In [84]:
# Names of the computed metrics, in the order they were evaluated.
# NOTE(review): the output saved under this cell lists '*_micro' names and has
# a lower execution count than the cell defining `metrics`, so it predates the
# current 'macro' keys; re-run to refresh it.
metrics_list

['f1_micro', 'accuracy', 'precision_micro', 'recall_micro']

In [88]:
# Metric values, index-aligned with metrics_list.
values_list

[0.983424552343841, 0.9839142091152815, 0.9853081421502473, 0.9818761178180193]

## Predict on the test set

In [21]:
# Predicted categories for the test features.
# NOTE(review): this rebinds `predicted`, which previously held the validation
# predictions; re-running the validation metric cells after this one would
# score y_val against test-set predictions.
predicted = clf.predict(test_features)

In [24]:
# Number of article ids in the test frame.
len(test_data['ArticleId'].to_list())

735

In [25]:
# Number of rows in the test frame.
len(test_data)

735

In [26]:
# Number of predictions; should equal the test row count shown above.
len(predicted.tolist())

735

In [28]:
# Pair every test article id with its predicted category, row for row.
article_ids = test_data['ArticleId'].to_list()
predicted_categories = predicted.tolist()
submission = pd.DataFrame(
    {
        'ArticleId': article_ids,
        'Category': predicted_categories,
    }
)

In [None]:
# Write the submission to the current working directory, without the index.
submission.to_csv('submission.csv', index=False)