## Imports, downloading nltk packages

In [56]:
import nltk
import subprocess
from typing import List
import numpy as np

In [57]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/modzyuba1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [58]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/modzyuba1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [59]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/modzyuba1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [120]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import string
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
import configparser
import os
import sys
from predict import Predictor
from train import Trainer
from validate import Validator
from preprocess import Preprocessor
from sklearn.dummy import DummyClassifier
from utils import save_ckpt
from typing import List, Tuple

## Divide train data into train and validation splits 

In [51]:
train_data = pd.read_csv("./../data/bbc_news_train.csv")

In [8]:
len(train_data)

1192

In [9]:
val_data_csv = train_data.iloc[298:]

In [10]:
train_data_csv = train_data.iloc[298:]

In [60]:
train_data_csv.to_csv("../data/bbc_news_train.csv", index=False)

In [61]:
val_data_csv.to_csv("../data/bbc_news_val.csv", index=False)

## Load and preprocess data

In [36]:
train_data = pd.read_csv("./../data/bbc_news_train.csv", index_col=[0])
val_data = pd.read_csv("./../data/bbc_news_val.csv", index_col=[0])
test_data = pd.read_csv("./../data/bbc_news_test.csv", index_col=[0])

In [37]:
train_data.to_csv("./../data/bbc_news_train.csv", index=False)

In [38]:
val_data.to_csv("./../data/bbc_news_val.csv", index=False)

In [61]:
stop_words = stopwords.words('english')

In [62]:
def clean_text(doc):
    text = "".join([char for char in doc if char not in string.punctuation and not char.isdigit()])
    
    tokens = nltk.word_tokenize(text.lower())
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    return ' '.join(tokens)

In [65]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')

In [53]:
train_data.head()

Unnamed: 0,ArticleId,Text,Category
0,25,mobile audio enters new dimension as mobile ph...,tech
1,292,freeze on anti-spam campaign a campaign by lyc...,tech
2,1916,no seasonal lift for house market a swathe of ...,business
3,428,microsoft takes on desktop search microsoft ha...,tech
4,435,s korean lender faces liquidation creditors of...,business


In [66]:
train_data['Clean_text'] = train_data['Text'].apply(lambda x: clean_text(x))
X_train = train_data['Clean_text'].tolist()
train_features = tfidf_vectorizer.fit_transform(X_train)
y_train = train_data['Category'].to_list()

In [28]:
val_data['Clean_text'] = val_data['Text'].apply(lambda x: clean_text(x))
X_val = val_data['Clean_text'].tolist()
val_features = tfidf_vectorizer.transform(X_val)
y_val = val_data['Category'].to_list()

In [None]:
test_data['Clean_text'] = test_data['Text'].apply(lambda x: clean_text(x))
cleaned_texts = test_data['Clean_text'].tolist()
test_features = tfidf_vectorizer.transform(cleaned_texts)

In [22]:
type(X_train)

list

## Train a classifier and print the metrics on a validation set

In [24]:
clf = MultinomialNB().fit(train_features, y_train)

In [30]:
predicted = clf.predict(val_features)

In [34]:
f1_score(y_val, predicted, average='micro')

0.9697986577181208

In [35]:
f1_score(y_val, predicted, average='macro')

0.9679329681550314

In [85]:
metrics = {
    "f1_macro" : (f1_score, "macro"),
    "accuracy" : (accuracy_score, ""),
    "precision_macro" : (precision_score, "macro"),
    "recall_macro" : (recall_score, "macro"),
}

In [86]:
metrics_list = []
values_list = []

In [87]:
for k, v in metrics.items():
    metrics_list.append(k)
    print(k)
    if v[1] == "":
        values_list.append(v[0](y_val, predicted))
    else:
        print(v[1])
        values_list.append(v[0](y_val, predicted, average=v[1]))

f1_macro
macro
accuracy
precision_macro
macro
recall_macro
macro


In [84]:
metrics_list

['f1_micro', 'accuracy', 'precision_micro', 'recall_micro']

In [88]:
values_list

[0.983424552343841, 0.9839142091152815, 0.9853081421502473, 0.9818761178180193]

## Predict on a test set 

In [21]:
predicted = clf.predict(test_features)

In [24]:
len(test_data['ArticleId'].to_list())

735

In [25]:
len(test_data)

735

In [26]:
len(predicted.tolist())

735

In [28]:
submission = pd.DataFrame({'ArticleId': test_data['ArticleId'].to_list(), 'Category': predicted.tolist()}) 

In [None]:
submission.to_csv('submission.csv', index=False)

## Tests: predict

In [84]:
config = configparser.ConfigParser()
config.read('/Users/modzyuba1/ITMO/big-data-lab1/config.ini')

['/Users/modzyuba1/ITMO/big-data-lab1/config.ini']

In [4]:
PROJECT_ROOT = "/Users/modzyuba1/ITMO/big-data-lab1/"

In [47]:
os.mkdir(os.path.join(PROJECT_ROOT, "tmp"))

FileExistsError: [Errno 17] File exists: '/Users/modzyuba1/ITMO/big-data-lab1/tmp'

In [13]:
train_df = pd.DataFrame({
    "ArticleId": [1, 2],
    "Text": ["Business is good!", "Football is a popular sport"],
    "Category": ["business", "sport"]
})

In [14]:
train_df.head()

Unnamed: 0,ArticleId,Text,Category
0,1,Business is good!,business
1,2,Football is a popular sport,sport


In [51]:
train_df.to_csv(os.path.join(PROJECT_ROOT, "tmp", "train_tmp.csv"), index=False)

In [11]:
path_to_test_data = os.path.join(PROJECT_ROOT, "tmp", "train_tmp.csv")
path_to_vectorizer_ck

pt = os.path.join(PROJECT_ROOT, config['vectorizer']['path_to_vectorizer_ckpt'])
path_to_model_ckpt = os.path.join(PROJECT_ROOT, config['model']['path_to_model_ckpt'])
path_to_submission = os.path.join(PROJECT_ROOT, "tmp", "submission_tmp.csv")

In [5]:
sys.path.append(os.path.join(PROJECT_ROOT, "src"))

In [65]:
path_to_vectorizer_ckpt

'/Users/modzyuba1/ITMO/big-data-lab1/ckpts/tfidf_vectorizer.pkl'

In [12]:
predictor = Predictor()
predictor.predict(path_to_test_data, path_to_model_ckpt, path_to_vectorizer_ckpt, path_to_submission)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/modzyuba1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/modzyuba1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/modzyuba1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [16]:
submission_tmp = pd.read_csv(os.path.join(PROJECT_ROOT, "tmp", "submission_tmp.csv"))

In [20]:
assert submission_tmp.iloc[0]["Category"] == "business" and submission_tmp.iloc[0]["ArticleId"] == 1

In [21]:
assert submission_tmp.iloc[1]["Category"] == "sport" and submission_tmp.iloc[1]["ArticleId"] == 2

## Tests: train

In [43]:
path_to_train_data = os.path.join(PROJECT_ROOT, config['data']['path_to_train_data'])
path_to_vectorizer_ckpt = os.path.join(PROJECT_ROOT, config['tests']['path_to_tmp_vectorizer_ckpt'])
path_to_model_ckpt = os.path.join(PROJECT_ROOT, config['tests']['path_to_tmp_model_ckpt'])

In [44]:
trainer = Trainer()
trainer.train(path_to_train_data, path_to_model_ckpt, path_to_vectorizer_ckpt)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/modzyuba1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/modzyuba1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/modzyuba1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
assert os.path.exists(path_to_model_ckpt), "Model was not saved after training"

In [46]:
assert os.path.exists(path_to_vectorizer_ckpt), "Vectorizer was not saved after training"

# Tests: validate

In [92]:
path_to_train_data = os.path.join(PROJECT_ROOT, config['data']['path_to_train_data'])
path_to_tmp_val_data = os.path.join(PROJECT_ROOT, config['tests']['path_to_tmp_val_data'])
path_to_vectorizer_ckpt = os.path.join(PROJECT_ROOT, config['vectorizer']['path_to_vectorizer_ckpt'])
path_to_dummy_model_ckpt = os.path.join(PROJECT_ROOT, config['tests']['path_to_dummy_model_ckpt'])
path_to_tmp_metrics = os.path.join(PROJECT_ROOT, config['tests']['path_to_tmp_metrics'])

In [47]:
val_df = pd.DataFrame({
    "ArticleId": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "Text": [
        "70s was a decade of legendary rock stars",
        "Taylor Swift is a famous American singer",
        "The goverment decided to increase taxes",
        "It is the largest oil company in Japan",
        "I think he is a great guitarist",
        "Tell search engines that your website exists",
        "LG releases a new flagship smartphone next month",
        "He's the starting goalie on the hockey team",
        "She wants to become an excellent long-distance swimmer"
    ],
    "Category": [
        "entertainment",
        "entertainment",
        "politics",
        "business",
        "entertainment",
        "tech",
        "tech",
        "sport",
        "sport",
    ]
})

In [48]:
val_df.to_csv(os.path.join(PROJECT_ROOT, "tmp", "val_tmp.csv"), index=False)

In [50]:
dummy = DummyClassifier(strategy="most_frequent")

In [95]:
dummy.fit(train_features, y_train)

In [68]:
val_df['Clean_text'] = val_df['Text'].apply(lambda x: clean_text(x))
X_val = val_df['Clean_text'].tolist()
val_features = tfidf_vectorizer.transform(X_val)
y_val = val_df['Category'].to_list()

In [96]:
save_ckpt(dummy, path_to_dummy_model_ckpt)

In [106]:
validator = Validator()
validator.validate(path_to_tmp_val_data, path_to_dummy_model_ckpt, path_to_vectorizer_ckpt, path_to_tmp_metrics)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/modzyuba1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/modzyuba1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/modzyuba1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  _warn_prf(average, modifier, msg_start, len(result))


In [94]:
accuracy_score([
        "entertainment",
        "entertainment",
        "politics",
        "business",
        "entertainment",
        "tech",
        "tech",
        "sport",
        "sport",
    ], ["business" for x in range(9)])

0.1111111111111111

In [99]:
1 / 9

0.1111111111111111

In [90]:
dummy.predict(['sport'])

array(['business'], dtype='<U8')

In [101]:
tmp_metrics = pd.read_csv(path_to_tmp_metrics)

In [102]:
assert tmp_metrics.iloc[0]["metric"] == "f1_macro" and round(tmp_metrics.iloc[0]["value"], 2) == 0.04

In [103]:
assert tmp_metrics.iloc[1]["metric"] == "accuracy" and round(tmp_metrics.iloc[1]["value"], 2) == 0.11

In [104]:
assert tmp_metrics.iloc[2]["metric"] == "precision_macro" and round(tmp_metrics.iloc[2]["value"], 2) == 0.02

In [105]:
assert tmp_metrics.iloc[3]["metric"] == "recall_macro" and round(tmp_metrics.iloc[3]["value"], 2) == 0.2

# Test: preprocess

In [107]:
path_to_tmp_test_data = os.path.join(PROJECT_ROOT, config['tests']['path_to_tmp_test_data'])

In [108]:
preprocessor = Preprocessor()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/modzyuba1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/modzyuba1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/modzyuba1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [117]:
is_test_false = preprocessor.load_and_preprocess_data(path_to_tmp_test_data, isTest=False)

In [118]:
is_test_true = preprocessor.load_and_preprocess_data(path_to_tmp_test_data, isTest=True)

In [127]:
assert type(is_test_false) is tuple

In [126]:
assert type(is_test_true) is list

In [128]:
X_train, y_train = is_test_false

In [129]:
X_test = is_test_false

In [134]:
assert X_train[0] == 'business good' and X_train[1] == 'football popular sport'