# Layer 1
Binary classification: predicting whether each article's topic is relevant or irrelevant.

In [18]:
import warnings

import numpy as np
import pandas as pd
import sklearn
from joblib import load, dump
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score # get a specific score of a model using CV
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedShuffleSplit
# from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS # a list of english stop words
from sklearn.preprocessing import LabelEncoder

from sklearn.exceptions import ConvergenceWarning

In [9]:
# Load the training and test splits (in that order) from the parent directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("../training.csv", "../test.csv")
)

## Encode labels: relevant → 0, irrelevant → 1

In [10]:
# Keep the article text and the topic names as separate Series.
text_train = df_train["article_words"]
label_train = df_train["topic"]

# Binary target: 1 where the topic is "IRRELEVANT", 0 for every other topic.
y_train = np.where(label_train == "IRRELEVANT", 1, 0)

# Number of irrelevant articles in the training set (shown as the cell output).
y_train.sum()

4734

In [11]:
# Attach the binary target to the original dataframe so it can be inspected
# next to the `topic` column it was derived from.
# Note: this adds the column to df_train in place. The column name keeps its
# existing spelling ("is_irrelevent") because it is a runtime identifier.
df_train["is_irrelevent"] = y_train
# Show the first rows as a quick sanity check on the encoding.
df_train.head()

Unnamed: 0,article_number,article_words,topic,is_irrelevent
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",FOREX MARKETS,0
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",MONEY MARKETS,0
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",SPORTS,0
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",FOREX MARKETS,0
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",IRRELEVANT,1


In [16]:
# Bag-of-words features: the vectorizer learns its vocabulary from the training
# text only; min_df=5 drops terms that occur in fewer than 5 documents.
vect = CountVectorizer(min_df=5)
X_train = vect.fit_transform(text_train)
# Display the sparse-matrix summary (shape and number of stored elements).
X_train

<9500x9495 sparse matrix of type '<class 'numpy.int64'>'
	with 719744 stored elements in Compressed Sparse Row format>

In [13]:
%%time
# Ignore ConvergenceWarning raised from sklearn modules.
# NOTE(review): this installs a global warnings filter, so cells executed later
# in the same kernel are affected as well.
warnings.filterwarnings(
    action="ignore",
    category=ConvergenceWarning,
    module="sklearn",
)

# Accuracy on each fold of a 5-fold CV for a default LogisticRegression baseline.
# n_jobs specifies how many processors are used in parallel; -1 means all.
# NOTE(review): per the scikit-learn docs, the estimator-level n_jobs only
# parallelises over classes in one-vs-rest mode, so it may have no effect for
# this binary target - confirm, and consider cross_val_score(n_jobs=...) instead.
accs = cross_val_score(
    estimator=LogisticRegression(n_jobs=-1),
    X=X_train,
    y=y_train,
    cv=5,
)
print(f"Mean accuracy (std): {np.mean(accs): .3f} ({np.std(accs): .3f})")

Mean accuracy (std):  0.832 ( 0.004)
CPU times: user 164 ms, sys: 24.8 ms, total: 189 ms
Wall time: 3.82 s


In [14]:
%%time
# Candidate values for the inverse regularisation strength C:
# five geometrically spaced points from 0.001 to 10 (both ends included).
param_grid = {"C": np.geomspace(0.001, 10, num=5, endpoint=True)}

# Exhaustive search over the grid, scored by 5-fold cross-validation.
grid = GridSearchCV(
    estimator=LogisticRegression(n_jobs=-1),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print(f"best parameter: {grid.best_params_}")
print(f"best 5-fold CV score (accuracy): {grid.best_score_: .3f}")

best parameter: {'C': 0.01}
best 5-fold CV score (accuracy):  0.859
CPU times: user 75.3 ms, sys: 15.2 ms, total: 90.5 ms
Wall time: 4.93 s


In [17]:
%%time
# Held-out test split: article text and topic names.
text_test = df_test.article_words
label_test = df_test.topic

# Binary target, encoded the same way as for training:
# "IRRELEVANT" -> 1, every other topic -> 0.
# (The former bare `y_test.sum()` was removed: as a mid-cell expression its
# value was neither displayed nor stored.)
y_test = np.where(label_test == "IRRELEVANT", 1, 0)

# Reuse the vocabulary already fitted on the training text; the vectorizer is
# only applied to the test text, never refitted on it.
X_test = vect.transform(text_test)

# Score the fitted grid search on the test features (accuracy).
# NOTE(review): with GridSearchCV's default refit=True this scores the best
# estimator refitted on all of X_train - confirm the default is in effect.
acc_test = grid.score(X_test, y_test)
print(f"Logistic regression accuracy on test set: {acc_test:.3f}")

Logistic regression accuracy on test set: 0.880
CPU times: user 44.2 ms, sys: 3.97 ms, total: 48.2 ms
Wall time: 53.1 ms


## Test the best logistic regression model
The cell below shows where the saved model is stored and how to load and use it.

In [19]:
# Load the persisted pipeline from disk. joblib.load accepts a path directly,
# so no explicit file handle is needed.
# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
# code - only load model files that come from a trusted source.
best_logi = load("Models/best_logi.joblib")

# Display the loaded pipeline and its hyper-parameters.
best_logi



Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=5, ngram_range=(1, 1), norm=None,
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('LR',
                 LogisticRegression(C=0.001, class_weight='balanced',
                                    dual=False, fit_intercept=True,
          

In [20]:
# Refit the loaded pipeline on the raw training text; the pipeline does its own
# vectorisation, so it takes text rather than the count matrix.
best_logi.fit(text_train, y_train)

# Accuracy on the training split first, then on the held-out test split.
train_acc, test_acc = (
    best_logi.score(texts, targets)
    for texts, targets in ((text_train, y_train), (text_test, y_test))
)
print(f"Training accuracy: {train_acc:.3f}\n"
      f"Test accuracy: {test_acc:.3f}")

Training accuracy: 0.942
Test accuracy: 0.878
