In [1]:
import re
import nltk
import pickle
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
# Download the NLTK data packages (tokenizer models, stopword lists, WordNet).
# Presumably these are required by TextPreprocessor - confirm against text_preprocessing.py.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hareeshbahuleyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hareeshbahuleyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hareeshbahuleyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the labelled articles from a CSV path relative to the notebook's working directory.
dataset = pd.read_csv('data/bbc_news_train_val.csv')
print(f'Number of data instances = {dataset.shape[0]}')
# Last expression of the cell: rendered as a preview of the first three rows.
dataset.head(3)

Number of data instances = 1400


Unnamed: 0,ArticleId,Text,Category,CategoryId
0,1833,worldcom ex-boss launches defence lawyers defe...,business,0
1,154,german business confidence slides german busin...,business,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,0


In [4]:
# Class-balance check: number of articles per category.
dataset['Category'].value_counts()

sport            329
business         320
politics         253
entertainment    252
tech             246
Name: Category, dtype: int64

In [5]:
# Derive integer labels from the category names. factorize() numbers the
# categories in order of first appearance, so the ids depend on the row order.
# NOTE: this overwrites the 'CategoryId' column that is already present in the CSV.
dataset['CategoryId'] = dataset['Category'].factorize()[0]
dataset.head()

Unnamed: 0,ArticleId,Text,Category,CategoryId
0,1833,worldcom ex-boss launches defence lawyers defe...,business,0
1,154,german business confidence slides german busin...,business,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,0
3,1976,lifestyle governs mobile choice faster bett...,tech,1
4,917,enron bosses in $168m payout eighteen former e...,business,0


In [6]:
# category mappings
# Keep one row per distinct (Category, CategoryId) pair, ordered by id,
# then turn it into an id -> category-name lookup dictionary.
category_mapping_df = dataset[['Category', 'CategoryId']].drop_duplicates().sort_values('CategoryId')
category_mapping_dict = dict(zip(category_mapping_df.CategoryId, category_mapping_df.Category))
category_mapping_dict

{0: 'business', 1: 'tech', 2: 'politics', 3: 'sport', 4: 'entertainment'}

## Text Preprocessing

1. Remove HTML Tags
2. Remove Special Characters
3. Convert to Lower Case
4. Remove Stopwords
5. Word Lemmatization

In [7]:
# TextPreprocessor is defined in a separate module (text_preprocessing.py) rather than in
# this notebook, so that a pipeline pickled during training can be unpickled at inference time.
# The unpickling issues only showed up when spinning up the Flask app.
from text_preprocessing import TextPreprocessor

In [8]:
# check if the text preprocessor works as expected
# The sample sentence contains a stopword ("the"), a special character ("#"),
# an HTML tag ("<a/>") and a plural ("hikes") so each preprocessing step is exercised.
tp = TextPreprocessor()
tp.transform(['the german# chancellor <a/> announces tax hikes'])

['german chancellor announces tax hike']

## ML Model

#### Train Test Split

In [9]:
# Raw article texts and their integer labels as plain Python lists.
X_text = list(dataset['Text'])
y_label = list(dataset['CategoryId'])
# Hold out 10% of the articles for validation; the fixed random_state makes the shuffle reproducible.
X_train, X_val, y_train, y_val = train_test_split(X_text, y_label, test_size=0.1, random_state=10, shuffle=True)

#### TFIDF Vectorizer

In [10]:
# Fit a TF-IDF vocabulary (capped at 5000 features) on the training texts only.
# This baseline vectorizes the raw text directly; TextPreprocessor is not applied here.
# NOTE: X_train and y_train are overwritten with arrays, so re-running this cell without
# first re-running the split cell would pass vectors instead of text to the vectorizer.
tfidf_vec = TfidfVectorizer(max_features=5000)
X_train = tfidf_vec.fit_transform(X_train).toarray()
print('Input Shape ', X_train.shape)

y_train = np.array(y_train)
print('Output Shape ', y_train.shape)

Input Shape  (1260, 5000)
Output Shape  (1260,)


#### Multinomial Naive Bayes

In [11]:
# Baseline classifier with default hyperparameters, trained on the dense TF-IDF matrix.
ml_model = MultinomialNB()
ml_model.fit(X_train, y_train)

## Metrics on Validation Split

In [12]:
# Vectorize the validation texts with the vocabulary fitted on the training split
# (transform, not fit_transform), then predict their class ids.
# NOTE: X_val and y_val are overwritten with arrays here, so this cell is not re-runnable on its own.
X_val = tfidf_vec.transform(X_val).toarray()
y_pred = ml_model.predict(X_val)
y_val = np.array(y_val)

In [13]:
# Report overall accuracy plus per-class precision / recall / F1 for the baseline model.
print(f"Overall Accuracy on Validation Set = {accuracy_score(y_val, y_pred):.3f}\n")
print("Classification Report: \n", classification_report(y_val, y_pred))

Overall Accuracy on Validation Set = 0.964

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.97      0.98        29
           1       0.92      0.96      0.94        25
           2       0.90      0.95      0.93        20
           3       0.97      1.00      0.98        29
           4       1.00      0.95      0.97        37

    accuracy                           0.96       140
   macro avg       0.96      0.96      0.96       140
weighted avg       0.97      0.96      0.96       140



## Sklearn Pipeline

In [14]:
# Chain text preprocessing, TF-IDF vectorization and the Naive Bayes classifier into
# one estimator: fit/predict take raw texts and every step is applied in order.
text_clf = Pipeline([
    ('text_preprocessor', TextPreprocessor()),
    ('tfidf_vectorizer', TfidfVectorizer(max_features=5000)),
    ('mnb_classifier', MultinomialNB())
])

In [15]:
# Data
# Re-create the raw-text split: the earlier cells overwrote X_train / X_val with TF-IDF
# arrays. The same random_state and test_size are used as in the first split.
X_text = list(dataset['Text'])
y_label = list(dataset['CategoryId'])
X_train, X_val, y_train, y_val = train_test_split(X_text, y_label, test_size=0.1, random_state=10, shuffle=True)

# Model
# Fits every pipeline step on the training texts.
text_clf.fit(X_train, y_train)

In [16]:
def model_evaluate(text_clf, X, y_true, split_name='Validation'):
    """Print overall accuracy and a per-class report for a fitted classifier.

    Args:
        text_clf: Fitted estimator (e.g. the sklearn Pipeline) exposing ``predict``.
        X: Inputs accepted by ``text_clf.predict``.
        y_true: Ground-truth labels aligned with ``X``.
        split_name: Name of the evaluated split, shown in the accuracy line.
            The default 'Validation' reproduces the original message, so
            existing three-argument calls print exactly what they did before.

    Returns:
        None. The metrics are printed, not returned.
    """
    y_pred = text_clf.predict(X)
    y_true = np.array(y_true)
    print(f"Overall Accuracy on {split_name} Set = {accuracy_score(y_true, y_pred):.3f}\n")
    print("Classification Report: \n", classification_report(y_true, y_pred))

In [17]:
# Evaluate the full pipeline on the held-out validation texts.
model_evaluate(text_clf, X_val, y_val)

Overall Accuracy on Validation Set = 0.979

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        29
           1       0.96      0.96      0.96        25
           2       0.95      0.95      0.95        20
           3       0.97      1.00      0.98        29
           4       1.00      0.97      0.99        37

    accuracy                           0.98       140
   macro avg       0.98      0.98      0.98       140
weighted avg       0.98      0.98      0.98       140



## Save and Load Pipeline

In [18]:
def save_model_pipeline(model_pipeline, path='saved_models/model.pkl'):
    """Pickle a model pipeline to ``path``.

    The parent directory of ``path`` is created if it does not exist, so the
    default location works on a fresh checkout without a manual ``mkdir``.

    Args:
        model_pipeline: Any picklable object (here, the fitted sklearn Pipeline).
        path: Destination file; an existing file is overwritten.

    Returns:
        None. A confirmation message is printed.
    """
    import os  # local import so the function does not depend on the notebook's import cell

    parent_dir = os.path.dirname(path)
    # A bare file name has no directory part, and os.makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, 'wb') as fio:
        pickle.dump(model_pipeline, fio)
    print(f'Saved model successfully at {path}')
    
def load_model_pipeline(path='saved_models/model.pkl'):
    """Read a pickled model pipeline back from ``path`` and return it.

    Only load files you trust: unpickling can execute arbitrary code.

    Args:
        path: Location of the pickle file to read.

    Returns:
        The object stored in the file.
    """
    with open(path, 'rb') as pickle_file:
        restored_pipeline = pickle.load(pickle_file)

    status_message = f'Loaded model successfully from {path}'
    print(status_message)
    return restored_pipeline

In [19]:
# Round-trip the fitted pipeline through disk (default path) to check that it can be unpickled.
save_model_pipeline(text_clf)
model_pipeline = load_model_pipeline()

Saved model successfully at saved_models/model.pkl
Loaded model successfully from saved_models/model.pkl


## Predictor Function

In [20]:
def predict(model_pipeline, texts, pred_to_label=None):
    """Classify texts and pair each one with its predicted id and label name.

    Args:
        model_pipeline: Fitted estimator exposing ``predict`` on a list of texts.
        texts: Iterable of strings to classify. A single string is treated as
            one text rather than being iterated character by character.
        pred_to_label: Optional mapping from predicted id to label name.
            Defaults to the id -> category mapping derived from the training
            data in this notebook.

    Returns:
        A list of ``(text, predicted_id, label_name)`` tuples in input order;
        an empty list when ``texts`` is empty. Numpy scalar ids are converted
        to plain Python values so the result can be serialized (e.g. as JSON).

    Raises:
        KeyError: If a predicted id is missing from ``pred_to_label``.
    """
    if pred_to_label is None:
        pred_to_label = {0: 'business', 1: 'tech', 2: 'politics', 3: 'sport', 4: 'entertainment'}

    # Materialize the input once: a generator would otherwise be exhausted by
    # predict() and leave nothing for zip() to pair the predictions with.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Nothing to classify, so the model is not called on an empty batch.
        return []

    predictions = model_pipeline.predict(texts)

    # Make a list of texts with their respective predictions
    data = []
    for text, pred in zip(texts, predictions):
        # Unwrap numpy scalars (np.int64 -> int); leave other values untouched.
        label_id = pred.item() if hasattr(pred, 'item') else pred
        data.append((text, label_id, pred_to_label[label_id]))

    return data

In [21]:
# Smoke test: classify a few unseen headlines with the pipeline reloaded from disk.
# The __main__ guard keeps this from running if the exported code is imported as a module.
if __name__ =='__main__':
    # Random texts to be classified
    texts=['avatar in 3d set to be released at the cinemas mid december',
           'cryptocurrency crash and the bear market are consequences of the fed rate hike',
           'argentina defeat france to become world champions in one of the most exciting sporting events that the world has ever witnessed',
          ]
    
    # Each entry is a (text, predicted id, label name) tuple.
    predictions = predict(model_pipeline, texts)
    print(predictions)

[('avatar in 3d set to be released at the cinemas mid december', 4, 'entertainment'), ('cryptocurrency crash and the bear market are consequences of the fed rate hike', 0, 'business'), ('argentina defeat france to become world champions in one of the most exciting sporting events that the world has ever witnessed', 3, 'sport')]
