# Multinomial Naive Bayes Methodology

Multinomial Naive Bayes is a common method used in text classifiation and is one of the simplest and most practical learning methods

In [50]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

data = pd.read_csv('training.csv')

print(data.head())

   article_number                                      article_words  \
0               1  open,absent,cent,cent,cent,stock,inflow,rate,k...   
1               2  morn,stead,end,end,day,day,day,patch,patch,pat...   
2               3  socc,socc,world,world,recent,law,fifa,fifa,fif...   
3               4  open,forint,forint,forint,forint,cent,cent,ste...   
4               5  morn,complet,weekend,minut,minut,minut,arrow,d...   

           topic  
0  FOREX MARKETS  
1  MONEY MARKETS  
2         SPORTS  
3  FOREX MARKETS  
4     IRRELEVANT  


### Baseline Model

In [51]:
#Count Vectorizer tokenizes and counts the word occurance for each document and builds a vocabulary of known words
count = CountVectorizer()
test_data = count.fit_transform(data["article_words"])

#Train / Development Split
x_train = test_data[:9000]
y_train = data["topic"][:9000]

x_dev = test_data[9000:]
y_dev = data["topic"][9000:]

nb = MultinomialNB()
nb.fit(x_train, y_train)
y_pred = nb.predict(x_dev)

print("Accuracy score for base model:", accuracy_score(y_pred, y_dev))

Accuracy score for base model: 0.732


### Feature Extraction Tuning

In [52]:
def feature_test(vector):
    test_data = vector.fit_transform(data["article_words"])

    x_train = test_data[:9000]
    y_train = data["topic"][:9000]
    x_dev = test_data[9000:]
    y_dev = data["topic"][9000:]

    nb = MultinomialNB()
    nb.fit(x_train, y_train)
    y_pred = nb.predict(x_dev)
    print("Number of features: ", x_train.shape[1])
    print("Accuracy score:", accuracy_score(y_pred, y_dev))

#Some words will be very present (e.g. “the”, “a”, “is” in English) 
#hence carrying very little meaningful information about the actual contents of the document.
#tf–idf transformation gives weight to words depeding on their frequency 
vector = TfidfVectorizer()
feature_test(vector)

#Stop words are words like “and”, “the”, “him”, which are presumed to be uninformative in representing the content of a text
vector = CountVectorizer(stop_words="english")
feature_test(vector)

#Unigrams and bigrams (groups of 2 words) can be used
vector = CountVectorizer(ngram_range=(1,2))
feature_test(vector)

#min_df: ignore terms that have a document frequency strictly lower than the given threshold. Default 1
vector = CountVectorizer(min_df = 2)
feature_test(vector)

#final chosen
vector = CountVectorizer(stop_words="english", ngram_range=(1,2), min_df = 2)
feature_test(vector)

Number of features:  35822
Accuracy score: 0.676
Number of features:  35725
Accuracy score: 0.736
Number of features:  378659
Accuracy score: 0.754
Number of features:  18376
Accuracy score: 0.73
Number of features:  113355
Accuracy score: 0.76


### Parameter Tuning

In [53]:
from sklearn.model_selection import GridSearchCV
param_grid = {'alpha': np.asarray(range(1,10,1))}
grid = GridSearchCV(MultinomialNB(), param_grid, cv=5)
grid.fit(x_train, y_train)
print("Best cross-validation score: ", grid.best_score_)
print("Best parameters: ", grid.best_params_)

Best cross-validation score:  0.741111111111111
Best parameters:  {'alpha': 2}


### Final Results for Selected Model on Test Data

In [54]:
#Chosen vectorizer 
vector = CountVectorizer(stop_words="english", ngram_range=(1,2), min_df = 2)
#Transform and fit training data
x_train = vector.fit_transform(data["article_words"])
y_train = data["topic"]

#Build Final Model
final_model = MultinomialNB(2)
final_model.fit(x_train, y_train)

#Import test data
testdata = pd.read_csv('test.csv')
#Transform test data
x_test = vector.transform(testdata["article_words"])
y_test = testdata["topic"]
#Vocabulary size
print("Number of features: ", x_test.shape[1])
#Predict
y_pred = final_model.predict(x_test)

#Cross Validation Results
from sklearn.model_selection import cross_val_score
scores = cross_val_score(final_model, x_train, y_train, cv=5)
print("Cross Val Scores: ", scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

#Classification Report
print(classification_report(y_pred, y_test))

Number of features:  113355
Cross Val Scores:  [0.74105263 0.74157895 0.74052632 0.74368421 0.74894737]
Accuracy: 0.74 (+/- 0.01)
                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.33      1.00      0.50         1
BIOGRAPHIES PERSONALITIES PEOPLE       0.07      1.00      0.12         1
                         DEFENCE       0.46      1.00      0.63         6
                DOMESTIC MARKETS       0.00      0.00      0.00         0
                   FOREX MARKETS       0.06      0.38      0.11         8
                          HEALTH       0.36      0.83      0.50         6
                      IRRELEVANT       0.84      0.81      0.83       276
                   MONEY MARKETS       0.91      0.46      0.61       138
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         0
                  SHARE LISTINGS       0.00      0.00      0.00         0
                          SPORTS       1.00      0.94  

  _warn_prf(average, modifier, msg_start, len(result))


We can see above that some classes are not being classified at all. They are instead being misclassified, this may be due to class imbalance. We fix this by oversampling the classes with fewer instances.

In [55]:
#Oversample
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
x_train, y_train = oversample.fit_resample(x_train, y_train)

#Build Final Model
final_model = MultinomialNB(2)
final_model.fit(x_train, y_train)

#Import test data
testdata = pd.read_csv('test.csv')
#Transform test data
x_test = vector.transform(testdata["article_words"])
y_test = testdata["topic"]
#Predict
y_pred = final_model.predict(x_test)

#Classification Report
print(classification_report(y_pred, y_test))

                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.67      0.33      0.44         6
BIOGRAPHIES PERSONALITIES PEOPLE       0.47      0.64      0.54        11
                         DEFENCE       0.69      0.60      0.64        15
                DOMESTIC MARKETS       0.50      0.14      0.22         7
                   FOREX MARKETS       0.65      0.47      0.54        66
                          HEALTH       0.64      0.64      0.64        14
                      IRRELEVANT       0.73      0.90      0.81       216
                   MONEY MARKETS       0.67      0.51      0.58        91
          SCIENCE AND TECHNOLOGY       0.33      0.33      0.33         3
                  SHARE LISTINGS       0.71      0.56      0.63         9
                          SPORTS       0.98      0.95      0.97        62

                        accuracy                           0.73       500
                       macro avg    