In [0]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder  
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

In [0]:
################################################# Data Prep #################################################
## Load the raw articles from the CSV file
df = pd.read_csv('bbc-text.csv')
## Rotate the column order by one so the last column comes first (readability only)
cols = df.columns.tolist()
cols = [*cols[-1:], *cols[:-1]]
df = df[cols]
## Encode each category name as an integer label; `le` keeps the code <-> name mapping
le = LabelEncoder()
le.fit(df['category'])
df['label'] = le.transform(df['category'])
print(df[['category', 'label']])

In [0]:
################################################# Training #################################################
## Transform each article's text to a feature vector using Count Vectorizer, TFIDF is another approach
## Fixed seed: without it the split (and every score printed below) changes on each run
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], random_state=RANDOM_STATE)
cv = CountVectorizer(strip_accents='ascii', lowercase=True, stop_words='english')
## The vocabulary is fitted on the training split only; the test split is transformed with it
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

## Look at the top word counts
## get_feature_names() was removed in scikit-learn 1.2; fall back to it only on old versions
if hasattr(cv, 'get_feature_names_out'):
  feature_names = cv.get_feature_names_out()
else:
  feature_names = cv.get_feature_names()
## NOTE: toarray() densifies the whole document-term matrix; fine for a small corpus, costly for a large one
word_freq_df = pd.DataFrame(X_train_cv.toarray(), columns=feature_names)
## Per-word totals over the training split, most frequent first (single column named 0)
top_words_df = pd.DataFrame(word_freq_df.sum()).sort_values(0, ascending=False)

## Fit a multinomial Naive Bayes classifier on the word counts and predict the held-out split
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_cv, y_train)
predictions = naive_bayes.predict(X_test_cv)

## Precision/recall are averaged across classes, weighted by each class's support
print('Accuracy Score: {}'.format(accuracy_score(y_test, predictions)))
print('Precision score: {}'.format(precision_score(y_test, predictions, average='weighted')))
print('Recall score: {}'.format(recall_score(y_test, predictions, average='weighted')))
#print(predictions)

In [0]:
## Build a review table of the test articles the classifier got wrong.
## Display names come from the fitted LabelEncoder (`le`) rather than a hand-written
## code -> name table, so they cannot drift out of sync with the actual encoding.
label_names = {code: str(name).capitalize() for code, name in enumerate(le.classes_)}

## Human-readable predicted category for every test article
testing_pred = [label_names[code] for code in predictions]

## Decode the true labels column-by-column instead of running a frame-wide replace(),
## which would also rewrite any matching value in the other columns
check = pd.DataFrame({
  'actual_label': [label_names[code] for code in y_test],
  'predictions': testing_pred,
  'text': list(X_test),
})
## 1 where the predicted category differs from the true one, 0 otherwise
check['wrong_preds'] = np.where(check['actual_label'] != check['predictions'], 1, 0)
## Show only the misclassified articles
check.loc[check['wrong_preds'] == 1]