# 0. Import Packages



In [None]:
# Import packages for data set preprocessing, feature engineering, and model training
import pandas as pd
import numpy as np

from google.colab import drive
from glob import glob

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, svm, ensemble, metrics
from sklearn.utils import shuffle

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import warnings
from gensim.parsing.preprocessing import remove_stopwords
import nltk



In [None]:
# Silence every warning category for the rest of the notebook session.
warnings.simplefilter("ignore")

In [None]:
# Fetch the NLTK stop-word corpus only if it is not already available locally,
# so re-running the notebook does not trigger a download every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 1. Data Preprocessing
## 1.1 Combine all data into one .csv file

In [None]:
# Combine the five categories into five .csv files respectively by using a for-loop.
# Each article file becomes one row: title (first line), content (list of the
# remaining lines) and label (the category folder name).

drive.mount('/content/drive')
Category = ['business', 'entertainment', 'politics', 'sport', 'tech',]
root_path= '/content/drive/MyDrive/Data/bbc/'
for cat in Category:
  title_list = []
  content_list = []
  # sorted() gives a reproducible row order; glob() alone returns files in arbitrary order.
  for file_path in sorted(glob(root_path + cat + '/*')):
    with open(file_path, 'r', encoding="ISO-8859-1") as f:
      content = f.readlines()
    # Drop the first blank line (presumably the separator after the title).
    # Guarded so a file without any blank line no longer raises ValueError.
    if '\n' in content:
      content.remove('\n')
    if not content:
      # Nothing left to use as a title: skip empty files instead of failing on content[0].
      continue
    title_list.append(content[0].strip('\n'))
    content_list.append([line.strip('\n') for line in content[1:]])
  data = pd.DataFrame()
  data['title'] = pd.Series(title_list)
  data['content'] = pd.Series(content_list)
  data['label'] = cat
  # NOTE(review): 'content' is a Python list, so the CSV stores its string repr
  # (e.g. "['para 1', '', 'para 2']"); later cells vectorize that string as-is.
  csv_path = root_path + cat + '.csv'
  data.to_csv(csv_path,header=True,index=False,encoding='utf_8_sig')




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Combine 5 individual files into an entire .csv file, which is bbc_news.csv
# DataFrame.append was removed in pandas 2.0, so the frames are joined with pd.concat.
df0 = pd.read_csv('/content/drive/MyDrive/Data/bbc/business.csv')
print(df0.shape)
df1 = pd.read_csv('/content/drive/MyDrive/Data/bbc/entertainment.csv')
df2 = pd.read_csv('/content/drive/MyDrive/Data/bbc/politics.csv')
df3 = pd.read_csv('/content/drive/MyDrive/Data/bbc/sport.csv')
df4 = pd.read_csv('/content/drive/MyDrive/Data/bbc/tech.csv')
# Add one category at a time and print the running shape after each step.
for category_df in (df1, df2, df3, df4):
    df0 = pd.concat([df0, category_df], ignore_index=True)
    print(df0.shape)
# Written with a literal path (same location the inputs are read from) so this
# cell does not depend on a variable defined in a different cell.
df0.to_csv('/content/drive/MyDrive/Data/bbc/bbc_news.csv',header=True,index=False,encoding='utf_8_sig')

(510, 3)
(896, 3)
(1313, 3)
(1824, 3)
(2225, 3)


In [None]:
# Load the combined data set and preview its first rows.
bbc_news_path = '/content/drive/MyDrive/Data/bbc/bbc_news.csv'
bbc_news_df = pd.read_csv(bbc_news_path)
bbc_news_df.head()

Unnamed: 0,title,content,label
0,Ask Jeeves tips online ad revival,['Ask Jeeves has become the third leading onli...,business
1,UK economy facing 'major risks',['The UK manufacturing sector will continue to...,business
2,US interest rate rise expected,"[""US interest rates are expected to rise for t...",business
3,Ex-Boeing director gets jail term,['An ex-chief financial officer at Boeing has ...,business
4,Bank set to leave rates on hold,['UK interest rates are set to remain on hold ...,business


In [None]:
# Working frame: article body as 'text' plus its category name as 'label'.
mainDF = pd.DataFrame({
    'text': bbc_news_df['content'],
    'label': bbc_news_df['label'],
})

In [None]:
# Shuffle the rows with a fixed seed so that the row order, and therefore the
# cross-validation folds computed later, are the same on every run.
mainDF = shuffle(mainDF, random_state=42)

In [None]:
# Map each category name in 'label' to an integer code stored in 'Category'.
label_encoder = preprocessing.LabelEncoder()
mainDF['Category'] = label_encoder.fit_transform(mainDF['label'])
mainDF.head()

Unnamed: 0,text,label,Category
238,['US retail giant Federated Department Stores ...,business,0
344,['US phone company SBC Communications said it ...,business,0
2085,['UK mobile owners continue to break records w...,tech,4
1713,['Listen to the full interview on Sport on Fiv...,sport,3
1608,['Tim Henman was named player of the year for ...,sport,3


# 1.2 Split data into train set and test set

In [None]:
# Hold out part of the data for the final evaluation; random_state pins the partition.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    mainDF['text'],
    mainDF['Category'],
    random_state=42,
)

In [None]:
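# (Disabled) The labels 'business', 'entertainment', 'politics', 'sport', and 'tech' are already encoded as 0, 1, 2, 3, 4 respectively in mainDF['Category'] before the split, so the code below is kept commented out.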
#encoder = preprocessing.LabelEncoder()
#train_y = encoder.fit_transform(train_y)
#test_y = encoder.fit_transform(test_y)

In [None]:
#train_y=train_y.reshape(-1, 1)
#test_y=test_y.reshape(-1, 1)

# 2. Feature Engineering
## 2.1 Word Frequency Vector as Feature

In [None]:
# Bag-of-words counter: learn the vocabulary of every token in the corpus,
# then turn the whole corpus into a document-term count matrix.
vectorizer = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}').fit(mainDF['text'])
x_count = vectorizer.transform(mainDF['text'])

In [None]:
# Second counter: NLTK English stop words removed and the vocabulary capped at 2000 features.
english_stop_words = nltk.corpus.stopwords.words('english')
vectorizer_t = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words=english_stop_words,
    max_features=2000,
).fit(mainDF['text'])
x_count_t = vectorizer_t.transform(mainDF['text'])

In [None]:
# Document-term count matrices for the train split, the test split and the
# full corpus, all produced by the already-fitted `vectorizer`.
xtrain_count, xtest_count, x_count = (
    vectorizer.transform(docs) for docs in (train_x, test_x, mainDF['text'])
)

# 2.2 TF-IDF as Feature

In [None]:
# Word-level tf-idf (vocabulary capped at 2000 features), fitted on the full corpus
# and applied to the full corpus, the train split and the test split.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=2000).fit(mainDF['text'])
x_tfidf, xtrain_tfidf, xtest_tfidf = (
    tfidf_vect.transform(docs) for docs in (mainDF['text'], train_x, test_x)
)

In [None]:
# Word n-gram tf-idf (2- and 3-word sequences, vocabulary capped at 2000 features)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=2000)
tfidf_vect_ngram.fit(mainDF['text'])
# Fixed: the full-corpus matrix was built with the word-level `tfidf_vect`,
# so cross-validation on x_tfidf_ngram silently re-used the plain tf-idf features.
x_tfidf_ngram = tfidf_vect_ngram.transform(mainDF['text'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xtest_tfidf_ngram = tfidf_vect_ngram.transform(test_x)

In [None]:
# Character n-gram tf-idf (2- and 3-character sequences, capped at 2000 features)
# token_pattern is omitted: it is only used when analyzer == 'word', and passing
# it with analyzer='char' only triggers a "will not be used" warning.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=2000)
tfidf_vect_ngram_chars.fit(mainDF['text'])
# Fixed: the full-corpus matrix was built with the word-level `tfidf_vect`,
# so cross-validation on x_tfidf_ngram_char silently re-used the plain tf-idf features.
x_tfidf_ngram_char = tfidf_vect_ngram_chars.transform(mainDF['text'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xtest_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(test_x)

  "The parameter 'token_pattern' will not be used"


# 2.3 Word-embedding Vector as Feature

# 3. Cross Validation for Model Selection 

In [None]:
# create the models
# These shared instances are the estimators passed to cross_val_score below.
nb = naive_bayes.MultinomialNB()
# NOTE(review): warnings are globally ignored in this notebook, so a convergence
# warning from LogisticRegression would be hidden — consider raising max_iter.
lr = linear_model.LogisticRegression()
# Fixed seed so the random-forest scores are reproducible from run to run.
rf = ensemble.RandomForestClassifier(random_state=42)

# 3 FOLDS

In [None]:
# 3-fold cross-validation, count features: nb on x_count_t
# (the stop-word-filtered, 2000-feature count matrix).
# NOTE(review): the 5- and 10-fold count-vector cells use x_count instead — confirm which is intended.
score = cross_val_score(estimator=nb, X=x_count_t, y=mainDF['Category'], cv=3)
avg_score = score.mean()
print(avg_score)

0.9730330907093568


In [None]:
# 3-fold cross-validation: lr on x_count_t; print the mean fold score.
score = cross_val_score(estimator=lr, X=x_count_t, y=mainDF['Category'], cv=3)
avg_score = score.mean()
print(avg_score)

0.9685383026991765


In [None]:
# 3-fold cross-validation: rf on x_count_t; print the mean fold score.
score = cross_val_score(estimator=rf, X=x_count_t, y=mainDF['Category'], cv=3)
avg_score = score.mean()
print(avg_score)

0.9555043874805543


In [None]:
# 3-fold cross-validation, word-level tf-idf features: nb on x_tfidf.
score = cross_val_score(estimator=nb, X=x_tfidf, y=mainDF['Category'], cv=3)
avg_score = score.mean()
print(avg_score)

0.9644915384736636


In [None]:
# 3-fold cross-validation: lr on x_tfidf; print the mean fold score.
score = cross_val_score(estimator=lr, X=x_tfidf, y=mainDF['Category'], cv=3)
avg_score = score.mean()
print(avg_score)

0.968087853887258


In [None]:
# 3-fold cross-validation: rf on x_tfidf; print the mean fold score.
score = cross_val_score(estimator=rf, X=x_tfidf, y=mainDF['Category'], cv=3)
avg_score = score.mean()
print(avg_score)

0.9537038047465058


In [None]:
# 3-fold cross-validation on the x_tfidf_ngram features: nb.
score = cross_val_score(estimator=nb, X=x_tfidf_ngram, y=mainDF['Category'], cv=3)
avg_score = score.mean()
print(avg_score)

0.9644915384736636


In [None]:
# 3-fold cross-validation: lr on x_tfidf_ngram; print the mean fold score.
score = cross_val_score(estimator=lr, X=x_tfidf_ngram, y=mainDF['Category'], cv=3)
avg_score = score.mean()
print(avg_score)

0.968087853887258


In [None]:
# 3-fold cross-validation: rf on x_tfidf_ngram; print the mean fold score.
score = cross_val_score(estimator=rf, X=x_tfidf_ngram, y=mainDF['Category'], cv=3)
avg_score = score.mean()
print(avg_score)

0.9478655516391367


In [None]:
# 3-fold cross-validation on the x_tfidf_ngram_char features: nb.
score = cross_val_score(estimator=nb, X=x_tfidf_ngram_char, y=mainDF['Category'], cv=3)
avg_score = score.mean()
print(avg_score)

0.9644915384736636


In [None]:
# 3-fold cross-validation: lr on x_tfidf_ngram_char; print the mean fold score.
score = cross_val_score(estimator=lr, X=x_tfidf_ngram_char, y=mainDF['Category'], cv=3)
avg_score = score.mean()
print(avg_score)

0.968087853887258


In [None]:
# 3-fold cross-validation: rf on x_tfidf_ngram_char; print the mean fold score.
score = cross_val_score(estimator=rf, X=x_tfidf_ngram_char, y=mainDF['Category'], cv=3)
avg_score = score.mean()
print(avg_score)

0.9537098673146339


# 5 Folds


In [None]:
# 5-fold cross-validation, count features: nb on x_count.
score = cross_val_score(estimator=nb, X=x_count, y=mainDF['Category'], cv=5)
avg_score = score.mean()
print(avg_score)

0.9721348314606741


In [None]:
# 5-fold cross-validation: lr on x_count; print the mean fold score.
score = cross_val_score(estimator=lr, X=x_count, y=mainDF['Category'], cv=5)
avg_score = score.mean()
print(avg_score)

0.9635955056179775


In [None]:
# 5-fold cross-validation: rf on x_count; print the mean fold score.
score = cross_val_score(estimator=rf, X=x_count, y=mainDF['Category'], cv=5)
avg_score = score.mean()
print(avg_score)

0.9591011235955056


In [None]:
# 5-fold cross-validation, word-level tf-idf features: nb on x_tfidf.
score = cross_val_score(estimator=nb, X=x_tfidf, y=mainDF['Category'], cv=5)
avg_score = score.mean()
print(avg_score)

0.9649438202247189


In [None]:
# 5-fold cross-validation: lr on x_tfidf; print the mean fold score.
score = cross_val_score(estimator=lr, X=x_tfidf, y=mainDF['Category'], cv=5)
avg_score = score.mean()
print(avg_score)

0.9689887640449438


In [None]:
# 5-fold cross-validation: rf on x_tfidf; print the mean fold score.
score = cross_val_score(estimator=rf, X=x_tfidf, y=mainDF['Category'], cv=5)
avg_score = score.mean()
print(avg_score)

0.9555056179775281


In [None]:
# 5-fold cross-validation on the x_tfidf_ngram features: nb.
score = cross_val_score(estimator=nb, X=x_tfidf_ngram, y=mainDF['Category'], cv=5)
avg_score = score.mean()
print(avg_score)

0.9649438202247189


In [None]:
# 5-fold cross-validation: lr on x_tfidf_ngram; print the mean fold score.
score = cross_val_score(estimator=lr, X=x_tfidf_ngram, y=mainDF['Category'], cv=5)
avg_score = score.mean()
print(avg_score)

0.9689887640449438


In [None]:
# 5-fold cross-validation: rf on x_tfidf_ngram; print the mean fold score.
score = cross_val_score(estimator=rf, X=x_tfidf_ngram, y=mainDF['Category'], cv=5)
avg_score = score.mean()
print(avg_score)

0.9573033707865168


In [None]:
# 5-fold cross-validation on the x_tfidf_ngram_char features: nb.
score = cross_val_score(estimator=nb, X=x_tfidf_ngram_char, y=mainDF['Category'], cv=5)
avg_score = score.mean()
print(avg_score)

0.9649438202247189


In [None]:
# 5-fold cross-validation: lr on x_tfidf_ngram_char; print the mean fold score.
score = cross_val_score(estimator=lr, X=x_tfidf_ngram_char, y=mainDF['Category'], cv=5)
avg_score = score.mean()
print(avg_score)

0.9689887640449438


In [None]:
# 5-fold cross-validation: rf on x_tfidf_ngram_char; print the mean fold score.
score = cross_val_score(estimator=rf, X=x_tfidf_ngram_char, y=mainDF['Category'], cv=5)
avg_score = score.mean()
print(avg_score)

0.9573033707865168


# 10 Folds

In [None]:
# 10-fold cross-validation, count features: nb on x_count.
score = cross_val_score(estimator=nb, X=x_count, y=mainDF['Category'], cv=10)
avg_score = score.mean()
print(avg_score)

0.9748232537470205


In [None]:
# 10-fold cross-validation: lr on x_count; print the mean fold score.
score = cross_val_score(estimator=lr, X=x_count, y=mainDF['Category'], cv=10)
avg_score = score.mean()
print(avg_score)

0.9671958954470167


In [None]:
# 10-fold cross-validation: rf on x_count; print the mean fold score.
score = cross_val_score(estimator=rf, X=x_count, y=mainDF['Category'], cv=10)
avg_score = score.mean()
print(avg_score)

0.9564012442936211


In [None]:
# 10-fold cross-validation, word-level tf-idf features: nb on x_tfidf.
score = cross_val_score(estimator=nb, X=x_tfidf, y=mainDF['Category'], cv=10)
avg_score = score.mean()
print(avg_score)

0.964931523451703


In [None]:
# 10-fold cross-validation: lr on x_tfidf; print the mean fold score.
score = cross_val_score(estimator=lr, X=x_tfidf, y=mainDF['Category'], cv=10)
avg_score = score.mean()
print(avg_score)

0.9725811012806528


In [None]:
# 10-fold cross-validation: rf on x_tfidf; print the mean fold score.
score = cross_val_score(estimator=rf, X=x_tfidf, y=mainDF['Category'], cv=10)
avg_score = score.mean()
print(avg_score)

0.9599907081969862


In [None]:
# 10-fold cross-validation on the x_tfidf_ngram features: nb.
score = cross_val_score(estimator=nb, X=x_tfidf_ngram, y=mainDF['Category'], cv=10)
avg_score = score.mean()
print(avg_score)

0.964931523451703


In [None]:
# 10-fold cross-validation: lr on x_tfidf_ngram; print the mean fold score.
score = cross_val_score(estimator=lr, X=x_tfidf_ngram, y=mainDF['Category'], cv=10)
avg_score = score.mean()
print(avg_score)

0.9725811012806528


In [None]:
# 10-fold cross-validation: rf on x_tfidf_ngram; print the mean fold score.
score = cross_val_score(estimator=rf, X=x_tfidf_ngram, y=mainDF['Category'], cv=10)
avg_score = score.mean()
print(avg_score)

0.9595523774895973


In [None]:
# 10-fold cross-validation on the x_tfidf_ngram_char features: nb.
score = cross_val_score(estimator=nb, X=x_tfidf_ngram_char, y=mainDF['Category'], cv=10)
avg_score = score.mean()
print(avg_score)

0.964931523451703


In [None]:
# 10-fold cross-validation: lr on x_tfidf_ngram_char; print the mean fold score.
score = cross_val_score(estimator=lr, X=x_tfidf_ngram_char, y=mainDF['Category'], cv=10)
avg_score = score.mean()
print(avg_score)

0.9725811012806528


In [None]:
# 10-fold cross-validation: rf on x_tfidf_ngram_char; print the mean fold score.
score = cross_val_score(estimator=rf, X=x_tfidf_ngram_char, y=mainDF['Category'], cv=10)
avg_score = score.mean()
print(avg_score)

0.9595362178321819


# 4. Model Testing and Evaluation  



In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_test, is_neural_net=False,
                label_test=None, target_names=None):
    """Fit a classifier on the training features and score it on the test features.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train : training feature matrix passed to ``fit``.
    label : training labels passed to ``fit``.
    feature_vector_test : test feature matrix passed to ``predict``.
    is_neural_net : when True, ``predict`` output is reduced with
        ``argmax(axis=-1)`` before scoring.
    label_test : true labels for ``feature_vector_test``. Defaults to the
        notebook-level ``test_y`` so existing calls keep working.
    target_names : class names shown in the report. Defaults to the five
        BBC categories in encoded order.

    Returns
    -------
    tuple
        ``(accuracy, report)`` from ``metrics.accuracy_score`` and
        ``metrics.classification_report``.
    """
    if label_test is None:
        # Backward-compatible fallback: earlier calls relied on the global hold-out labels.
        label_test = test_y
    if target_names is None:
        target_names = ['business','entertainment','politics','sport','tech']

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on the test dataset
    predictions = classifier.predict(feature_vector_test)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    accuracy = metrics.accuracy_score(label_test, predictions)
    report = metrics.classification_report(label_test, predictions, target_names=target_names)
    return accuracy, report


# 4.1 Naive Bayes Classifier

In [None]:
# Naive Bayes on Count Vectors
# The report text is bound to `report` so the `classification_report` function
# imported from sklearn.metrics is not overwritten by a string.
accuracy, report = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xtest_count)
print("NB, Count Vectors: ", accuracy)
print("NB, Count Vectors:")
print(report)

NB, Count Vectors:  0.9694793536804309
NB, Count Vectors:
               precision    recall  f1-score   support

     business       0.99      0.94      0.97       134
entertainment       0.98      0.95      0.96        95
     politics       0.93      1.00      0.96       103
        sport       1.00      0.99      1.00       131
         tech       0.94      0.97      0.95        94

     accuracy                           0.97       557
    macro avg       0.97      0.97      0.97       557
 weighted avg       0.97      0.97      0.97       557



In [None]:
# Naive Bayes on Word Level TF IDF Vectors
# The report text is bound to `report` so the `classification_report` function
# imported from sklearn.metrics is not overwritten by a string.
accuracy, report = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)
print("NB, WordLevel TF-IDF: ")
print(report)

NB, WordLevel TF-IDF:  0.9605026929982047
NB, WordLevel TF-IDF: 
               precision    recall  f1-score   support

     business       0.96      0.96      0.96       134
entertainment       0.99      0.91      0.95        95
     politics       0.93      0.99      0.96       103
        sport       0.98      1.00      0.99       131
         tech       0.95      0.94      0.94        94

     accuracy                           0.96       557
    macro avg       0.96      0.96      0.96       557
 weighted avg       0.96      0.96      0.96       557



In [None]:
# Naive Bayes on Ngram Level TF IDF Vectors
# The report text is bound to `report` so the `classification_report` function
# imported from sklearn.metrics is not overwritten by a string.
accuracy, report = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)
print("NB, N-Gram Vectors: ")
print(report)

NB, N-Gram Vectors:  0.9120287253141831
NB, N-Gram Vectors: 
               precision    recall  f1-score   support

     business       0.89      0.93      0.91       134
entertainment       0.96      0.76      0.85        95
     politics       0.94      0.94      0.94       103
        sport       0.93      0.98      0.96       131
         tech       0.85      0.91      0.88        94

     accuracy                           0.91       557
    macro avg       0.91      0.90      0.91       557
 weighted avg       0.91      0.91      0.91       557



In [None]:
# Naive Bayes on Character Level TF IDF Vectors
# The report text is bound to `report` so the `classification_report` function
# imported from sklearn.metrics is not overwritten by a string.
accuracy, report = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)
print("NB, CharLevel Vectors: ")
print(report)

NB, CharLevel Vectors:  0.9120287253141831
NB, CharLevel Vectors: 
               precision    recall  f1-score   support

     business       0.91      0.96      0.94       134
entertainment       0.98      0.63      0.77        95
     politics       0.86      0.98      0.91       103
        sport       0.90      1.00      0.95       131
         tech       0.95      0.93      0.94        94

     accuracy                           0.91       557
    macro avg       0.92      0.90      0.90       557
 weighted avg       0.92      0.91      0.91       557



# 4.2 Linear Classifier

In [None]:
# Linear Classifier on Count Vectors
# NOTE(review): this cell uses LogisticRegressionCV while the other linear-classifier
# cells use LogisticRegression — confirm the difference is intended.
# The report text is bound to `report` so the `classification_report` function
# imported from sklearn.metrics is not overwritten by a string.
accuracy, report = train_model(linear_model.LogisticRegressionCV(), xtrain_count, train_y, xtest_count)
print("LR, Count Vectors: ", accuracy)
print("LR, Count Vectors: ")
print(report)

LR, Count Vectors:  0.9605026929982047
LR, Count Vectors: 
               precision    recall  f1-score   support

     business       0.96      0.96      0.96       134
entertainment       0.99      0.96      0.97        95
     politics       0.92      0.95      0.94       103
        sport       0.98      1.00      0.99       131
         tech       0.95      0.93      0.94        94

     accuracy                           0.96       557
    macro avg       0.96      0.96      0.96       557
 weighted avg       0.96      0.96      0.96       557



In [None]:
# Linear Classifier on Word Level TF IDF Vectors
# The report text is bound to `report` so the `classification_report` function
# imported from sklearn.metrics is not overwritten by a string.
accuracy, report = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)
print("LR, WordLevel TF-IDF: ")
print(report)

LR, WordLevel TF-IDF:  0.9712746858168761
LR, WordLevel TF-IDF: 
               precision    recall  f1-score   support

     business       0.98      0.97      0.98       134
entertainment       0.99      0.93      0.96        95
     politics       0.95      0.97      0.96       103
        sport       0.99      1.00      1.00       131
         tech       0.93      0.98      0.95        94

     accuracy                           0.97       557
    macro avg       0.97      0.97      0.97       557
 weighted avg       0.97      0.97      0.97       557



In [None]:
# Linear Classifier on Ngram Level TF IDF Vectors
# The report text is bound to `report` so the `classification_report` function
# imported from sklearn.metrics is not overwritten by a string.
accuracy, report = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy)
print("LR, N-Gram Vectors: ")
print(report)

LR, N-Gram Vectors:  0.9245960502692998
LR, N-Gram Vectors: 
               precision    recall  f1-score   support

     business       0.90      0.96      0.93       134
entertainment       0.95      0.81      0.87        95
     politics       0.93      0.93      0.93       103
        sport       0.95      0.98      0.96       131
         tech       0.90      0.91      0.91        94

     accuracy                           0.92       557
    macro avg       0.93      0.92      0.92       557
 weighted avg       0.93      0.92      0.92       557



In [None]:
# Linear Classifier on Character Level TF IDF Vectors
# The report text is bound to `report` so the `classification_report` function
# imported from sklearn.metrics is not overwritten by a string.
accuracy, report = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy)
print("LR, CharLevel Vectors: ")
print(report)

LR, CharLevel Vectors:  0.9425493716337523
LR, CharLevel Vectors:                 precision    recall  f1-score   support

     business       0.95      0.92      0.94       134
entertainment       0.97      0.87      0.92        95
     politics       0.90      0.97      0.93       103
        sport       0.97      1.00      0.98       131
         tech       0.92      0.94      0.93        94

     accuracy                           0.94       557
    macro avg       0.94      0.94      0.94       557
 weighted avg       0.94      0.94      0.94       557



# 4.3 Random Forest classifier

In [None]:
# RF on Count Vectors
# random_state fixes the forest's randomness so the reported numbers are reproducible.
# The report text is bound to `report` so the `classification_report` function
# imported from sklearn.metrics is not overwritten by a string.
accuracy, report = train_model(ensemble.RandomForestClassifier(random_state=42), xtrain_count, train_y, xtest_count)
print("RF, Count Vectors: ", accuracy)
print("RF, Count Vectors: ")
print(report)

RF, Count Vectors:  0.9515260323159784
RF, Count Vectors: 
               precision    recall  f1-score   support

     business       0.95      0.96      0.96       134
entertainment       0.96      0.86      0.91        95
     politics       0.93      0.96      0.95       103
        sport       0.96      0.99      0.97       131
         tech       0.96      0.96      0.96        94

     accuracy                           0.95       557
    macro avg       0.95      0.95      0.95       557
 weighted avg       0.95      0.95      0.95       557



In [None]:
# RF on Word Level TF IDF Vectors
# random_state fixes the forest's randomness so the reported numbers are reproducible.
# The report text is bound to `report` so the `classification_report` function
# imported from sklearn.metrics is not overwritten by a string.
accuracy, report = train_model(ensemble.RandomForestClassifier(random_state=42), xtrain_tfidf, train_y, xtest_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)
print("RF, WordLevel TF-IDF: ")
print(report)

RF, WordLevel TF-IDF:  0.9587073608617595
RF, WordLevel TF-IDF: 
               precision    recall  f1-score   support

     business       0.95      0.97      0.96       134
entertainment       0.99      0.89      0.94        95
     politics       0.94      0.97      0.96       103
        sport       0.97      0.99      0.98       131
         tech       0.95      0.95      0.95        94

     accuracy                           0.96       557
    macro avg       0.96      0.95      0.96       557
 weighted avg       0.96      0.96      0.96       557



In [None]:
# RF on Ngram Level TF IDF Vectors
# random_state fixes the forest's randomness so the reported numbers are reproducible.
# The report text is bound to `report` so the `classification_report` function
# imported from sklearn.metrics is not overwritten by a string.
accuracy, report = train_model(ensemble.RandomForestClassifier(random_state=42), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print("RF, N-Gram Vectors: ", accuracy)
print("RF, N-Gram Vectors: ")
print(report)

RF, N-Gram Vectors:  0.8671454219030521
RF, N-Gram Vectors: 
               precision    recall  f1-score   support

     business       0.82      0.94      0.88       134
entertainment       0.94      0.69      0.80        95
     politics       0.93      0.83      0.88       103
        sport       0.84      0.97      0.90       131
         tech       0.87      0.84      0.85        94

     accuracy                           0.87       557
    macro avg       0.88      0.85      0.86       557
 weighted avg       0.87      0.87      0.86       557



In [None]:
# RF on Character Level TF IDF Vectors
# random_state fixes the forest's randomness so the reported numbers are reproducible.
# The report text is bound to `report` so the `classification_report` function
# imported from sklearn.metrics is not overwritten by a string.
accuracy, report = train_model(ensemble.RandomForestClassifier(random_state=42), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)
print("RF, CharLevel Vectors: ", accuracy)
print("RF, CharLevel Vectors: ")
print(report)

RF, CharLevel Vectors:  0.9371633752244165
RF, CharLevel Vectors: 
               precision    recall  f1-score   support

     business       0.95      0.93      0.94       134
entertainment       0.97      0.88      0.92        95
     politics       0.90      0.92      0.91       103
        sport       0.94      0.99      0.97       131
         tech       0.92      0.94      0.93        94

     accuracy                           0.94       557
    macro avg       0.94      0.93      0.93       557
 weighted avg       0.94      0.94      0.94       557

