In [74]:
import os
from tqdm.notebook import tqdm,tnrange
from bs4 import BeautifulSoup as bs
from nltk.stem import PorterStemmer,SnowballStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# NLTK resources needed further down: the punkt tokenizer models
# (word_tokenize), the English stop-word list, and the WordNet corpus that
# WordNetLemmatizer looks words up in. Without 'wordnet' the first
# lemmatize() call raises LookupError on a machine that never downloaded it.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/darshparikh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/darshparikh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [75]:
# Both CSV files are resolved relative to the notebook's working directory.
train_path, test_path = 'BBC News Train.csv', 'BBC News Test.csv'
df, test = (pd.read_csv(path) for path in (train_path, test_path))
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [76]:
# Number of distinct (non-missing) labels in the training data.
df['Category'].nunique()

5

In [77]:
# Holds the (stop_words, lemmatizer) pair shared by every preprocess() call.
# Filled lazily so that defining the function does not touch the NLTK corpora.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if 'pair' not in _PREPROCESS_RESOURCES:
        pair = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        # Stored only after both objects exist, so a failed lookup is retried.
        _PREPROCESS_RESOURCES['pair'] = pair
    return _PREPROCESS_RESOURCES['pair']


def _is_word(token):
    """True for tokens of 2+ characters made only of letters and hyphens, with at least one letter."""
    return len(token) > 1 and token.replace('-', '').isalpha()


def preprocess(text):
    """Normalise one document into a space-separated string of lemmas.

    Steps: cast to ``str`` and lower-case, tokenize with ``word_tokenize``,
    drop English stop words, keep only word-like tokens, lemmatize each token
    with ``WordNetLemmatizer`` and join the result with single spaces.

    A token is word-like when it has at least two characters and consists of
    letters, optionally joined by hyphens (``ex-boss``). Tokens containing
    digits or other punctuation are dropped. Unlike the previous
    ``w.isalpha() or '-' in w`` test, a token that merely contains a hyphen
    (for example ``--``) no longer passes.

    Parameters
    ----------
    text : any
        Raw document; converted with ``str()`` first, so non-string values are
        processed as their string representation.

    Returns
    -------
    str
        The cleaned, lemmatized text (empty string if nothing survives).
    """
    # The stop-word list and the lemmatizer are loop-invariant: reuse one copy
    # instead of rebuilding them for every row passed through .apply().
    stop_words, lemmatizer = _get_preprocess_resources()

    tokens = word_tokenize(str(text).lower())
    kept = [w for w in tokens if w not in stop_words and _is_word(w)]
    return ' '.join(lemmatizer.lemmatize(w) for w in kept)

    


In [78]:
# Clean every article body with preprocess() and write the result back onto
# the frame; `text` keeps a handle on the cleaned column for a quick look.
text = df['Text'].apply(preprocess)
df['Text'] = text
text.head()

0    worldcom ex-boss launch defence lawyer defendi...
1    german business confidence slide german busine...
2    bbc poll indicates economic gloom citizen majo...
3    lifestyle governs mobile choice faster better ...
4    enron boss payout eighteen former enron direct...
Name: Text, dtype: object

In [79]:
# word -> {category -> how many times the word occurs in that category's articles}
word_dict = {}

# Walk the cleaned text and its label side by side.
for text, category in zip(df['Text'], df['Category']):
    # The text is already cleaned, so whitespace splitting recovers the tokens.
    for word in text.split():
        # First sighting of a word creates its (empty) per-category tally.
        per_category = word_dict.setdefault(word, {})
        # Missing categories start from 0 before the increment.
        per_category[category] = per_category.get(category, 0) + 1

word_dict

{'worldcom': {'business': 54},
 'ex-boss': {'business': 2},
 'launch': {'business': 23,
  'entertainment': 14,
  'tech': 58,
  'sport': 5,
  'politics': 17},
 'defence': {'business': 17,
  'sport': 37,
  'entertainment': 1,
  'tech': 9,
  'politics': 15},
 'lawyer': {'business': 26,
  'entertainment': 15,
  'tech': 8,
  'politics': 19,
  'sport': 7},
 'defending': {'business': 2, 'sport': 17, 'tech': 1, 'politics': 4},
 'former': {'business': 72,
  'entertainment': 52,
  'politics': 81,
  'sport': 85,
  'tech': 15},
 'chief': {'business': 154,
  'tech': 45,
  'sport': 24,
  'politics': 57,
  'entertainment': 15},
 'bernie': {'business': 8},
 'ebbers': {'business': 45},
 'battery': {'business': 1, 'tech': 23, 'sport': 3, 'entertainment': 1},
 'fraud': {'business': 64, 'tech': 15, 'politics': 20, 'entertainment': 1},
 'charge': {'business': 61,
  'tech': 17,
  'entertainment': 15,
  'sport': 47,
  'politics': 30},
 'called': {'business': 30,
  'politics': 60,
  'sport': 15,
  'tech': 65,

In [80]:
word_dict = {}

# category -> number of articles carrying that label
class_count = {}
for category in df['Category']:
    class_count[category] = class_count.get(category, 0) + 1

class_count

{'business': 336,
 'tech': 261,
 'politics': 274,
 'sport': 346,
 'entertainment': 273}

In [81]:
# Rebuild both tallies from scratch so this cell does not rely on earlier ones.
word_dict = {}
class_count = {}

# Single pass over the rows: count articles per category and, for every word,
# how often it occurs in each category's articles.
for text, category in zip(df['Text'], df['Category']):
    class_count[category] = class_count.get(category, 0) + 1
    for word in text.split():
        per_category = word_dict.setdefault(word, {})
        per_category[category] = per_category.get(category, 0) + 1

# Turn the raw counts into TF-ICF weights in place.
#   ICF(word)       = log10(number of categories / categories containing the word)
#   TF-ICF(word, c) = count of the word in category c * ICF(word)
# A word seen in every category therefore gets ICF = 0 and weight 0 everywhere.
n_classes = len(class_count)
for per_category in word_dict.values():
    # At this point the inner dict holds category keys only, so its length is
    # the number of categories the word appears in.
    icf = math.log10(n_classes / len(per_category))
    for category in per_category:
        per_category[category] *= icf
    # Stored next to the category weights; readers must skip the 'ICF' key.
    per_category['ICF'] = icf

In [82]:
# Show a handful of entries rather than printing the whole vocabulary-sized
# mapping, which floods the cell output. Each value maps category -> TF-ICF
# weight and also carries the word's 'ICF' entry.
dict(list(word_dict.items())[:5])



In [83]:
articles = df['ArticleId'].to_list()
text = df['Text'].to_list()
categories = df['Category'].to_list()

# Distinct labels in order of first appearance. Every row is seeded with a 0.0
# score for each of them: previously a category that none of an article's
# words had a weight for was simply absent from that row, which becomes NaN
# once the dict is turned into a DataFrame (and an article with no tokens had
# no score columns at all).
category_names = list(dict.fromkeys(categories))

# ArticleId -> {'Answer': true label, <category>: summed TF-ICF weight, ...}
tf_icf_scores = {}
for article, body, answer in zip(articles, text, categories):
    if article not in tf_icf_scores:
        row = {'Answer': answer}
        row.update(dict.fromkeys(category_names, 0.0))
        tf_icf_scores[article] = row
    # A repeated ArticleId keeps its first label and keeps accumulating.
    scores = tf_icf_scores[article]

    # Each token occurrence adds the word's weight for every category it has one in.
    for token in body.split():
        for category, weight in word_dict.get(token, {}).items():
            if category != 'ICF':  # 'ICF' is bookkeeping, not a category score
                scores[category] = scores.get(category, 0.0) + weight

tf_icf_scores

{1833: {'Answer': 'business',
  'business': 856.162083936406,
  'entertainment': 17.188244143583006,
  'tech': 56.84745842050777,
  'sport': 13.93478922063218,
  'politics': 47.69877471344847},
 154: {'Answer': 'business',
  'business': 480.02593850686253,
  'sport': 21.797181270110148,
  'politics': 127.13694487454282,
  'entertainment': 8.770529807604943,
  'tech': 78.77780660126764},
 1101: {'Answer': 'business',
  'business': 387.0199028146262,
  'tech': 29.062969928134684,
  'politics': 105.86306296036418,
  'entertainment': 9.640196039109938,
  'sport': 12.909978904102802},
 1976: {'Answer': 'tech',
  'tech': 1365.4996790937612,
  'entertainment': 130.5114674631093,
  'politics': 48.63781234149487,
  'business': 133.58415032702732,
  'sport': 12.414005821773582},
 917: {'Answer': 'business',
  'business': 311.23783177408825,
  'tech': 41.739268236592686,
  'politics': 26.764749510802073,
  'sport': 20.4181546570355,
  'entertainment': 23.082659123629526},
 1582: {'Answer': 'polit

In [84]:
# Turn the per-article score dict into a table: with orient='index' each outer
# key (ArticleId) becomes a row label and each inner key ('Answer' plus the
# category scores) becomes a column.
Modified_df = pd.DataFrame.from_dict(tf_icf_scores,orient='index')
Modified_df.head()

Unnamed: 0,Answer,business,entertainment,tech,sport,politics
1833,business,856.162084,17.188244,56.847458,13.934789,47.698775
154,business,480.025939,8.77053,78.777807,21.797181,127.136945
1101,business,387.019903,9.640196,29.06297,12.909979,105.863063
1976,tech,133.58415,130.511467,1365.499679,12.414006,48.637812
917,business,311.237832,23.082659,41.739268,20.418155,26.76475


In [85]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import metrics
# Features are the per-category TF-ICF score columns; the target is the true label.
X = Modified_df.drop(columns=['Answer'])
y = Modified_df['Answer']

# NOTE(review): the word weights behind these features were computed from every
# row of df (labels included) before this split, so the held-out rows are not
# truly unseen and the accuracy below is probably optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit a Gaussian naive Bayes classifier and evaluate it on the held-out 30%.
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Per-class precision / recall / F1.
print(metrics.classification_report(y_test, y_pred))

print('-------------------------------------------')

def calculate_category_prob(train):
    """Return the share of each label among the entries of ``train``.

    Parameters
    ----------
    train : pandas.Series
        Labels, one per sample.

    Returns
    -------
    dict
        Maps each label reported by ``train.value_counts()`` to its count
        divided by ``len(train)``. Empty input gives an empty dict.
    """
    # Count once; the previous loop re-ran value_counts() for every label.
    counts = train.value_counts()
    total = len(train)
    return {label: count / total for label, count in counts.items()}
# Share of each label among the training labels of the current split.
prob = calculate_category_prob(y_train)
print('Category probabilities: ',prob)


Accuracy: 0.9440715883668904
               precision    recall  f1-score   support

     business       0.88      0.95      0.92       103
entertainment       0.96      0.98      0.97        89
     politics       0.99      0.81      0.89        81
        sport       0.98      1.00      0.99        97
         tech       0.94      0.96      0.95        77

     accuracy                           0.94       447
    macro avg       0.95      0.94      0.94       447
 weighted avg       0.95      0.94      0.94       447

-------------------------------------------
Category probabilities:  {'sport': 0.23873441994247363, 'business': 0.2233940556088207, 'politics': 0.1850431447746884, 'entertainment': 0.17641418983700863, 'tech': 0.17641418983700863}


In [86]:
# Same experiment with 40% of the rows held out for testing.
# NOTE(review): the TF-ICF weights behind X were computed from all rows before
# splitting, so the test rows influenced the features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# Fit a Gaussian naive Bayes classifier and evaluate it on the held-out rows.
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Per-class precision / recall / F1.
print(metrics.classification_report(y_test, y_pred))
print('-------------------------------------------')

def calculate_category_prob(train):
    """Return the share of each label among the entries of ``train``.

    Parameters
    ----------
    train : pandas.Series
        Labels, one per sample.

    Returns
    -------
    dict
        Maps each label reported by ``train.value_counts()`` to its count
        divided by ``len(train)``. Empty input gives an empty dict.
    """
    # Count once; the previous loop re-ran value_counts() for every label.
    counts = train.value_counts()
    total = len(train)
    return {label: count / total for label, count in counts.items()}
# Share of each label among the training labels of the current split.
prob = calculate_category_prob(y_train)
print('Category probabilities: ',prob)


Accuracy: 0.9328859060402684
               precision    recall  f1-score   support

     business       0.89      0.90      0.89       145
entertainment       0.95      0.97      0.96       116
     politics       0.93      0.83      0.88       103
        sport       0.98      1.00      0.99       131
         tech       0.91      0.95      0.93       101

     accuracy                           0.93       596
    macro avg       0.93      0.93      0.93       596
 weighted avg       0.93      0.93      0.93       596

-------------------------------------------
Category probabilities:  {'sport': 0.24049217002237136, 'business': 0.21364653243847875, 'politics': 0.1912751677852349, 'tech': 0.1789709172259508, 'entertainment': 0.1756152125279642}


In [87]:
# Same experiment with 20% of the rows held out for testing.
# NOTE(review): the TF-ICF weights behind X were computed from all rows before
# splitting, so the test rows influenced the features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit a Gaussian naive Bayes classifier and evaluate it on the held-out rows.
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Per-class precision / recall / F1.
print(metrics.classification_report(y_test, y_pred))
print('-------------------------------------------')

def calculate_category_prob(train):
    """Return the share of each label among the entries of ``train``.

    Parameters
    ----------
    train : pandas.Series
        Labels, one per sample.

    Returns
    -------
    dict
        Maps each label reported by ``train.value_counts()`` to its count
        divided by ``len(train)``. Empty input gives an empty dict.
    """
    # Count once; the previous loop re-ran value_counts() for every label.
    counts = train.value_counts()
    total = len(train)
    return {label: count / total for label, count in counts.items()}
# Share of each label among the training labels of the current split.
prob = calculate_category_prob(y_train)
print('Category probabilities: ',prob)


Accuracy: 0.959731543624161
               precision    recall  f1-score   support

     business       0.92      0.95      0.94        64
entertainment       0.98      0.97      0.98        63
     politics       0.98      0.87      0.92        53
        sport       0.98      1.00      0.99        65
         tech       0.93      1.00      0.96        53

     accuracy                           0.96       298
    macro avg       0.96      0.96      0.96       298
 weighted avg       0.96      0.96      0.96       298

-------------------------------------------
Category probabilities:  {'sport': 0.23573825503355705, 'business': 0.22818791946308725, 'politics': 0.18540268456375839, 'entertainment': 0.1761744966442953, 'tech': 0.174496644295302}


In [92]:
# Denominator per category: for every article labelled with that category, add
# up all of its score columns (everything except 'Answer').
# NOTE(review): this sums the scores of *all* category columns of those
# articles, not only the column of the category itself, so the ratios below
# are not guaranteed to sum to 1 within a category. Confirm this is the
# intended normalisation.
category_totals = {}
for scores in tf_icf_scores.values():
    category = scores['Answer']
    category_totals.setdefault(category, 0)
    for feature, value in scores.items():
        if feature != 'Answer':
            category_totals[category] += value

# word -> {category -> TF-ICF weight of the word in that category / category total}
# A word with no weight for a category gets 0, as does any category whose
# total is not positive.
feature_prob = {}
for feature, weights in word_dict.items():
    feature_prob[feature] = {}
    for category in class_count:
        category_total = category_totals[category]
        if category_total > 0:
            feature_prob[feature][category] = weights.get(category, 0) / category_total
        else:
            feature_prob[feature][category] = 0

# Preview a few entries instead of printing the whole vocabulary-sized dict.
dict(list(feature_prob.items())[:5])




In [91]:
# Display the word -> {category -> ratio} mapping built in the feature_prob cell.
# NOTE(review): this renders one entry per vocabulary word; consider showing a slice.
feature_prob

{'worldcom': {'business': 0.00017450708082539628,
  'tech': 0.0,
  'politics': 0.0,
  'sport': 0.0,
  'entertainment': 0.0},
 'ex-boss': {'business': 6.463225215755417e-06,
  'tech': 0.0,
  'politics': 0.0,
  'sport': 0.0,
  'entertainment': 0.0},
 'launch': {'business': 0.0,
  'tech': 0.0,
  'politics': 0.0,
  'sport': 0.0,
  'entertainment': 0.0},
 'defence': {'business': 0.0,
  'tech': 0.0,
  'politics': 0.0,
  'sport': 0.0,
  'entertainment': 0.0},
 'lawyer': {'business': 0.0,
  'tech': 0.0,
  'politics': 0.0,
  'sport': 0.0,
  'entertainment': 0.0},
 'defending': {'business': 8.961060358060046e-07,
  'tech': 2.5544160657162264e-07,
  'politics': 1.2912802669279867e-06,
  'sport': 8.656976244339831e-06,
  'entertainment': 0.0},
 'former': {'business': 0.0,
  'tech': 0.0,
  'politics': 0.0,
  'sport': 0.0,
  'entertainment': 0.0},
 'chief': {'business': 0.0,
  'tech': 0.0,
  'politics': 0.0,
  'sport': 0.0,
  'entertainment': 0.0},
 'bernie': {'business': 2.5852900863021668e-05,
  '