In [3]:
import os
from tqdm.notebook import tqdm,tnrange
from bs4 import BeautifulSoup as bs
from nltk.stem import PorterStemmer,SnowballStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
#punkt and stopqwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/darshparikh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/darshparikh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df = pd.read_csv('BBC News Train.csv')
test = pd.read_csv('BBC News Test.csv')
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [7]:
len(df['Category'].value_counts())

5

In [17]:
def preprocess(text):
    text = str(text)
    text = text.lower()
    #perform tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # remove punctuations but not hyphen separated words
    tokens = [w for w in tokens if w.isalpha() or '-' in w]
    # remove words with numbers
    tokens = [w for w in tokens if not any(c.isdigit() for c in w)]
    # Remove blank space tokens
    tokens = [w for w in tokens if w.strip()]
    # remove single character tokens
    tokens = [w for w in tokens if len(w) > 1]

    #join tokens back to sentence
    text = ' '.join(tokens)
    
    #perform lemmatization
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(w) for w in text.split()])
    return text

    


In [18]:
df['Text'] = df['Text'].apply(preprocess)
text = df['Text']
text.head()

0    worldcom ex-boss launch defenc lawyer defend f...
1    german busi confid slide german busi confid fe...
2    bbc poll indic econom gloom citizen major nati...
3    lifestyl govern mobil choic faster better funk...
4    enron bos payout eighteen former enron directo...
Name: Text, dtype: object

In [30]:
word_dict = {}

# Loop through the dataset
for i in range(len(df)):
    # Get the text and category of the current row
    text = df['Text'].iloc[i]
    category = df['Category'].iloc[i]
    
    # Tokenize the text into words
    words = text.split()
    
    # Loop through the words and update the dictionary
    for word in words:
        if word not in word_dict:
            # If the word is not already in the dictionary, add it with an empty dictionary as its value
            word_dict[word] = {}
        
        if category not in word_dict[word]:
            # If the category is not already in the dictionary for the current word, add it with a count of 1
            word_dict[word][category] = 1
        else:
            # If the category is already in the dictionary for the current word, increment its count by 1
            word_dict[word][category] += 1

word_dict

{'worldcom': {'business': 54},
 'ex-boss': {'business': 2},
 'launch': {'business': 41,
  'tech': 120,
  'entertainment': 25,
  'sport': 8,
  'politics': 40},
 'defenc': {'business': 17,
  'sport': 37,
  'entertainment': 1,
  'tech': 9,
  'politics': 15},
 'lawyer': {'business': 26,
  'entertainment': 15,
  'tech': 8,
  'politics': 19,
  'sport': 7},
 'defend': {'business': 5,
  'sport': 75,
  'entertainment': 3,
  'tech': 8,
  'politics': 35},
 'former': {'business': 72,
  'entertainment': 52,
  'politics': 81,
  'sport': 85,
  'tech': 15},
 'chief': {'business': 154,
  'tech': 45,
  'sport': 24,
  'politics': 57,
  'entertainment': 15},
 'berni': {'business': 8},
 'ebber': {'business': 46},
 'batteri': {'business': 1, 'tech': 23, 'sport': 3, 'entertainment': 1},
 'fraud': {'business': 64, 'tech': 15, 'politics': 20, 'entertainment': 1},
 'charg': {'business': 69,
  'tech': 26,
  'entertainment': 22,
  'sport': 70,
  'politics': 38},
 'call': {'business': 88,
  'politics': 123,
  'ent

In [31]:
word_dict = {}

# Count the number of documents in each category
class_count = {}
for i in range(len(df)):
    category = df['Category'].iloc[i]
    if category not in class_count:
        class_count[category] = 1
    else:
        class_count[category] += 1

class_count

{'business': 336,
 'tech': 261,
 'politics': 274,
 'sport': 346,
 'entertainment': 273}

In [34]:
word_dict = {}
class_count = {}
for i in range(len(df)):
    category = df['Category'].iloc[i]
    if category not in class_count:
        class_count[category] = 1
    else:
        class_count[category] += 1

# Loop through the dataset
for i in range(len(df)):
    # Get the text and category of the current row
    text = df['Text'].iloc[i]
    category = df['Category'].iloc[i]
    
    # Tokenize the text into words
    words = text.split()
    
    # Loop through the words and update the dictionary
    for word in words:
        if word not in word_dict:
            # If the word is not already in the dictionary, add it with an empty dictionary as its value
            word_dict[word] = {}
        
        if category not in word_dict[word]:
            # If the category is not already in the dictionary for the current word, add it with a count of 1
            word_dict[word][category] = 1
        else:
            # If the category is already in the dictionary for the current word, increment its count by 1
            word_dict[word][category] += 1

# Calculate the inverse class frequency for each word and category
for word in word_dict:
    cf = len(word_dict[word])
    icf = math.log10(len(class_count) / cf)
    word_dict[word]['ICF'] = icf

# Calculate the TF-ICF score for each word and category
for word in word_dict:
    for category in word_dict[word]:
        if category != 'ICF':
            tf = word_dict[word][category]
            icf = word_dict[word]['ICF']
            tf_icf = tf * icf
            word_dict[word][category] = tf_icf

In [36]:
print(word_dict)

{'worldcom': {'business': 37.74438023414502, 'ICF': 0.6989700043360189}, 'ex-boss': {'business': 1.3979400086720377, 'ICF': 0.6989700043360189}, 'launch': {'business': 0.0, 'tech': 0.0, 'entertainment': 0.0, 'sport': 0.0, 'politics': 0.0, 'ICF': 0.0}, 'defenc': {'business': 0.0, 'sport': 0.0, 'entertainment': 0.0, 'tech': 0.0, 'politics': 0.0, 'ICF': 0.0}, 'lawyer': {'business': 0.0, 'entertainment': 0.0, 'tech': 0.0, 'politics': 0.0, 'sport': 0.0, 'ICF': 0.0}, 'defend': {'business': 0.0, 'sport': 0.0, 'entertainment': 0.0, 'tech': 0.0, 'politics': 0.0, 'ICF': 0.0}, 'former': {'business': 0.0, 'entertainment': 0.0, 'politics': 0.0, 'sport': 0.0, 'tech': 0.0, 'ICF': 0.0}, 'chief': {'business': 0.0, 'tech': 0.0, 'sport': 0.0, 'politics': 0.0, 'entertainment': 0.0, 'ICF': 0.0}, 'berni': {'business': 5.591760034688151, 'ICF': 0.6989700043360189}, 'ebber': {'business': 32.15262019945687, 'ICF': 0.6989700043360189}, 'batteri': {'business': 0.09691001300805642, 'tech': 2.2289302991852975, 'sp