In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd

In [2]:
# Reading the dataset
src_file = 'dataset.csv'
dataframe = pd.read_csv(src_file, encoding="utf8", quotechar="\"", engine='python', usecols=["TITLE", "CATEGORY"])

# Data Preprocessing

## Cleaning Data

In [3]:
# Checking for missing data
if any(dataframe.isnull().any()):
    print('Missing Data\n')
    print(dataframe.isnull().sum())
else:
    print('No missing data')

Missing Data

TITLE         0
CATEGORY    141
dtype: int64


In [4]:
# Checking for duplicate rows
if any(dataframe.duplicated()):
    print('Duplicate rows found')
    print('Number of duplicate rows= ', dataframe[dataframe.duplicated()].shape[0])
    dataframe.drop_duplicates(inplace=True, keep='first')
    dataframe.reset_index(inplace=True, drop=True)
    print('Dropping duplicates\n')
    print(dataframe.shape)
else:
    print('No duplicate data')

Duplicate rows found
Number of duplicate rows=  113
Dropping duplicates

(11823, 2)


## Data Preprocessing Pipeline

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
from sklearn import set_config

# Set sklearn config to output Pandas DataFrame
set_config(transform_output="pandas")
wnl = WordNetLemmatizer()


# Function for cleaning and tokenize the headline
def tokenize(doc):
    document = doc.lower()  # convert the content of the headline to lowercase
    document = re.sub(r'\d+', '', document)  # remove all the digits inside the content (using regular expressions)
    document = document.translate(str.maketrans('', '', string.punctuation))  # remove the punctuations (, . ! # ...)
    document = document.strip()  # remove the spaces at the start and end of the headline
    return [wnl.lemmatize(token) for token in word_tokenize(document) if token not in stopwords.words('english')]
    # tokenize the headlines
    # and then filter only the words that are not in the english stopwords (words that are commonly used and give no benefits to the classifier)
    # and finally templatize all the tokens


# Preprocessing Pipeline
preprocessor = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize, token_pattern=None)),
])

# Transforming the dataset using TF-IDF
tfidf_dataset = preprocessor.fit_transform(dataframe["TITLE"].values)

# Training Model

## Label encoder

In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
class_label = le.fit_transform(dataframe["CATEGORY"])

## Train-Test Split

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


X_train, X_test, y_train, y_test = train_test_split(
    tfidf_dataset.toarray(),
    class_label,
    test_size=0.3  # the size of the testing dataset (in percentage between 0 and 1)
)

## Decision Tree Classifier

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Create a Decision Tree classifier
DTClass = DecisionTreeClassifier(criterion="gini", splitter="best", random_state=42)

# Train the classifier on the training dataset
DTClass.fit(X_train, y_train)

# Make predictions on the test data
y_pred_dt = DTClass.predict(X_test)

# Evaluating Decision Tree Model
print("Accuracy score of Decision Tree:", accuracy_score(y_test, y_pred_dt))
print("Precision score of Decision Tree:", precision_score(y_test, y_pred_dt, average='weighted', zero_division=1))
print("Recall score of Decision Tree:", recall_score(y_test, y_pred_dt, average='weighted', zero_division=1))
print("F1 score of Decision Tree:", f1_score(y_test, y_pred_dt, average='weighted', zero_division=1))


Accuracy score of Decision Tree: 0.3831406822667043
Precision score of Decision Tree: 0.386602608018235
Recall score of Decision Tree: 0.3831406822667043
F1 score of Decision Tree: 0.40764947400367807


In [9]:
print(classification_report(y_test, y_pred_dt, zero_division=1))

              precision    recall  f1-score   support

           0       0.00      0.00      1.00         2
           1       1.00      0.00      0.00         3
           2       1.00      0.00      0.00         2
           3       0.28      0.16      0.21        43
           4       0.52      0.71      0.60        17
           5       1.00      0.00      0.00         2
           6       1.00      0.12      0.22         8
           7       1.00      0.00      0.00         1
           8       0.00      0.00      1.00         6
           9       0.00      0.00      1.00         2
          10       0.50      0.67      0.57         3
          11       1.00      0.00      0.00         1
          12       0.50      0.45      0.48        11
          13       0.41      0.32      0.36        63
          14       0.45      0.50      0.48        10
          15       0.00      0.00      1.00         2
          16       0.00      1.00      0.00         0
          17       1.00    

## Naive Bayes Classifier

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Create a Multinomial Naive Bayes classifier
NBClass = MultinomialNB()

# Train the classifier on the training dataset
NBClass.fit(X_train, y_train)

# Make predictions on the test data
y_pred_nb = NBClass.predict(X_test)

# Evaluating Naive Bayes Model
print("Accuracy score of Naive Bayes:", accuracy_score(y_test, y_pred_nb))
print("Precision score of Naive Bayes:", precision_score(y_test, y_pred_nb, average='weighted', zero_division=1))
print("Recall score of Naive Bayes:", recall_score(y_test, y_pred_nb, average='weighted', zero_division=1))
print("F1 score of Naive Bayes:", f1_score(y_test, y_pred_nb, average='weighted', zero_division=1))


Accuracy score of Naive Bayes: 0.28813081477304764
Precision score of Naive Bayes: 0.7179151969708233
Recall score of Naive Bayes: 0.28813081477304764
F1 score of Naive Bayes: 0.20320948016608492


In [11]:
print(classification_report(y_test, y_pred_nb, zero_division=1))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00         2
           1       1.00      0.00      0.00         3
           2       1.00      0.00      0.00         2
           3       1.00      0.00      0.00        43
           4       1.00      0.00      0.00        17
           5       1.00      0.00      0.00         2
           6       1.00      0.00      0.00         8
           7       1.00      0.00      0.00         1
           8       1.00      0.00      0.00         6
           9       1.00      0.00      0.00         2
          10       1.00      0.00      0.00         3
          11       1.00      0.00      0.00         1
          12       1.00      0.00      0.00        11
          13       1.00      0.00      0.00        63
          14       1.00      0.00      0.00        10
          15       1.00      0.00      0.00         2
          17       1.00      0.00      0.00         1
          18       1.00    

## Neural Network Classifier

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Build the neural network model
model = Sequential()
# Input layer
model.add(Dense(16, activation='relu', input_shape=(X_train.shape[1],)))
# Hidden layers
model.add(Dense(32, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
# Output layer
num_classes = len(np.unique(class_label))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

# Make predictions on the test data
y_pred_nn = np.argmax(model.predict(X_test), axis=-1)

# Evaluating Neural Network Model
print("Accuracy score of Neural Network:", accuracy_score(y_test, y_pred_nn))
print("Precision score of Neural Network:", precision_score(y_test, y_pred_nn, average='weighted', zero_division=1))
print("Recall score of Neural Network:", recall_score(y_test, y_pred_nn, average='weighted', zero_division=1))
print("F1 score of Neural Network:", f1_score(y_test, y_pred_nn, average='weighted', zero_division=1))




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy score of Neural Network: 0.21962221595714687
Precision score of Neural Network: 0.3374365345105932
Recall score of Neural Network: 0.21962221595714687
F1 score of Neural Network: 0.3772620767134382


In [13]:
print(classification_report(y_test, y_pred_nn, zero_division=1))

              precision    recall  f1-score   support

           0       0.00      0.00      1.00         2
           1       0.00      0.00      1.00         3
           2       1.00      0.00      0.00         2
           3       0.25      0.09      0.14        43
           4       0.00      0.00      1.00        17
           5       1.00      0.00      0.00         2
           6       0.00      0.00      1.00         8
           7       1.00      0.00      0.00         1
           8       1.00      0.00      0.00         6
           9       0.00      0.00      1.00         2
          10       0.00      0.00      1.00         3
          11       1.00      0.00      0.00         1
          12       0.00      0.00      1.00        11
          13       0.19      0.16      0.17        63
          14       1.00      0.10      0.18        10
          15       0.00      0.00      1.00         2
          17       1.00      0.00      0.00         1
          18       1.00    