# Import Libraries

In [None]:
import pandas as pd
import numpy as np

# Read the dataset and load it into a DataFrame

In [None]:
# Location of the CSV file that holds the dataset
src_file = 'dataset.csv'

# Load only the headline text and its category label into a DataFrame
dataframe = pd.read_csv(
    src_file,
    usecols=["TITLE", "CATEGORY"],
    encoding="utf8",
    quotechar="\"",
    engine='python',
)

# Data Preprocessing

In [None]:
# Count the missing values in each column and report them if there are any.
# NOTE(review): rows with missing values are only reported here, not removed.
missing_per_column = dataframe.isnull().sum()
if missing_per_column.sum() == 0:
    print('No missing data')
else:
    print('Missing Data\n')
    print(missing_per_column)

In [None]:
# Detect fully duplicated rows and keep only the first occurrence of each
duplicate_mask = dataframe.duplicated()
if not duplicate_mask.any():
    print('No duplicate data')
else:
    print('Duplicate rows found')
    print('Number of duplicate rows= ', int(duplicate_mask.sum()))
    # Drop in place and renumber the index from 0 in the same call
    dataframe.drop_duplicates(keep='first', inplace=True, ignore_index=True)
    print('Dropping duplicates\n')
    print(dataframe.shape)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
from sklearn import set_config

# Ask scikit-learn to return pandas containers from transformers.
# NOTE(review): unclear whether this setting affects the TF-IDF output, which
# is later converted with .toarray() — confirm it is actually needed.
set_config(transform_output="pandas")

# Single WordNet lemmatizer instance, reused by tokenize() for every token
wnl = WordNetLemmatizer()


# Function for cleaning and tokenizing a headline
_ENGLISH_STOPWORDS = None  # lazily built cache, filled on the first tokenize() call


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def tokenize(doc):
    """Clean a headline and split it into lemmatized, stopword-free tokens.

    Steps, in order: lowercase the text, delete every run of digits, delete
    all characters in ``string.punctuation``, strip leading/trailing
    whitespace, tokenize with ``word_tokenize``, discard tokens found in the
    NLTK English stopword list, and lemmatize the remaining tokens.

    Args:
        doc: The headline text (a ``str``).

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    document = doc.lower()  # case-insensitive matching
    document = re.sub(r'\d+', '', document)  # drop all digits
    document = document.translate(str.maketrans('', '', string.punctuation))  # drop punctuation (, . ! # ...)
    document = document.strip()  # drop whitespace at both ends

    # Fetch the stopwords once per call (cached across calls) instead of
    # reloading and linearly scanning the list for every single token.
    stop_words = _english_stopwords()
    return [wnl.lemmatize(token) for token in word_tokenize(document) if token not in stop_words]


# Preprocessing pipeline: a single TF-IDF step that delegates tokenization to
# tokenize(); token_pattern=None because the custom tokenizer is used instead.
headline_vectorizer = TfidfVectorizer(token_pattern=None, tokenizer=tokenize)
preprocessor = Pipeline(steps=[('tfidf', headline_vectorizer)])

# Fit the vectorizer on every headline and build the TF-IDF matrix.
# NOTE(review): this is fitted on the full dataset, before the train/test split.
headlines = dataframe["TITLE"].values
tfidf_dataset = preprocessor.fit_transform(headlines)

# Training Model

## Label encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the category names as integer class ids
categories = dataframe["CATEGORY"]
le = LabelEncoder()
class_label = le.fit_transform(categories)

# Notebook inspection: only the last expression of a cell is displayed
list(le.classes_)
class_label

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Hold out 30% of the samples for testing. The split is seeded so that reruns
# produce the same partition and the reported scores are comparable; the seed
# matches the one used for the decision tree classifier.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_dataset.toarray(),  # convert the TF-IDF matrix to a dense array
    class_label,
    test_size=0.3,  # the size of the testing dataset (fraction between 0 and 1)
    random_state=42,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Build a Gini-criterion decision tree (seeded) and fit it on the training split
DTClass = DecisionTreeClassifier(random_state=42, splitter="best", criterion="gini")
DTClass.fit(X_train, y_train)

# Predict the held-out split and report the accuracy
y_pred_dt = DTClass.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Accuracy score of Decision Tree:", dt_accuracy, sep="\n")

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Build a Multinomial Naive Bayes classifier and fit it on the training split
NBClass = MultinomialNB()
NBClass.fit(X_train, y_train)

# Predict the held-out split and report the accuracy
y_pred_nb = NBClass.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Accuracy score of Naive Bayes:", nb_accuracy, sep="\n")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# The output layer needs one unit per distinct encoded category
num_classes = len(np.unique(class_label))

# Feed-forward network: two ReLU hidden layers followed by a softmax output
model = Sequential([
    Dense(16, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# Labels are integer class ids, hence the sparse categorical cross-entropy loss
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train for 10 epochs, keeping 10% of the training data for validation
model.fit(X_train, y_train, validation_split=0.1, batch_size=32, epochs=10)

# Predicted class = index of the highest softmax output for each test sample
class_probabilities = model.predict(X_test)
y_pred_nn = np.argmax(class_probabilities, axis=-1)

# Report the accuracy on the held-out split
nn_accuracy = accuracy_score(y_test, y_pred_nn)
print("Accuracy score of Neural Network:", nn_accuracy, sep="\n")

In [None]:
# Classification report for the decision tree predictions on the test split.
# NOTE(review): zero_division=1 presumably reports undefined metrics as 1 — confirm this is intended.
print(classification_report(y_test, y_pred_dt, zero_division=1))

In [None]:
# Classification report for the Naive Bayes predictions on the test split.
# NOTE(review): zero_division=1 presumably reports undefined metrics as 1 — confirm this is intended.
print(classification_report(y_test, y_pred_nb, zero_division=1))

In [None]:
# Classification report for the neural network predictions on the test split.
# NOTE(review): zero_division=1 presumably reports undefined metrics as 1 — confirm this is intended.
print(classification_report(y_test, y_pred_nn, zero_division=1))