Import required libraries

In [46]:
import pickle
import re
import string
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



Download the NLTK resources needed for tokenization and stop-word removal

In [None]:
# Fetch the tokenizer models and the stop-word list that the text-cleaning
# step relies on; quiet=True suppresses the download log.
nltk.download(
    ['punkt', 'punkt_tab', 'stopwords'],
    quiet=True,
)

Load the pre-split train and test sets

In [48]:
# Read the already-split train and test CSV files straight from the project
# repository on GitHub.
train_df = pd.read_csv(
    'https://raw.githubusercontent.com/Jana-Liebenberg/2401PTDS_Classification_Project/main/Data/processed/train.csv',
    sep=',',
    encoding='utf-8',
)
test_df = pd.read_csv(
    'https://raw.githubusercontent.com/Jana-Liebenberg/2401PTDS_Classification_Project/main/Data/processed/test.csv',
    sep=',',
    encoding='utf-8',
)

View the category distribution of the train and test sets

In [None]:
# Give each split its own axes. Calling .plot() twice without `ax=` draws both
# bar charts on the same implicit axes, so the bars overlap and neither
# distribution can be read.
fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(14, 5))

train_plot = train_df['category'].value_counts().plot(kind='bar', ax=train_ax)
train_plot.set(title='Train set: rows per category', xlabel='Category', ylabel='Number of rows')

test_plot = test_df['category'].value_counts().plot(kind='bar', ax=test_ax)
test_plot.set(title='Test set: rows per category', xlabel='Category', ylabel='Number of rows')

plt.tight_layout()
plt.show()

### 1) Format data
in the following order:
1) test on headline data given it is easier to split


In [50]:

def _tokens_to_drop():
    """Return the cached set of English stop words, punctuation marks and digits."""
    cached = getattr(_tokens_to_drop, '_cache', None)
    if cached is None:
        pun_nums = string.punctuation + "0123456789"
        # union() over a string adds each character as its own element.
        cached = set(stopwords.words('english')).union(pun_nums)
        # Built once and reused: this function is applied to every row of
        # several columns, and the set never changes between calls.
        _tokens_to_drop._cache = cached
    return cached


def remove_punctuation_numbers_and_stopwords(text):
    """
    Lower-case a piece of text and strip stop words, punctuation and digits.

    The text is tokenized with NLTK's ``word_tokenize``; tokens that are an
    English stop word, a single punctuation character or a single digit are
    dropped, the remaining tokens are joined with single spaces, and any digit
    characters still present (e.g. inside "2024" or "2nd") are then deleted.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN (e.g. an empty cell in a loaded CSV)
        are treated as empty text instead of raising ``AttributeError``;
        any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text. Because whole-number tokens are deleted after the
        join, the result may contain consecutive or trailing spaces.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()

    set_to_remove = _tokens_to_drop()
    word_tokens = word_tokenize(text)   # Tokenize text to find stopwords more easily

    filterd_text = " ".join(w for w in word_tokens if w not in set_to_remove)

    # Second pass: remove digits that are part of longer tokens.
    second_check = re.sub(r'\d', '', filterd_text)

    return second_check

In [51]:
# Remove punctuation, numbers and stop words from every free-text column of
# both splits, using the cleaning function declared earlier.
for text_column in ('headlines', 'description', 'content'):
    train_df[text_column] = train_df[text_column].apply(remove_punctuation_numbers_and_stopwords)
    test_df[text_column] = test_df[text_column].apply(remove_punctuation_numbers_and_stopwords)

### 2) Feature Engineering

In [None]:
# Target variables: the `category` column of each split.
y_train, y_test = train_df['category'], test_df['category']

y_train.shape

In [None]:
# Feature extraction: bag-of-words counts built with CountVectorizer.
def vectorize_data(column):
    """
    Build bag-of-words count features for one text column.

    A ``CountVectorizer`` (English stop words removed, ``min_df=0.01``) is
    fitted on ``train_df[column]`` and then applied, without refitting, to
    ``test_df[column]``.

    Parameters
    ----------
    column : str
        Name of the text column to vectorise; it must exist in both the
        global ``train_df`` and ``test_df`` frames.

    Returns
    -------
    X_train : numpy.ndarray
        Dense count matrix for the training split.
    X_test : numpy.ndarray
        Dense count matrix for the test split, using the training vocabulary.
    vocabulary : numpy.ndarray
        Feature names, as returned by ``get_feature_names_out()``.

    Side effects
    ------------
    Pickles the fitted vectoriser to ``.vectoriser_file/vect.pkl``. The same
    path is used on every call, so the file always holds the vectoriser from
    the most recent call.
    """
    vect = CountVectorizer(stop_words='english', min_df=0.01)
    X_train = vect.fit_transform(train_df[column]).toarray()       # .toarray() converts the sparse matrix into a dense numpy array
    X_test = vect.transform(test_df[column]).toarray()
    vocabulary = vect.get_feature_names_out()

    # Pickle the vectoriser. Create the target folder first so the write does
    # not fail with FileNotFoundError when the folder is absent.
    vectoriser_path = Path('.vectoriser_file/vect.pkl')
    vectoriser_path.parent.mkdir(parents=True, exist_ok=True)
    with open(vectoriser_path, 'wb') as file:
        pickle.dump(vect, file)

    return X_train, X_test, vocabulary

# Vectorise each text column separately; every call fits its own vectoriser.
(X_headlines_train,
 X_headlines_test,
 headlines_vocabulary) = vectorize_data('headlines')
(X_description_train,
 X_description_test,
 description_vocabulary) = vectorize_data('description')
(X_content_train,
 X_content_test,
 content_vocabulary) = vectorize_data('content')


# Quick sanity check on the shape of the headline feature matrices.
for split_label, feature_matrix in (
    ('X_headlines-train:', X_headlines_train),
    ('X_headlines-test:', X_headlines_test),
):
    print(split_label, feature_matrix.shape)

### 3) Model training  

In [None]:

def fit_random_forest(X_train, y_train, X_test, y_test, random_state=42):
    """
    Train a random forest classifier and evaluate it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``RandomForestClassifier.fit``.
    X_test, y_test
        Test features and the true labels used for scoring.
    random_state : int or None, default 42
        Seed forwarded to ``RandomForestClassifier`` so that repeated runs
        give the same forest and the same metrics. Pass ``None`` to get an
        unseeded (non-reproducible) forest.

    Returns
    -------
    rf_model : RandomForestClassifier
        The fitted model.
    rf_cm : numpy.ndarray
        Confusion matrix of true vs. predicted test labels.
    accuracy : float
        Accuracy on the test split.
    classification_report_str : str
        Text report from ``classification_report``.
    """
    rf_model = RandomForestClassifier(random_state=random_state)
    rf_model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = rf_model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Generate confusion matrix
    rf_cm = confusion_matrix(y_test, y_pred)

    # Generate classification report
    classification_report_str = classification_report(y_test, y_pred)

    return rf_model, rf_cm, accuracy, classification_report_str



# Train and evaluate the random forest on the `content` features.
rf_model, rf_cm, rf_accuracy, rf_report = fit_random_forest(X_content_train, y_train, X_content_test, y_test)

print('Random Forest Accuracy:', rf_accuracy)
print("\nClassification Report:\n", rf_report)

# Pickle the model. Create the target folder first so the write does not fail
# with FileNotFoundError when the folder is absent. `pickle` and `Path` are
# imported in the imports cell at the top of the notebook.
rf_model_path = Path('.models_file/rf_model.pkl')
rf_model_path.parent.mkdir(parents=True, exist_ok=True)
with open(rf_model_path, 'wb') as file:
    pickle.dump(rf_model, file)


In [None]:
def fit_multinomial_nb_multiclass(X_train, y_train, X_test, y_test):
    """
    Train a Multinomial Naive Bayes classifier and score it on the test split.

    Returns a 4-tuple: the fitted model, the confusion matrix of true vs.
    predicted test labels, the test accuracy, and the text classification
    report.
    """
    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)

    predicted_labels = classifier.predict(X_test)

    test_accuracy = accuracy_score(y_test, predicted_labels)
    confusion = confusion_matrix(y_test, predicted_labels)
    report_text = classification_report(y_test, predicted_labels)

    return classifier, confusion, test_accuracy, report_text



# Train and evaluate Multinomial Naive Bayes on the `content` features.
nb_model, nb_cm, nb_accuracy, nb_report = fit_multinomial_nb_multiclass(X_content_train, y_train, X_content_test, y_test)

print('Multinomial Naive Bayes Multi-Class Accuracy:', nb_accuracy)
print("\nClassification Report:\n", nb_report)

# Pickle the model. Create the target folder first so the write does not fail
# with FileNotFoundError when the folder is absent.
nb_model_path = Path('.models_file/nb_model.pkl')
nb_model_path.parent.mkdir(parents=True, exist_ok=True)
with open(nb_model_path, 'wb') as file:
    pickle.dump(nb_model, file)


In [None]:
# Lets get a visual breakdown:
 
def display_matrix_graph(class_names=None, rf_cm=None, nb_cm=None):
    """
    Plot the two confusion matrices side by side as annotated heatmaps.

    Parameters
    ----------
    class_names : list-like or None
        Tick labels for both axes of both heatmaps. They must be in the same
        order as the rows/columns of the matrices. When ``None``, seaborn's
        automatic tick labels are used.
    rf_cm : array-like of int
        Confusion matrix of the random forest (left panel, red palette).
    nb_cm : array-like of int
        Confusion matrix of the Naive Bayes model (right panel, blue palette).
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 8))  # One panel per model

    # None is not an accepted tick-label value for heatmap; fall back to the
    # library's "auto" setting so the declared default is usable.
    tick_labels = "auto" if class_names is None else class_names

    panels = (
        (rf_cm, "Reds", "Random Forest Confusion Matrix"),
        (nb_cm, "Blues", "Naive Bayes Confusion Matrix"),
    )
    for ax, (matrix, palette, title) in zip(axes, panels):
        sns.heatmap(matrix, annot=True, fmt="d", cmap=palette,
                    xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

# confusion_matrix orders its rows and columns by sorted label value, whereas
# Series.unique() returns labels in order of first appearance, which can put
# the wrong name on a row/column. np.unique returns the labels sorted.
# NOTE(review): assumes every predicted label also occurs in y_test, so the
# matrix has exactly one row per entry of class_names - confirm.
class_names = np.unique(y_test)
display_matrix_graph(class_names, rf_cm, nb_cm)

In [None]:
# Collect the vocabulary kept by each column's vectoriser. These are the
# feature names only; no frequency counts are attached.
vocabulary_map = dict(
    headlines=headlines_vocabulary,
    description=description_vocabulary,
    content=content_vocabulary,
)

vocabulary_map