In [1]:
import os
import joblib
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
import re
def clean_text(text):
  text = text.lower()  # Convert to lowercase
  text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and non-alphanumeric characters
  text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
  text = re.sub(r'\d+', '', text)  # Remove numbers
  return text 

In [3]:
from PyPDF2 import PdfReader
from docx import Document
def read_document(file_path, filename):
    # Text extraction based on file extension
    if filename.endswith(".pdf"):
        with open(file_path, 'rb') as pdf_file:
            # Use PyPDF2 for PDF text extraction
            pdf_reader = PdfReader(pdf_file)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text()

    elif filename.endswith(".docx"):
        # Use python-docx for Word document text extraction
        doc = Document(file_path)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text

    elif filename.endswith(".txt"):
        # Read text directly for TXT files
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            
    else:
        # Skip unsupported file formats
        return None
    
    # Preprocessing steps (clean text, lowercase, etc.)
    text = clean_text(text) # Implement your cleaning function here
    return text

In [4]:
data_dir = r"D:\Documents\USM\USM_NotesExercises\Year 4 Sem 1\CAT405\Logistic Regression Model\Dataset" # Replace with your data directory path
documents = []
labels = []
for class_dir in os.listdir(data_dir):
    class_path = os.path.join(data_dir, class_dir)
    
    if os.path.isdir(class_path):
        for filename in os.listdir(class_path):
            file_path = os.path.join(class_path, filename)
            text = read_document(file_path, filename)
            
            # Skip unsupported file formats
            if text == None:
                continue

            documents.append(text)
            labels.append([filename, class_dir])

In [5]:
# Load the dataset
# Create a DataFrame from the collected data
df = pd.DataFrame(labels, columns=['Doc Name', 'Class'])
df

Unnamed: 0,Doc Name,Class
0,Academic Calendar 2020_2021.pdf,Academic
1,Academic Calendar 2021_2022.pdf,Academic
2,Academic Calendar 2022_2023.pdf,Academic
3,Academic Calendar 2023_2024.pdf,Academic
4,Academic.pdf,Academic
...,...,...
609,Personnel_95.txt,Personnel
610,Personnel_96.txt,Personnel
611,Personnel_97.txt,Personnel
612,Personnel_98.txt,Personnel


In [6]:
# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(documents)
y = df['Class']

joblib.dump(vectorizer, 'TFIDFvectorizer.pkl')
vectorizer_dict = {
  "Vocabulary": {word: int(value) for word, value in vectorizer.vocabulary_.items()},
  "IDF": vectorizer.idf_.tolist(),
}
with open("TFIDFvectorizer.json", "w") as f:
  json.dump(vectorizer_dict, f)
# with open("TFIDFvectorizer.json", "wb") as f:
#   joblib.dump(vectorizer, f)
# print(vectorizer)
# Save the vectorizer to a file
# import msgpack
# vocabulary = {word: int(value) for word, value in vectorizer.vocabulary_.items()} # Convert numpy int32 values to regular integers
# with open("TFIDFvectorizer.msgpack", "wb") as f:
#     msgpack.pack((vocabulary), f)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=800)
trainedModel = model.fit(X_train, y_train)

# dense_features = X_train.toarray()
# model = model.fit(dense_features, y_train)

In [7]:
# Make predictions on the test set
y_pred = trainedModel.predict(X_test)

# Evaluate the model
print("Accuracy Score:", accuracy_score(y_test, y_pred), "\n")

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred), "\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy Score: 0.3170731707317073 

Confusion Matrix:
[[37  0  1  0  0]
 [14  0  0  0  0]
 [22  0  2  0  0]
 [17  0  0  0  0]
 [29  0  1  0  0]] 

Classification Report:
                precision    recall  f1-score   support

      Academic       0.31      0.97      0.47        38
Administrative       0.00      0.00      0.00        14
 Co-curricular       0.50      0.08      0.14        24
     Financial       0.00      0.00      0.00        17
     Personnel       0.00      0.00      0.00        30

      accuracy                           0.32       123
     macro avg       0.16      0.21      0.12       123
  weighted avg       0.19      0.32      0.17       123



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Use the model for prediction
input_file_directory = r"D:\Documents\USM\USM_NotesExercises\Year 4 Sem 1\CAT405\dataset\Food\food_1.txt"
input_file_name = input_file_directory.split("\\")[-1]
new_document = read_document(input_file_directory, input_file_name)
new_features = vectorizer.transform([new_document])
prediction = trainedModel.predict(new_features)
prediction_proba = trainedModel.predict_proba(new_features)

# Function to get predicted classes exceeding the threshold
def get_predicted_classes(probabilities):
    threshold = 0.1
    class_labels = df['Class'].unique()
    top_classes = [class_labels[i] for i, p in enumerate(probabilities[0]) if p > threshold]
    return top_classes

predicted_classes = get_predicted_classes(prediction_proba)

print("=====================================================================")
print("Predicted Class   :", prediction[0])
print("Predicted Classes :", predicted_classes, "\n")
print("~~~~~~~~~~ Predicted Probability ~~~~~~~~~~")
print("Predicted probability:")
print("Academic\t:", prediction_proba[0][0])
print("Administrative\t:", prediction_proba[0][1])
print("Co-curricular\t\t:", prediction_proba[0][2])
print("Financial\t:", prediction_proba[0][3])
print("Personnel\t:", prediction_proba[0][4])
# print("Medical\t\t:", prediction_proba[0][5])
# print("Politics\t:", prediction_proba[0][6])
# print("Space\t\t:", prediction_proba[0][7])
# print("Sport\t\t:", prediction_proba[0][8])
# print("Technologie\t:", prediction_proba[0][9])
print("=====================================================================\n")

Predicted Class   : Academic
Predicted Classes : ['Academic', 'Administrative', 'Co-curricular'] 

~~~~~~~~~~ Predicted Probability ~~~~~~~~~~
Predicted probability:
Business	: 0.6746917259678679
Entertainment	: 0.1317899152055091
Food		: 0.12092210251531037
Graphics	: 0.03794974219735643
Historical	: 0.03464651411395627


IndexError: index 5 is out of bounds for axis 0 with size 5

In [None]:
# import joblib
# joblib.dump(model, 'DocClassificationLrModel.pkl')

In [None]:
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn

# Convert the model to ONNX
# initial_type = [('features', onnx.TensorProto.FLOAT)] # Use onnx.TensorProto.FLOAT32
initial_type = [('features', FloatTensorType([None, None]))]
onx = convert_sklearn(trainedModel, initial_types=initial_type)

# Save the converted model
with open("DocClassificationLrModel.onnx", "wb") as f:
    f.write(onx.SerializeToString())

print("Pipeline conversion complete!")

# pklmodel = joblib.load(r"D:\Documents\USM\USM_NotesExercises\Year 4 Sem 1\CAT405\Logistic Regression Model\DocClassificationLrModel.pkl")
# onnx.save_model(pklmodel, "DocClassificationLrModel.onnx")

Pipeline conversion complete!
