In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from Model import SoftmaxRegressionCustom
from Data_Preprocessing import TextClassifier
import sys

# Load the dataset and pick out the text column and the target column.
df = pd.read_csv('Data.csv')
X, y = df['Summary'], df['Priority']

# Fit the text classifier on the full dataset, then save it to disk.
text_classifier = TextClassifier()
text_classifier.fit(X, y)
text_classifier.save('priority_prediction.pkl')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Run every training summary through the classifier's preprocessing step.
X_preprocessed = list(map(text_classifier.preprocess, X_train))

# Fit a new TF-IDF vectorizer (capped at 5000 features) on the training text
# and fit the classifier's label encoder on the training labels.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_preprocessed)
y_encoded = text_classifier.label_encoder.fit_transform(y_train)

# Train the custom softmax regression model on dense features with one-hot
# encoded targets.
softmax_model = SoftmaxRegressionCustom()
y_one_hot = pd.get_dummies(y_encoded).values
softmax_model.fit(X_train_tfidf.toarray(), y_one_hot)

# Predictions on the training set, used for the train-side report.
y_pred_softmax = softmax_model.predict(X_train_tfidf.toarray())

# Write the train and test evaluation reports to output.txt.
#
# redirect_stdout is used instead of assigning sys.stdout by hand: it puts
# back whichever stream was active before the block (in a notebook that is the
# kernel's own stream, not sys.__stdout__), and it does so even when one of
# the calls below raises, so stdout is never left bound to a closed file.
from contextlib import redirect_stdout

with open('output.txt', 'w') as f, redirect_stdout(f):
    # Training-set metrics.
    print(f"Custom Softmax Regression Train Accuracy: {softmax_model.accuracy(y_encoded, y_pred_softmax):.4f}")
    print(f"Classification Report:\n{classification_report(y_encoded, y_pred_softmax, target_names=text_classifier.label_encoder.classes_)}")

    # Prepare the held-out test set with the already-fitted vectorizer and
    # label encoder (transform only, no refitting).
    X_test_preprocessed = [text_classifier.preprocess(text) for text in X_test]
    X_test_tfidf = vectorizer.transform(X_test_preprocessed)
    y_test_encoded = text_classifier.label_encoder.transform(y_test)

    # Custom Softmax Regression predictions on the test set.
    y_pred_softmax_test = softmax_model.predict(X_test_tfidf.toarray())

    # Test-set metrics.
    print(f"Custom Softmax Regression Test Accuracy: {softmax_model.accuracy(y_test_encoded, y_pred_softmax_test):.4f}")
    print(f"Classification Report:\n{classification_report(y_test_encoded, y_pred_softmax_test, target_names=text_classifier.label_encoder.classes_)}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ascom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ascom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ascom\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Iteration 0: Cost 0.9584
Iteration 100: Cost 0.2954
Iteration 200: Cost 0.2318
Iteration 300: Cost 0.1989
Iteration 400: Cost 0.1776
Iteration 500: Cost 0.1621
Iteration 600: Cost 0.1501
Iteration 700: Cost 0.1404
Iteration 800: Cost 0.1323
Iteration 900: Cost 0.1255
Iteration 999: Cost 0.1196
