In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score, StratifiedKFold

import pickle


In [2]:
df = pd.read_csv('../csv/final_data.csv')
df.sample(3)

Unnamed: 0,label,text,priority,urgency,type,queue
6941,0,Infrequent user session expirations happening ...,low,not_urgent,,
5627,0,Data-driven alert thresholds not adjusting dyn...,low,not_urgent,,
9659,1,Attention: Requires Action. This is a at risk ...,high,urgent,,


# Functions

In [4]:
def get_train_test_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    return X_train, X_test, y_train, y_test

In [3]:
# Training the models
def train_model(model, tdidf_vectorizer, X_train, y_train, X_test, y_test):
    text_transformer = Pipeline([
    ('tfidf', tdidf_vectorizer)
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('text_tfidf', text_transformer, 'text')
        ],
    )
    pipeline =Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"Classification Report: \n{classification_report(y_test, y_pred)}")
    return pipeline

In [52]:
def test_model(model, threshold = 0.5):
    example_text = pd.DataFrame([
        'This is an urgent issue',
        'I need help with my account right now without delay',
        'I have a question about my account',
        'I cannot access my account. Please help me',
        'I am unable to access my account. I have tried resetting my password but it is not working. I need help urgently',
        "Query regarding my account. Please help me",
        "The server is down. Please fix it as soon as possible",
        "System Issue. Need help before the end of the day",
        "Jira tickets need to be updated. Please do it ASAP",
        "Not Critical. Please ensure system is up and running without any issues before the end of the day.",
        "this is a reminder to fill out your kaizen immediately by EOD",
        "What is the status of the server?",
        "I am having a headache and cannot focus on my work. I need to take a leave",
        "I need my salary for this month to be credited to my account",
        "how can i fix my PC not starting up, I have lots of work pending",
        "Can you fix this bug in the code? It is causing issues in the system",
         "i am having payment issue with my card.",
         "The client presentation is scheduled for 3 PM, but the slides haven\’t been finalized yet. Can you review them ASAP?",
         "Our code has been deployed to production, but the system is down. Can you investigate the issue?"
        ],
    columns=['text'])
    
    true_labels = [1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1]

    y_scores = model.predict_proba(example_text[['text']])[:, 1]
    threshold = threshold
    y_pred = (y_scores >= threshold).astype(int)

    print(f"Threshold: {threshold}")

    print(f"\nPredicted Labels: {y_pred.tolist()}")
    print(f"True Labels: {true_labels}")
    print(f"Classification Report: \n{classification_report(true_labels, y_pred, labels=[1, 0])}")

In [57]:
# save model
def save_model(model, filename):
    with open("../models/"+filename, 'wb') as file:
        pickle.dump(model, file)
    print(f"Model saved as {filename}")

# Process

In [5]:
X = df[['text']]
y = df['label']
X_train, X_test, y_train, y_test = get_train_test_data(X, y)

X_train shape: (9523, 1)
X_test shape: (2381, 1)


In [36]:
pipeline_naive_bayes = train_model(
    MultinomialNB(alpha=1),
    TfidfVectorizer(), 
    X_train,
    y_train,
    X_test,
    y_test
)

Classification Report: 
              precision    recall  f1-score   support

           0       0.80      0.92      0.85      1117
           1       0.92      0.79      0.85      1264

    accuracy                           0.85      2381
   macro avg       0.86      0.86      0.85      2381
weighted avg       0.86      0.85      0.85      2381



In [54]:
test_model(pipeline_naive_bayes, threshold=0.55)

Threshold: 0.55

Predicted Labels: [1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1]
True Labels: [1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1]
Classification Report: 
              precision    recall  f1-score   support

           1       0.93      0.93      0.93        15
           0       0.75      0.75      0.75         4

    accuracy                           0.89        19
   macro avg       0.84      0.84      0.84        19
weighted avg       0.89      0.89      0.89        19



In [58]:
# save the model to disk
save_model(pipeline_naive_bayes, 'naive_bayes_model.pkl')

Model saved as naive_bayes_model.pkl


In [29]:
# stacking with weighted average
def create_stacked_model(models, meta_model = LogisticRegression(random_state=42, max_iter=1000)):
    stacked_model = StackingClassifier(
        estimators=models,
        final_estimator=meta_model,
    )
    
    stacked_model.fit(X_train, y_train)
    y_pred = stacked_model.predict(X_test)
    print(f"Classification Report: \n{classification_report(y_test, y_pred)}")
    return stacked_model

models = [
    ('nb', MultinomialNB()),
    ('svm', SVC(probability=True))
]

stacked_model = create_stacked_model(models)


ValueError: could not convert string to float: 'Old company logo still appearing in transactional emails'