In [None]:
#NAIVE BAYES pipeline + TF-IDF

#Ticket Routing with FastAPI

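# 'Youre File Here' is a placeholder: replace it with the path to your own file (e.g. Data.csv)
# 'Column' / 'Your Column Here' is a placeholder: replace it with the column name you want to use,
# such as 'ID', 'groups', etc.
# Data is not included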


In [None]:
#Imports
import pandas as pd
import threading
import pickle
import uvicorn
import uuid
import json
import nest_asyncio
import os
import matplotlib.pyplot as plt
import random
import numpy as np
import joblib
from pathlib import Path
from fastapi import FastAPI, HTTPException
from fastapi import UploadFile, File, BackgroundTasks
from pydantic import BaseModel
from typing import List
from datetime import datetime
from openpyxl import load_workbook
from openpyxl.styles import PatternFill, Font
from openpyxl.utils import get_column_letter
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# File path of the raw CSV that the load/cleaning cells below read.
# "Youre File Here" is a placeholder — replace it with the path to your own data file.
file_path = r"Youre File Here"

In [None]:
# Load the raw data from `file_path` and preview the first 10 rows
# (the trailing expression is what the notebook renders as the cell output).
df_raw = pd.read_csv(file_path)
df_raw.head(10)

In [None]:
# Cleaning the data: rows with NaNs, special characters, duplicate rows.
# The steps are chained so each one operates on the result of the previous one;
# `df_raw` itself is left untouched so the cell can be re-run safely.

# Confirm the state before cleaning
print("Missing values before cleaning: ")
print(df_raw.isna().sum())
print("Duplicate rows before cleaning:", df_raw.duplicated().sum())
print("Original rows:", len(df_raw))
print("Columns before removing duplicates:", len(df_raw.columns))

df_cleaned = (
    df_raw
    # remove rows with NaNs
    .dropna()
    # remove special characters (keeps word characters, whitespace and . , -)
    .replace(r'[^\w\s.,-]', '', regex=True)
    # remove duplicates last, so rows that only became identical once the
    # special characters were stripped are dropped as well
    .drop_duplicates()
)

# Confirm the state after cleaning
print("\nMissing values after cleaning: ")
print(df_cleaned.isna().sum())
print("Cleaned rows:", len(df_cleaned))
print("Duplicate rows after cleaning:", df_cleaned.duplicated().sum())
print("Columns after removing duplicates:", len(df_cleaned.columns))

In [None]:
# Export the cleaned frame to CSV; index=False keeps the row index out of the file.
# "Youre File Here" is a placeholder — replace it with the destination path.
output_path = "Youre File Here"
df_cleaned.to_csv(output_path, index=False)
print(f"Cleaned data successfully saved to '{output_path}'")

In [None]:
# Pull only the columns the model needs: short_description, assignment_group, priority.
# (The pipeline treats all three as text — confirm against your own data.)
df = pd.read_csv(r"Youre File Here")

# .copy() makes df_cleaned an independent frame rather than a view-like slice of df.
df_cleaned = df[['short_description', 'assignment_group', 'priority']].copy()

output_path = "Youre File Here"
df_cleaned.to_csv(output_path, index=False)
print(f"Cleaned data successfully saved to '{output_path}'")

# Preview: must be the last expression in the cell for the notebook to render it.
df_cleaned.head(10)

In [None]:
# Imbalanced-data check: print the relative frequency of every value in each column.
# 'Your Column Here' is a placeholder for the column name to inspect.
for heading, column_name in (
    ("This is the short description column: ", 'Your Column Here'),    # Balanced
    ("\nThis is the assignment_group column: ", 'Your Column Here'),   # unBalanced
):
    print(heading)
    print(df[column_name].value_counts(normalize=True))

In [None]:
# Load the data that is about to be balanced.
# "Youre File Here" is a placeholder — replace it with the path to the pre-balancing CSV.
df_before = pd.read_csv(r"Youre File Here")

# Step 1: Extract X and y from the cleaned DataFrame.
# X is the concatenation of two text columns; missing text is blanked first so it does
# not end up as the literal token "nan" after astype(str).
# 'Column' is a placeholder — replace each occurrence with the real column name.
X_raw = (
    df_before['Column'].fillna(' ').astype(str)
    + '  '
    + df_before['Column'].fillna(' ').astype(str)
)
y_raw = df_before['Column'].astype(str)

# Step 2: Encode the target classes as integers
le_target = LabelEncoder()
y_labels = le_target.fit_transform(y_raw)

# Save the fitted encoder so predictions can be decoded later
joblib.dump(le_target, r"Youre File Here")

# Step 3: Count samples per encoded class
label_counts = Counter(y_labels)

# Step 4: Keep only classes with at least `min_samples` rows; the stratified split
# and SMOTE below both need several examples of every class.
min_samples = 4
valid_classes = [label for label, count in label_counts.items() if count >= min_samples]
mask = np.isin(y_labels, valid_classes)

# Step 5: Apply the same mask to features, encoded labels and raw labels
X_filtered = X_raw[mask]
y_filtered = y_labels[mask]
y_raw_filtered = y_raw[mask]  # original label strings, used for the plot below

# Step 6: Split BEFORE SMOTE so no synthetic samples leak into the test set
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42, stratify=y_filtered
)

# Step 7: TF-IDF vectorization (vocabulary is learned from the training split only)
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 8: Plot the class distribution of the filtered classes
plt.figure(figsize=(9, 7))
y_raw_filtered.value_counts().plot(kind='bar', color='red')
plt.xticks(rotation=90)
plt.locator_params(axis='x', nbins=50)
plt.title('Class Distribution Before SMOTE')
plt.xlabel('Youre File Here')
plt.ylabel('Number of Tickets')
plt.tight_layout()
plt.show()




In [None]:
# Absolute number of rows per value in the chosen column of the pre-balancing data.
# 'Youre File Here' is a placeholder for the column name.
# Optional: uncomment the next line to limit how many rows pandas prints.
#pd.set_option('display.max_rows', 10)
print(df_before['Youre File Here'].value_counts())


In [None]:
# Apply SMOTE to balance the classes of the TRAINING split only.
# X_train_vec is the TF-IDF matrix produced by vectorizer.fit_transform and
# y_train holds the LabelEncoder-encoded class ids.

# 1. Apply SMOTE
smote = SMOTE(random_state=42, k_neighbors=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vec, y_train)

# 2. Decode the resampled labels back to the original label strings
y_decoded = le_target.inverse_transform(y_train_resampled)

# 3. Build a DataFrame with one column per TF-IDF term.
# TfidfVectorizer output is a SciPy sparse matrix; pd.DataFrame(...) does not expand
# it into feature columns, so use the sparse accessor's constructor instead.
df_resampled = pd.DataFrame.sparse.from_spmatrix(
    X_train_resampled,
    columns=vectorizer.get_feature_names_out(),
)
# 'Your File Here' is a placeholder for the name of the label column.
df_resampled['Your File Here'] = y_decoded

# 4. Collect the 10 most frequent classes after resampling (plotted in a later cell)
resampled_counts = Counter(y_decoded)
top_10 = resampled_counts.most_common(10)

labels, counts = zip(*top_10)

In [None]:
# Save the resampled training data to CSV.
# index=False matches the other exports in this notebook and keeps a spurious
# "Unnamed: 0" column from appearing when the file is read back.
df_resampled.to_csv(r"Youre File Here", index=False)
print("Successfully Saved")

In [None]:
# Reload the resampled data set that was saved to disk.
df_complete = pd.read_csv(r"Youre File Here")

# Bar chart of the 10 most frequent classes after SMOTE.
# `labels` and `counts` come from the SMOTE cell (top-10 of the resampled labels).
plt.figure(figsize=(8, 5))
plt.bar(labels, counts, color='purple')
plt.xlabel("Youre File Here")
plt.ylabel("Number of Tickets")
plt.title("Class Distribution After SMOTE")
plt.xticks(rotation=90, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Naive Bayes on TF-IDF features:
# train a Multinomial Naive Bayes classifier on the SMOTE-resampled training data.
nb_model = MultinomialNB()
nb_model.fit(X_train_resampled, y_train_resampled)  #Train

In [None]:
# Prediction: evaluate on the held-out test split, which was vectorized but never resampled.
# y_test / y_pred hold LabelEncoder codes, so the report rows are keyed by those integer
# codes; use le_target.inverse_transform to map them back to the label strings.
y_pred = nb_model.predict(X_test_vec)
print(classification_report(y_test, y_pred))

In [None]:
import joblib
from datetime import datetime

# === Setup ===
# Base names for the saved artifacts and the folders they are written to.
# "Youre File Here" is a placeholder — replace each one with a real folder path.
model_name = "Department Parser full"
vectorizer_name = "Vectorizer"

model_folder = r"Youre File Here"
vectorizer_folder = r"Youre File Here"
json_only_folder = r"Youre File Here"

# Make sure every output folder exists before anything is written
for _out_dir in (model_folder, vectorizer_folder, json_only_folder):
    os.makedirs(_out_dir, exist_ok=True)

# === Timestamp shared by both files so a model/vectorizer pair can be matched up ===
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# === Save the Model ===
model_filename = "{}_{}.pkl".format(model_name, timestamp)
model_path = os.path.join(model_folder, model_filename)
joblib.dump(nb_model, model_path)
print("Model saved to: {}".format(model_path))

# === Save the Vectorizer ===
vectorizer_filename = "{}_{}.pkl".format(vectorizer_name, timestamp)
vectorizer_path = os.path.join(vectorizer_folder, vectorizer_filename)
joblib.dump(vectorizer, vectorizer_path)
print("Vectorizer saved to: {}".format(vectorizer_path))

In [None]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Dict
import joblib
from pathlib import Path
import os
import nest_asyncio
import uvicorn

nest_asyncio.apply()  # For notebook use: lets uvicorn run inside the already-running event loop

# === Folder paths ===
# "Youre File Here" is a placeholder — replace each one with a real folder path.
# NOTE: get_latest_file() picks the newest *.pkl in a folder, so the model and the
# vectorizer must live in different folders for the right artifact to be loaded.
MODEL_FOLDER = Path(r"Youre File Here")
VECTORIZER_FOLDER = Path(r"Youre File Here")

# parents=True also creates missing intermediate directories
MODEL_FOLDER.mkdir(parents=True, exist_ok=True)
VECTORIZER_FOLDER.mkdir(parents=True, exist_ok=True)

# === Find the most recently modified .pkl file ===
def get_latest_file(folder: Path):
    # Returns the *.pkl path directly inside `folder` with the largest modification
    # time, or None when the folder contains no *.pkl entry.
    newest_path = None
    newest_mtime = None
    for candidate in folder.glob("*.pkl"):
        candidate_mtime = candidate.stat().st_mtime
        # strict '>' keeps the first candidate seen when modification times tie
        if newest_path is None or candidate_mtime > newest_mtime:
            newest_path = candidate
            newest_mtime = candidate_mtime
    return newest_path

# === Load model ===
# Uses the most recently modified .pkl in MODEL_FOLDER; stops with FileNotFoundError if none exists.
model_path = get_latest_file(MODEL_FOLDER)
if model_path is None:
    raise FileNotFoundError("No model file found!")
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source.
model = joblib.load(model_path)

# === Load vectorizer ===
# Same lookup as above, against VECTORIZER_FOLDER.
vectorizer_path = get_latest_file(VECTORIZER_FOLDER)
if vectorizer_path is None:
    raise FileNotFoundError(" No vectorizer file found!")
vectorizer = joblib.load(vectorizer_path)

# === FastAPI ===
app = FastAPI(title="Ticket Department Classifier API")

# Encoder used by /predict to turn predicted class ids back into label strings.
# "Youre File Here" is a placeholder — presumably the file written by the training
# cell's joblib.dump(le_target, ...); confirm the path matches.
label_encoder = joblib.load(r"Youre File Here")

from typing import List, Union

# Request body for POST /predict. Both fields are optional and default to None;
# the endpoint uses `short_description` when it is non-empty, otherwise `short_descriptions`.
class PredictRequest(BaseModel):
    # a single ticket description
    short_description: Union[str, None] = None
    # a batch of ticket descriptions
    short_descriptions: Union[List[str], None] = None

@app.get("/")
def root():
    # Liveness endpoint: a fixed payload showing that the API process is answering.
    status_payload = {"message": "Model API is up!"}
    return status_payload

@app.post("/predict")
def predict(request: PredictRequest) -> Dict[str, Union[str, List[str]]]:
    """Classify one or more ticket descriptions.

    Uses `request.short_description` when it is non-empty, otherwise
    `request.short_descriptions`.

    Returns:
        {"Column": <label>} when exactly one description was classified,
        {"Column": [<label>, ...]} otherwise. "Column" is a placeholder key.

    Raises:
        HTTPException(400): neither field was provided (or both were empty).
        HTTPException(500): vectorizing, predicting or decoding failed.
    """
    # Handle single or multiple inputs
    if request.short_description:
        descriptions = [request.short_description]
    elif request.short_descriptions:
        descriptions = request.short_descriptions
    else:
        # Missing input is the caller's mistake, so report it as a client error
        # instead of letting it surface as an internal server error.
        raise HTTPException(
            status_code=400,
            detail="You must provide either 'short_description' or 'short_descriptions'",
        )

    try:
        # Vectorize and predict
        transformed = vectorizer.transform(descriptions)
        predictions = model.predict(transformed)
        departments = label_encoder.inverse_transform(predictions).tolist()
    except Exception as e:
        print(f"Internal server error: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Return single or multiple results
    if len(departments) == 1:
        return {"Column": departments[0]}
    return {"Column": departments}

# Start the API server on 127.0.0.1:8000 when this code runs as the main module
# (e.g. `python script.py`).
# NOTE(review): inside a Jupyter kernel __name__ is also "__main__", so this cell starts the
# server there too and keeps running until interrupted — presumably why
# nest_asyncio.apply() is called earlier; confirm this is intended.
if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000)