In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the ticket data from CSV.
df = pd.read_csv("../data/processed/tickets_clean.csv")

# Model input is the ticket text column; the target is the category column.
X, y = df["ticket_text"], df["category"]

# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# TF-IDF over unigrams and bigrams, with English stop words removed and the
# vocabulary capped at 5000 features.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words="english",
)

# The vectorizer is fitted on the training split only; the test split is
# transformed with the already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [5]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Regularization strength used for the final model.
# NOTE(review): presumably chosen by an earlier tuning step that is not
# visible here — confirm where this value comes from.
best_C = 1

# Train the final linear SVM on the TF-IDF training matrix.
# random_state is pinned (same seed value as the train/test split) so that the
# solver's pseudo-random data shuffling, and therefore the fitted model and
# the metrics printed below, are reproducible across re-runs.
final_model = LinearSVC(C=best_C, random_state=42)
final_model.fit(X_train_tfidf, y_train)

# Predict labels for the held-out test matrix.
final_pred = final_model.predict(X_test_tfidf)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
print("Final Accuracy:", accuracy_score(y_test, final_pred))
print(classification_report(y_test, final_pred))

Final Accuracy: 0.9531823745410036
              precision    recall  f1-score   support

     Account       0.97      0.72      0.83        43
     Billing       0.97      0.96      0.96       184
     General       0.93      0.94      0.93      1024
   Technical       0.96      0.97      0.97      2017

    accuracy                           0.95      3268
   macro avg       0.96      0.90      0.92      3268
weighted avg       0.95      0.95      0.95      3268



In [7]:
import joblib
from pathlib import Path

# Create the output directory if it is not there yet; an existing directory
# is left untouched.
models_dir = Path("../models")
models_dir.mkdir(exist_ok=True)

# Persist the fitted vectorizer and the trained classifier as two separate
# joblib files. The second call is the cell's last expression, so its return
# value (the list of written file names) is displayed.
joblib.dump(value=tfidf, filename="../models/tfidf.joblib")
joblib.dump(value=final_model, filename="../models/ticket_classifier.joblib")

['../models/ticket_classifier.joblib']

In [8]:
# Round-trip check: reload the persisted artifacts from disk and classify a
# single example ticket with them.
sample_text = ["Unable to login after password reset"]

# Reload the vectorizer and turn the sample text into a feature matrix.
loaded_tfidf = joblib.load("../models/tfidf.joblib")
sample_vec = loaded_tfidf.transform(sample_text)

# Reload the classifier and predict the category of the sample.
loaded_model = joblib.load("../models/ticket_classifier.joblib")
prediction = loaded_model.predict(sample_vec)

# Display the predicted label array as the cell output.
prediction

array(['Account'], dtype=object)