In [1]:
# 📘 Train IT Helpdesk Ticket Classifier

import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import joblib
import os
from sklearn.linear_model import LogisticRegression
import re
import openai

In [2]:
#!pip install datasets
import pandas as pd

df = pd.read_csv("C:/Users/loyd/Desktop/projects/IT desk management system/it_helpdesk_classifier/data/it_service_ticket.csv")

df.head()


Unnamed: 0,Document,Topic_group
0,connection with icon icon dear please setup ic...,Hardware
1,work experience user work experience user hi w...,Access
2,requesting for meeting requesting meeting hi p...,Hardware
3,reset passwords for external accounts re expir...,Access
4,mail verification warning hi has got attached ...,Miscellaneous


In [3]:

# Fix column names
df.rename(columns={"Document": "issue", "Topic_group": "category"}, inplace=True)

In [4]:
# Drop rows with missing values
df.dropna(subset=["issue", "category"], inplace=True)

In [5]:
# Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\d+", "", text)  # remove numbers
    text = re.sub(r"\W+", " ", text)  # remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text
df["issue"] = df["issue"].str.strip().str.lower()
df["category"] = df["category"].str.strip()

# Preview
df.head()

Unnamed: 0,issue,category
0,connection with icon icon dear please setup ic...,Hardware
1,work experience user work experience user hi w...,Access
2,requesting for meeting requesting meeting hi p...,Hardware
3,reset passwords for external accounts re expir...,Access
4,mail verification warning hi has got attached ...,Miscellaneous


In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["label_id"] = le.fit_transform(df["category"])

In [7]:
X = df["issue"]
y = df["category"]

In [8]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from datasets import Dataset

# Prepare Hugging Face dataset
train_data = Dataset.from_pandas(pd.DataFrame({"text": X_train, "label": y_train}))
test_data = Dataset.from_pandas(pd.DataFrame({"text": X_test, "label": y_test}))

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Tokenizer function
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

train_data = train_data.map(tokenize, batched=True)
test_data = test_data.map(tokenize, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38269/38269 [00:08<00:00, 4373.67 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9568/9568 [00:02<00:00, 4581.25 examples/s]


In [11]:
train_data = train_data.map(tokenize, batched=True)
test_data = test_data.map(tokenize, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38269/38269 [00:08<00:00, 4403.42 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9568/9568 [00:02<00:00, 3885.02 examples/s]


In [12]:
# 6. Vectorize using TF-IDF (this converts text to numerical features)
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:

#. Create pipeline: TF-IDF + Naive Bayes
model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ('nb', MultinomialNB())
])

In [None]:
# 4. Vectorize text
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
model.fit(X_train, y_train)

In [None]:
# 5. Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:

new_text = ["My VPN is not working and I cannot connect to Outlook."]
prediction = model.predict(new_text)
print("Predicted category:", prediction[0])

In [None]:
# 6. Save Model
os.makedirs("../models", exist_ok=True)
joblib.dump(model, "../models/ticket_classifier.pkl")
joblib.dump(vectorizer, "../models/ticket_vectorizer.pkl")

In [None]:
# 7. Predict Example
sample = ["Wi-Fi keeps disconnecting when I use Zoom"]
predicted = model.predict(sample)
print(f"\n🧠 Prediction for '{sample[0]}': {predicted[0]}")