In [2]:
import kagglehub

path = kagglehub.dataset_download("adisongoh/it-service-ticket-classification-dataset")

print(path)

C:\Users\hiron\.cache\kagglehub\datasets\adisongoh\it-service-ticket-classification-dataset\versions\1


In [5]:
import pandas as pd
import spacy
import re

df = pd.read_csv("all_tickets_processed_improved_v3.csv")

# if fail python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Define a cleaning function
def clean_text(text):
    # remove non letter
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    doc = nlp(text)
    # lemmatize and remove stopwords/punctuation/single characters
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop and len(token) > 1]
    return " ".join(tokens)


df["cleaned_text"] = df["Document"].apply(clean_text)
print(df[["Document", "cleaned_text", "Topic_group"]].head())


                                            Document  \
0  connection with icon icon dear please setup ic...   
1  work experience user work experience user hi w...   
2  requesting for meeting requesting meeting hi p...   
3  reset passwords for external accounts re expir...   

                                        cleaned_text    Topic_group  
0  connection icon icon dear setup icon icon engi...       Hardware  
1  work experience user work experience user hi w...         Access  
2  request meeting request meeting hi help follow...       Hardware  
3  reset password external account expire day hi ...         Access  


In [7]:
texts = df["Document"].fillna("").tolist()

# Batch process with nlp.pipe
cleaned_texts = []
for doc in nlp.pipe(texts, batch_size=50, n_process=4):  # Set n_process >1 if spaCy v3 and multiprocessing is supported
    # Lowercase and remove non-letter characters
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop and len(token) > 1]
    cleaned_texts.append(" ".join(tokens))

# Add back to dataframe
df["cleaned_text_bis"] = cleaned_texts

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer(max_features=5000)  # You can tweak max_features
X = vectorizer.fit_transform(df["cleaned_text"])

# labels
y = df["Topic_group"]

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


                       precision    recall  f1-score   support

               Access       0.91      0.87      0.89      1455
Administrative rights       0.88      0.67      0.76       342
           HR Support       0.83      0.82      0.83      2107
             Hardware       0.79      0.88      0.83      2760
     Internal Project       0.91      0.78      0.84       451
        Miscellaneous       0.79      0.83      0.81      1400
             Purchase       0.97      0.88      0.92       497
              Storage       0.92      0.84      0.87       556

             accuracy                           0.84      9568
            macro avg       0.88      0.82      0.84      9568
         weighted avg       0.84      0.84      0.84      9568



In [10]:
import joblib

# save classifier and vectorizer
joblib.dump(clf, "ticket_classifier_api/model/classifier.pkl")
joblib.dump(vectorizer, "ticket_classifier_api/model/vectorizer.pkl")


['ticket_classifier_api/model/vectorizer.pkl']