In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/it-service-ticket-classification-dataset/all_tickets_processed_improved_v3.csv


# Loading data

In [2]:
import pandas as pd
import numpy as np

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("adisongoh/it-service-ticket-classification-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/it-service-ticket-classification-dataset


In [4]:
#reading df
df = pd.read_csv('/kaggle/input/it-service-ticket-classification-dataset/all_tickets_processed_improved_v3.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47837 entries, 0 to 47836
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Document     47837 non-null  object
 1   Topic_group  47837 non-null  object
dtypes: object(2)
memory usage: 747.6+ KB


In [6]:
#quick preview of the df
df.head()

Unnamed: 0,Document,Topic_group
0,connection with icon icon dear please setup ic...,Hardware
1,work experience user work experience user hi w...,Access
2,requesting for meeting requesting meeting hi p...,Hardware
3,reset passwords for external accounts re expir...,Access
4,mail verification warning hi has got attached ...,Miscellaneous


In [7]:
#viewing unique values (categories) in Topic_group
df['Topic_group'].nunique()
df['Topic_group'].unique()

array(['Hardware', 'Access', 'Miscellaneous', 'HR Support', 'Purchase',
       'Administrative rights', 'Storage', 'Internal Project'],
      dtype=object)

In [8]:
#checking na
df.isnull().sum()

Document       0
Topic_group    0
dtype: int64

In [9]:
#checking duplicates
df.duplicated().sum()

0

# Text Preprocessing

In [10]:
import re
import string
from bs4 import BeautifulSoup

In [11]:
def cleaned_text (text):
    text = text.lower() #converting to lower case
    text = re.sub(r'\d+', '', text) #removing numbers
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    #removing punctuation
    text = re.sub(r'\W', ' ', text) #removing special characters
    text = BeautifulSoup(text, "html.parser").get_text() #removing HTML tags
    return text

In [12]:
df['Cleaned_Document'] = df['Document'].apply(cleaned_text)

In [13]:
import nltk

#specifying the path to the already downloaded punkt dataset and other packages
nltk.data.path.append('/kaggle/input/nltk-datasets/nltk_data')

In [14]:
from nltk.tokenize import word_tokenize

In [15]:
#splitting text into words
df['Cleaned_Document'] = df['Cleaned_Document'].apply(word_tokenize)

In [16]:
df['Cleaned_Document']

0        [connection, with, icon, icon, dear, please, s...
1        [work, experience, user, work, experience, use...
2        [requesting, for, meeting, requesting, meeting...
3        [reset, passwords, for, external, accounts, re...
                               ...                        
47832    [git, space, for, a, project, issues, with, ad...
47833    [error, sent, july, error, hi, guys, can, you,...
47834    [connection, issues, sent, tuesday, july, conn...
47835    [error, cube, reports, sent, tuesday, july, er...
47836    [running, out, on, extensions, hello, please, ...
Name: Cleaned_Document, Length: 47837, dtype: object

In [17]:
#removing stop-words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print(len(stop_words)) #checking that stop-words are downloaded

198


In [18]:
df['Cleaned_Document'] = df['Cleaned_Document'].apply(
    lambda toks: [w for w in toks if w not in stop_words]
)

In [19]:
df['Cleaned_Document']

0        [connection, icon, icon, dear, please, setup, ...
1        [work, experience, user, work, experience, use...
2        [requesting, meeting, requesting, meeting, hi,...
3        [reset, passwords, external, accounts, expire,...
                               ...                        
47832    [git, space, project, issues, adding, users, s...
47833    [error, sent, july, error, hi, guys, help, err...
47834    [connection, issues, sent, tuesday, july, conn...
47835    [error, cube, reports, sent, tuesday, july, er...
47836    [running, extensions, hello, please, advised, ...
Name: Cleaned_Document, Length: 47837, dtype: object

In [20]:
#lemmatization
from nltk.stem import PorterStemmer, WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['Cleaned_Document'] = df['Cleaned_Document'].apply(lambda doc: [lemmatizer.lemmatize(word) for word in doc])

In [21]:
df['Cleaned_Document'].head(20)

0     [connection, icon, icon, dear, please, setup, ...
1     [work, experience, user, work, experience, use...
2     [requesting, meeting, requesting, meeting, hi,...
3     [reset, password, external, account, expire, d...
5     [mail, please, dear, look, blacklisted, receiv...
6     [prod, server, tunneling, prod, tunneling, va,...
7     [access, request, dear, module, report, report...
8     [reset, password, client, password, client, de...
9     [direct, report, missing, time, please, action...
10    [laptop, connected, request, rebuild, connecte...
11    [device, recovery, report, printer, alert, dup...
12            [new, starter, hello, please, fill, date]
13    [visual, studio, license, visual, studio, lice...
14    [system, hello, movement, left, available, dev...
15    [access, secondary, secondary, hi, please, pro...
16    [list, sent, copy, copy, hi, receive, copy, be...
17    [new, purchase, po, purchase, po, dear, purcha...
18    [invitation, cloud, strategy, workshop, fe

In [22]:
df['Cleaned_Document'].tail(20)

47817    [submit, expense, even, though, added, electro...
47818    [job, referral, available, sent, thursday, jul...
47819    [oracle, update, another, bug, sent, thursday,...
47820    [social, error, sent, thursday, july, error, d...
47821    [backup, server, performance, issue, sent, thu...
47822    [approving, expense, project, code, related, w...
47823    [bug, sent, wednesday, july, bug, hello, disco...
47824    [oracle, fusion, working, message, unplanned, ...
47825    [issue, sent, friday, july, hi, issue, somethi...
47826    [oracle, error, sent, thursday, july, error, h...
47827    [bandwidth, problem, sent, thursday, july, ban...
47828    [password, expire, day, sent, july, password, ...
47829    [sensitive, information, access, sent, wednesd...
47830    [malfunction, sent, tuesday, july, malfunction...
47831    [outlook, account, laptop, laptop, sa, sa, si,...
47832    [git, space, project, issue, adding, user, sen...
47833    [error, sent, july, error, hi, guy, help, erro.

As I can see, besides stop words, the corpus contains many words like 'please', 'hi', 'hello', etc. These words can affect the efficiency of the algorithm, so I would remove them manually.

In [23]:
custom_stopwords = ['hi', 'hello', 'dear', 'please', 'help', 'guy', 'guys',
                   'thanks', 'thank', 'regards', 'best', 'hey', 'appreciate',
                    'sorry', 'welcome', 'cheers', 'hello', 'goodbye', 'bye',
                    'kindly', 'gladly']

df['Cleaned_Document'] = df['Cleaned_Document'].apply(
    lambda doc: [word for word in doc if word not in custom_stopwords]
)

In [24]:
df['Cleaned_Document']

0        [connection, icon, icon, setup, icon, per, ico...
1        [work, experience, user, work, experience, use...
2        [requesting, meeting, requesting, meeting, fol...
3        [reset, password, external, account, expire, d...
                               ...                        
47832    [git, space, project, issue, adding, user, sen...
47833    [error, sent, july, error, error, appearing, w...
47834    [connection, issue, sent, tuesday, july, conne...
47835    [error, cube, report, sent, tuesday, july, err...
47836    [running, extension, advised, running, extensi...
Name: Cleaned_Document, Length: 47837, dtype: object

In [25]:
#%pip install contractions
#import contractions

contraction_map = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who's": "who is",
    "who'll": "who will",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

def expand_contractions_tokenized(doc):
    return [contraction_map.get(word, word) for word in doc]

df['Cleaned_Document'] = df['Cleaned_Document'].apply(expand_contractions_tokenized)

In [26]:
df['Cleaned_Document'] 

0        [connection, icon, icon, setup, icon, per, ico...
1        [work, experience, user, work, experience, use...
2        [requesting, meeting, requesting, meeting, fol...
3        [reset, password, external, account, expire, d...
                               ...                        
47832    [git, space, project, issue, adding, user, sen...
47833    [error, sent, july, error, error, appearing, w...
47834    [connection, issue, sent, tuesday, july, conne...
47835    [error, cube, report, sent, tuesday, july, err...
47836    [running, extension, advised, running, extensi...
Name: Cleaned_Document, Length: 47837, dtype: object

In [27]:
#saving dataset
df['Cleaned_Document_Str'] = df['Cleaned_Document'].apply(lambda tokens: ' '.join(tokens))
df.to_csv('cleaned_dataset.csv', index=False)

# Classification

In [28]:
df.head()

Unnamed: 0,Document,Topic_group,Cleaned_Document,Cleaned_Document_Str
0,connection with icon icon dear please setup ic...,Hardware,"[connection, icon, icon, setup, icon, per, ico...",connection icon icon setup icon per icon engin...
1,work experience user work experience user hi w...,Access,"[work, experience, user, work, experience, use...",work experience user work experience user work...
2,requesting for meeting requesting meeting hi p...,Hardware,"[requesting, meeting, requesting, meeting, fol...",requesting meeting requesting meeting follow e...
3,reset passwords for external accounts re expir...,Access,"[reset, password, external, account, expire, d...",reset password external account expire day ask...
4,mail verification warning hi has got attached ...,Miscellaneous,"[mail, verification, warning, got, attached, a...",mail verification warning got attached address...


In [29]:
df['Topic_group'].unique()

array(['Hardware', 'Access', 'Miscellaneous', 'HR Support', 'Purchase',
       'Administrative rights', 'Storage', 'Internal Project'],
      dtype=object)

## Logistic Regression

In [30]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, ngram_range=(1,2))
X = vectorizer.fit_transform(df['Cleaned_Document_Str'])

In [31]:
#encoding target variable
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['Topic_group'])

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89      1455
           1       0.92      0.63      0.75       342
           2       0.85      0.85      0.85      2107
           3       0.79      0.89      0.84      2760
           4       0.93      0.77      0.84       451
           5       0.82      0.84      0.83      1400
           6       0.98      0.87      0.92       497
           7       0.93      0.82      0.87       556

    accuracy                           0.85      9568
   macro avg       0.89      0.82      0.85      9568
weighted avg       0.86      0.85      0.85      9568



## Decision Trees / Random Forest / XGBoost:

## RNN

## BERT