In [1]:
import pandas as pd
import numpy as np
import re
import string
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from scipy.sparse import hstack
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

df = pd.read_excel("../data/ai_dev_assignment_tickets_complex_1000.xls")
df['urgency_level'].value_counts()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\thesh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


urgency_level
High      330
Medium    319
Low       299
Name: count, dtype: int64

In [2]:
stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    tokens = [lemma.lemmatize(w) for w in tokens if w not in stop_words]
    return ' '.join(tokens)

df['clean_text'] = df['ticket_text'].apply(preprocess_text)

df.dropna(subset=['issue_type', 'urgency_level'], inplace=True)

def detect_urgency(text):
    text = str(text).lower()
    keywords = [
        'urgent', 'immediately', 'asap', 'now', 'today', 'soon',
        'as soon as possible', 'need it now', 'right away',
        'quickly', 'replace immediately', 'emergency'
    ]
    return int(any(kw in text for kw in keywords))


In [3]:
issue_encoder = LabelEncoder()
urgency_encoder = LabelEncoder()

df['issue_label'] = issue_encoder.fit_transform(df['issue_type'])
df['urgency_label'] = urgency_encoder.fit_transform(df['urgency_level'])

df['ticket_length'] = df['clean_text'].apply(lambda x: len(x.split()))
df['has_negative_word'] = df['clean_text'].apply(lambda x: 1 if any(w in x for w in ['broken', 'error', 'late', 'refund']) else 0)
df['is_urgent_keyword'] = df['ticket_text'].apply(detect_urgency)
df['word_count'] = df['clean_text'].apply(lambda x: len(str(x).split()))

In [4]:
tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df['clean_text'])
X_extra = df[['ticket_length', 'has_negative_word', 'is_urgent_keyword', 'word_count']].values
X_combined = hstack([X_tfidf, X_extra])

X_train, X_test, y_train, y_test = train_test_split(X_combined, df[['issue_label', 'urgency_label']], test_size=0.2, random_state=42)

y_issue_train = y_train['issue_label']
y_issue_test = y_test['issue_label']
y_urgency_train = y_train['urgency_label']
y_urgency_test = y_test['urgency_label']

issue_model = RandomForestClassifier(n_estimators=100, random_state=42)
urgency_model = RandomForestClassifier(n_estimators=100, random_state=42)

issue_model.fit(X_train, y_issue_train)
urgency_model.fit(X_train, y_urgency_train)

In [5]:
# STEP 6: Evaluation

print("Issue Type Classification Report")
print(classification_report(y_issue_test, issue_model.predict(X_test)))

print("Urgency Level Classification Report")
print(classification_report(y_urgency_test, urgency_model.predict(X_test)))


Issue Type Classification Report
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        35
           1       1.00      0.91      0.95        32
           2       0.74      1.00      0.85        29
           3       1.00      0.92      0.96        24
           4       1.00      0.90      0.95        20
           5       1.00      0.95      0.97        19
           6       1.00      0.94      0.97        17

    accuracy                           0.94       176
   macro avg       0.96      0.94      0.95       176
weighted avg       0.96      0.94      0.95       176

Urgency Level Classification Report
              precision    recall  f1-score   support

           0       0.30      0.28      0.29        60
           1       0.24      0.24      0.24        54
           2       0.34      0.35      0.35        62

    accuracy                           0.30       176
   macro avg       0.29      0.29      0.29       176
weighted

In [6]:
# STEP 7: Save Models
with open('../models/issue_type_model.pkl', 'wb') as f:
    pickle.dump(issue_model, f)

with open('../models/urgency_level_model.pkl', 'wb') as f:
    pickle.dump(urgency_model, f)

with open('../models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

with open('../models/issue_encoder.pkl', 'wb') as f:
    pickle.dump(issue_encoder, f)

with open('../models/urgency_encoder.pkl', 'wb') as f:
    pickle.dump(urgency_encoder, f)

In [8]:
def extract_entities(text):
    entities = {}
    text = text.lower()
    entities['products'] = re.findall(r"\b(?:laptop|phone|charger|tablet)\b", text)
    entities['dates'] = re.findall(r"\b(?:\d{1,2}/\d{1,2}/\d{2,4})\b", text)
    entities['complaint_keywords'] = [word for word in ['broken', 'late', 'error', 'refund'] if word in text]
    return entities

def process_ticket(ticket_text):
    clean = preprocess_text(ticket_text)
    tfidf_feat = tfidf.transform([clean])
    extra_feat = np.array([[len(clean.split()), int(any(w in clean for w in ['broken', 'error', 'late', 'refund'])), detect_urgency(ticket_text), len(clean.split())]])
    final_feat = hstack([tfidf_feat, extra_feat])

    issue_pred = issue_encoder.inverse_transform(issue_model.predict(final_feat))[0]
    urgency_pred = urgency_encoder.inverse_transform(urgency_model.predict(final_feat))[0]
    entities = extract_entities(ticket_text)

    print("TF-IDF shape:", tfidf_feat.shape)
    print("Extra feature shape:", extra_feat.shape)
    print("Combined shape:", final_feat.shape)

    return {
        "issue_type": issue_pred,
        "urgency_level": urgency_pred,
        "extracted_entities": entities
    }

# üîç Test Example
test_input = "My phone stopped working and I need a refund. It was delivered late."
print(process_ticket(test_input))



TF-IDF shape: (1, 549)
Extra feature shape: (1, 4)
Combined shape: (1, 553)
{'issue_type': 'Product Defect', 'urgency_level': 'Low', 'extracted_entities': {'products': ['phone'], 'dates': [], 'complaint_keywords': ['late', 'refund']}}
