In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np

In [2]:
file_path = "ai_dev_assignment_tickets_complex_1000.xls"
df = pd.read_excel(file_path)
print(df.shape)
df.head()

(1000, 5)


Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2
1,2,Can you tell me more about the UltraClean Vacu...,General Inquiry,,UltraClean Vacuum
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam
4,5,Order #30903 for Vision LED TV is 13 days late...,Late Delivery,,Vision LED TV


## Data preprocessing

In [3]:
from preprocessing import preprocess_data
df = preprocess_data(file_path)
df

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\raunak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\raunak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product,cleaned_text
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2,payment issue smartwatch v2 underbilled order ...
1,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300,ordered soundwave three hundred got ecobreeze ...
2,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam,facing installation issue photosnap cam setup ...
3,6,Can you tell me more about the PhotoSnap Cam w...,General Inquiry,Medium,PhotoSnap Cam,tell photosnap cam warranty also available red
4,7,is malfunction. It stopped working after just...,Product Defect,Low,EcoBreeze AC,malfunction stopped working seven day
...,...,...,...,...,...,...
821,995,Is this item in stock?,General Inquiry,High,RoboChef Blender,item stock
822,996,I ordered EcoBreeze AC but got FitRun Treadmil...,Wrong Item,High,EcoBreeze AC,ordered ecobreeze ac got fitrun treadmill inst...
823,997,I ordered SoundWave 300 but got PowerMax Batte...,Wrong Item,Low,SoundWave 300,ordered soundwave three hundred got powermax b...
824,999,Payment issue fr mi SoundWave 300. I was debit...,Billing Problem,Low,SoundWave 300,payment issue fr mi soundwave 300 debited inco...


### Feature Engineering

In [4]:
from feature_engineering import extract_features
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X, tfidf_vectorizer = extract_features(df, vectorizer)
print(X.shape)
tfidf_vectorizer

[[ 0.  0.  0. ...  0.  0. 12.]
 [ 0.  0.  0. ...  0.  0. 14.]
 [ 0.  0.  0. ...  0.  0. 11.]
 ...
 [ 0.  0.  0. ...  0.  0. 25.]
 [ 0.  0.  0. ...  0.  0. 13.]
 [ 0.  0.  0. ...  0.  0. 11.]] TfidfVectorizer()
(826, 482)


In [7]:
df.urgency_level.unique()

array(['Medium', 'Low', 'High'], dtype=object)

In [8]:
df.issue_type.unique()

array(['Billing Problem', 'Wrong Item', 'Installation Issue',
       'General Inquiry', 'Product Defect', 'Late Delivery',
       'Account Access'], dtype=object)

In [9]:
y_issue_type = df["issue_type"]
y_urgency = df["urgency_level"]

### Label Encoder

In [10]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder = LabelEncoder()

y_issue_type_encoded = label_encoder.fit_transform(y_issue_type)
y_urgency_encoded = label_encoder.fit_transform(y_urgency)

In [11]:
df.columns

Index(['ticket_id', 'ticket_text', 'issue_type', 'urgency_level', 'product',
       'cleaned_text'],
      dtype='object')

In [12]:
X_train, X_test, y_issue_train, y_issue_test, y_urgency_train, y_urgency_test = train_test_split(
    X, y_issue_type_encoded, y_urgency_encoded, test_size=0.25, random_state=100
)

In [13]:
issue_model = LogisticRegression(random_state=50)
issue_model.fit(X_train, y_issue_train)

urgency_model = RandomForestClassifier(random_state=50)
urgency_model.fit(X_train, y_urgency_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
y_issue_predict = issue_model.predict(X_test)
print(accuracy_score(y_issue_predict, y_issue_test))
print(classification_report(y_issue_predict, y_issue_test))

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        44
           1       1.00      1.00      1.00        31
           2       1.00      1.00      1.00        30
           3       1.00      1.00      1.00        27
           4       1.00      1.00      1.00        19
           5       1.00      1.00      1.00        30
           6       1.00      1.00      1.00        26

    accuracy                           1.00       207
   macro avg       1.00      1.00      1.00       207
weighted avg       1.00      1.00      1.00       207



In [15]:
y_urgency_predict = urgency_model.predict(X_test)
print(accuracy_score(y_urgency_predict, y_urgency_test))
print(classification_report(y_urgency_predict, y_urgency_test))

0.28502415458937197
              precision    recall  f1-score   support

           0       0.42      0.31      0.36        91
           1       0.10      0.14      0.12        44
           2       0.30      0.35      0.32        72

    accuracy                           0.29       207
   macro avg       0.28      0.26      0.27       207
weighted avg       0.31      0.29      0.29       207



In [16]:
# Save the models and vectorizer
import joblib
joblib.dump(issue_model, 'models/issue_model.joblib')
joblib.dump(urgency_model, 'models/urgency_model.joblib')
joblib.dump(tfidf_vectorizer, 'models/tfidf_vectorizer.joblib')

print("\nModels and vectorizer have been saved in the 'models' directory.")


Models and vectorizer have been saved in the 'models' directory.


In [17]:
print(df['cleaned_text'].loc[:10])

0     payment issue smartwatch v2 underbilled order ...
1     ordered soundwave three hundred got ecobreeze ...
2     facing installation issue photosnap cam setup ...
3        tell photosnap cam warranty also available red
4                 malfunction stopped working seven day
5     facing installation issue robochef blender set...
6               order 34285 placed eighteen march still
7     powermax battery ecobreeze ac lost giving issu...
8     order 53356 robochef blender eighteen day late...
9     order 49712 robochef blender four day late ord...
10    received wrong product order mixed also contac...
Name: cleaned_text, dtype: object


###  Entity Extraction

In [18]:
df.columns

Index(['ticket_id', 'ticket_text', 'issue_type', 'urgency_level', 'product',
       'cleaned_text'],
      dtype='object')

In [19]:
df['issue_type'].unique()


array(['Billing Problem', 'Wrong Item', 'Installation Issue',
       'General Inquiry', 'Product Defect', 'Late Delivery',
       'Account Access'], dtype=object)

In [20]:
df['product'].unique()

array(['SmartWatch V2', 'SoundWave 300', 'PhotoSnap Cam', 'EcoBreeze AC',
       'RoboChef Blender', 'PowerMax Battery', 'Vision LED TV',
       'ProTab X1', 'FitRun Treadmill', 'UltraClean Vacuum'], dtype=object)

In [21]:
import re
#convert np array into python list
product_list = df['product'].unique().tolist()
print(product_list)


['SmartWatch V2', 'SoundWave 300', 'PhotoSnap Cam', 'EcoBreeze AC', 'RoboChef Blender', 'PowerMax Battery', 'Vision LED TV', 'ProTab X1', 'FitRun Treadmill', 'UltraClean Vacuum']


In [22]:
import entity_extraction
df["extracted_entities"] = df["cleaned_text"].apply(lambda text: entity_extraction.
                                                extract_entity(text, product_list))
df['extracted_entities'][:10]

0    {'product': 'SmartWatch V2', 'complaint_keywor...
1    {'product': 'EcoBreeze AC', 'complaint_keyword...
2    {'product': 'PhotoSnap Cam', 'complaint_keywor...
3    {'product': 'PhotoSnap Cam', 'complaint_keywor...
4    {'product': None, 'complaint_keywords': ['stop...
5    {'product': 'RoboChef Blender', 'complaint_key...
6    {'product': None, 'complaint_keywords': [], 'd...
7    {'product': 'EcoBreeze AC', 'complaint_keyword...
8    {'product': 'RoboChef Blender', 'complaint_key...
9    {'product': 'RoboChef Blender', 'complaint_key...
Name: extracted_entities, dtype: object

In [23]:
df.columns

Index(['ticket_id', 'ticket_text', 'issue_type', 'urgency_level', 'product',
       'cleaned_text', 'extracted_entities'],
      dtype='object')

In [24]:
import numpy as np
from entity_extraction import extract_entity
from feature_engineering import extract_features # assumes it returns final feature array
from preprocessing import preprocess_data

def predict_ticket(raw_text, issue_model, urgency_model, tfidf_vectorizer, product_list):
    # Create a temporary dataframe with the raw text
    temp_df = pd.DataFrame({'ticket_text': [raw_text]})
    
    # Preprocess the data using the correct function parameters
    # Note: preprocess_data expects a file path, so we need to modify it for single text
    temp_df['cleaned_text'] = temp_df['ticket_text'].apply(lambda x: clean_text(x))
    
    # Extract features with the correct parameters
    features, _ = extract_features(temp_df)
    
    # Predict issue type and urgency level
    issue_type = issue_model.predict(features)[0]
    urgency_level = urgency_model.predict(features)[0]
    
    # Extract entities
    entities = extract_entity(raw_text, product_list)
    
    # Return output
    return {
        "issue_type": issue_type,
        "urgency_level": urgency_level,
        "entities": entities
    }

In [25]:
from joblib import load

# Load models and vectorizer
issue_model = load("models/issue_model.joblib")
urgency_model = load("models/urgency_model.joblib")
tfidf_vectorizer = load("models/tfidf_vectorizer.joblib")

# Prepare product list
product_list = [
    'SmartWatch V2', 'SoundWave 300', 'PhotoSnap Cam', 'EcoBreeze AC',
    'RoboChef Blender', 'PowerMax Battery', 'Vision LED TV',
    'ProTab X1', 'FitRun Treadmill', 'UltraClean Vacuum'
]


In [26]:
def predict_ticket_gradio(ticket_text, tfidf_vectorizer):
    temp_df = pd.DataFrame({'cleaned_text': [ticket_text]})
    features = extract_features(temp_df, tfidf_vectorizer)
    issue_type = issue_model.predict(features)[0]
    urgency_level = urgency_model.predict(features)[0]
    return {"Issue": issue_type, "Urgency": urgency_level}