### STEP 1: Import Libraries & Load Data

In [2]:
# Install if not already
!pip install textblob

# Imports
import pandas as pd
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob


Collecting textblob
  Using cached textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Using cached textblob-0.19.0-py3-none-any.whl (624 kB)
Installing collected packages: textblob
Successfully installed textblob-0.19.0


###  Step 2: Download Required NLTK Data

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ishik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Step 3: Load Your Dataset

In [4]:
df = pd.read_excel("ai_dev_assignment_tickets_complex_1000.xlsx")
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)


###  STEP 4: Handle Missing Values

In [5]:
# Check for missing values
print(df.isnull().sum())

# Option 1: Drop rows with any missing values (if small in number)
df = df.dropna()

# Reset index
df.reset_index(drop=True, inplace=True)

# Confirm removal
print(f"Remaining rows: {df.shape[0]}")


ticket_id        0
ticket_text      0
issue_type       0
urgency_level    0
product          0
dtype: int64
Remaining rows: 826


### STEP 5: Text Cleaning Function

In [6]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"\d+", '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['clean_text'] = df['ticket_text'].apply(clean_text)


### STEP 6: Feature Engineering

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize vectorizer
tfidf = TfidfVectorizer(max_features=300)  # you can increase if needed

# Fit and transform the text
tfidf_matrix = tfidf.fit_transform(df['clean_text'])

# Convert to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())


### 2. Ticket Length Feature

In [9]:
df['ticket_length'] = df['clean_text'].apply(lambda x: len(x.split()))


###  3. Sentiment Score (using TextBlob)

In [10]:
pip install textblob


Note: you may need to restart the kernel to use updated packages.


In [11]:
from textblob import TextBlob

def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

df['sentiment_score'] = df['clean_text'].apply(get_sentiment)


In [12]:
# Combine tfidf with custom features
X = pd.concat([tfidf_df, df[['ticket_length', 'sentiment_score']].reset_index(drop=True)], axis=1)

# Target variables
y_issue = df['issue_type']
y_urgency = df['urgency_level']


### Step 6.1: Train-Test Split

In [13]:
from sklearn.model_selection import train_test_split

# For issue_type
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y_issue, test_size=0.2, random_state=42)

# For urgency_level
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y_urgency, test_size=0.2, random_state=42)


###  Step 6.2: Train Logistic Regression Models

In [14]:
from sklearn.linear_model import LogisticRegression

# Issue Type Classifier
model_issue = LogisticRegression(max_iter=1000)
model_issue.fit(X_train_1, y_train_1)

# Urgency Level Classifier
model_urgency = LogisticRegression(max_iter=1000)
model_urgency.fit(X_train_2, y_train_2)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


###  Step 6.3: Evaluate Both Models

In [15]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ISSUE TYPE
print("Issue Type Classification Report:")
y_pred_1 = model_issue.predict(X_test_1)
print(classification_report(y_test_1, y_pred_1))
print("Confusion Matrix:\n", confusion_matrix(y_test_1, y_pred_1))
print("Accuracy:", accuracy_score(y_test_1, y_pred_1))

# URGENCY LEVEL
print("\nUrgency Level Classification Report:")
y_pred_2 = model_urgency.predict(X_test_2)
print(classification_report(y_test_2, y_pred_2))
print("Confusion Matrix:\n", confusion_matrix(y_test_2, y_pred_2))
print("Accuracy:", accuracy_score(y_test_2, y_pred_2))


Issue Type Classification Report:
                    precision    recall  f1-score   support

    Account Access       1.00      1.00      1.00        23
   Billing Problem       1.00      1.00      1.00        19
   General Inquiry       1.00      1.00      1.00        25
Installation Issue       1.00      1.00      1.00        29
     Late Delivery       1.00      1.00      1.00        17
    Product Defect       1.00      1.00      1.00        30
        Wrong Item       1.00      1.00      1.00        23

          accuracy                           1.00       166
         macro avg       1.00      1.00      1.00       166
      weighted avg       1.00      1.00      1.00       166

Confusion Matrix:
 [[23  0  0  0  0  0  0]
 [ 0 19  0  0  0  0  0]
 [ 0  0 25  0  0  0  0]
 [ 0  0  0 29  0  0  0]
 [ 0  0  0  0 17  0  0]
 [ 0  0  0  0  0 30  0]
 [ 0  0  0  0  0  0 23]]
Accuracy: 1.0

Urgency Level Classification Report:
              precision    recall  f1-score   support

        

### Step 7.1: Extract Product Name

In [16]:
# Create a list of known product names
product_list = df['product'].unique().tolist()

def extract_product(text):
    for product in product_list:
        if product.lower() in text.lower():
            return product
    return None


### Step 7.2: Extract Complaint Keywords

In [17]:
complaint_keywords = [
    'broken', 'defective', 'late', 'missing', 'damaged', 'error', 'issue',
    'problem', 'not working', 'delay', 'wrong', 'cancelled', 'failed'
]

def extract_complaints(text):
    found = [word for word in complaint_keywords if word in text.lower()]
    return found if found else None


###  Step 7.3: Extract Dates

In [18]:
import re

def extract_dates(text):
    date_pattern = r'\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{1,2}(?:st|nd|rd|th)?\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*)\b'
    dates = re.findall(date_pattern, text)
    return dates if dates else None


### Step 7.4: Combine into One Function

In [19]:
def extract_entities(text):
    return {
        'product': extract_product(text),
        'dates': extract_dates(text),
        'complaints': extract_complaints(text)
    }


In [26]:
df['entities'] = df['ticket_text'].apply(extract_entities)
df[['ticket_text', 'entities']].head()
# Extracted product as separate column
df['extracted_product'] = df['ticket_text'].apply(lambda x: extract_product(x))

# Compare to actual (ground truth) product column
df['product_match'] = df['extracted_product'] == df['product']

# Evaluate match rate
match_rate = df['product_match'].mean()
print(f"✅ Product Extraction Accuracy: {match_rate:.2%}")


✅ Product Extraction Accuracy: 56.30%


### Step 8: Final Integration Function

In [27]:
def preprocess_input(text):
    text = clean_text(text)
    ticket_length = len(text.split())
    sentiment = get_sentiment(text)
    
    # TF-IDF vector (same columns as training)
    tfidf_vector = tfidf.transform([text]).toarray()
    full_features = np.append(tfidf_vector, [[ticket_length, sentiment]], axis=1)
    
    return full_features


In [28]:
def analyze_ticket(ticket_text):
    features = preprocess_input(ticket_text)

    # Predictions
    predicted_issue = model_issue.predict(features)[0]
    predicted_urgency = model_urgency.predict(features)[0]
    extracted_entities = extract_entities(ticket_text)

    return {
        "predicted_issue_type": predicted_issue,
        "predicted_urgency_level": predicted_urgency,
        "extracted_entities": extracted_entities
    }


In [29]:
test_text = "Order #19283 for SmartWatch V2 arrived broken and 5 days late. Please fix this issue ASAP!"
result = analyze_ticket(test_text)

import json
print(json.dumps(result, indent=2))


{
  "predicted_issue_type": "Late Delivery",
  "predicted_urgency_level": "Medium",
  "extracted_entities": {
    "product": "SmartWatch V2",
    "dates": null,
    "complaints": [
      "broken",
      "late",
      "issue"
    ]
  }
}




In [30]:
pip install gradio


Note: you may need to restart the kernel to use updated packages.


In [35]:
import gradio as gr
import json

def gradio_interface(text):
    result = analyze_ticket(text)
    entities = result["extracted_entities"]

    return (
        result["predicted_issue_type"],
        result["predicted_urgency_level"],
        entities.get("product", "Not found"),
        json.dumps(entities, indent=2)
    )

demo = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(lines=4, label="Enter Ticket Text"),
    outputs=[
        gr.Label(label="Predicted Issue Type"),
        gr.Label(label="Predicted Urgency Level"),
        gr.Label(label="Extracted Product Name"),
        gr.Textbox(label="Extracted Entities (Full JSON)")
    ],
    title=" AI-Powered Support Ticket Analyzer",
    description="Paste a raw customer ticket below to classify and extract key insights including product name."
)

demo.launch()


* Running on local URL:  http://127.0.0.1:7865
* To create a public link, set `share=True` in `launch()`.


