In [8]:
import pandas as pd

# Load the dataset
df = pd.read_excel("ai_dev_assignment_tickets_complex_1000.xls")

# Preview the data
print("Data Sample:")
print(df.head())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Summary statistics
print("\nUnique Issue Types:", df['issue_type'].nunique())
print("Issue Type Distribution:\n", df['issue_type'].value_counts())

print("\nUrgency Level Distribution:\n", df['urgency_level'].value_counts())

print("\nSample Ticket Text:\n", df['ticket_text'].iloc[0])


Data Sample:
   ticket_id                                        ticket_text  \
0          1  Payment issue for my SmartWatch V2. I was unde...   
1          2  Can you tell me more about the UltraClean Vacu...   
2          3  I ordered SoundWave 300 but got EcoBreeze AC i...   
3          4  Facing installation issue with PhotoSnap Cam. ...   
4          5  Order #30903 for Vision LED TV is 13 days late...   

           issue_type urgency_level            product  
0     Billing Problem        Medium      SmartWatch V2  
1     General Inquiry           NaN  UltraClean Vacuum  
2          Wrong Item        Medium      SoundWave 300  
3  Installation Issue           Low      PhotoSnap Cam  
4       Late Delivery           NaN      Vision LED TV  

Missing Values:
ticket_id         0
ticket_text      55
issue_type       76
urgency_level    52
product           0
dtype: int64

Unique Issue Types: 7
Issue Type Distribution:
 issue_type
Billing Problem       146
General Inquiry       146


In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data if not already present
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Drop rows with missing critical fields
df_clean = df.dropna(subset=['ticket_text', 'issue_type', 'urgency_level'])

# Initialize preprocessor components
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
df_clean['clean_text'] = df_clean['ticket_text'].apply(preprocess_text)

# Show sample result
print(df_clean[['ticket_text', 'clean_text']].head())


                                         ticket_text  \
0  Payment issue for my SmartWatch V2. I was unde...   
2  I ordered SoundWave 300 but got EcoBreeze AC i...   
3  Facing installation issue with PhotoSnap Cam. ...   
5  Can you tell me more about the PhotoSnap Cam w...   
6   is malfunction. It stopped working after just...   

                                          clean_text  
0       payment issue smartwatch v underbilled order  
2  ordered soundwave got ecobreeze ac instead ord...  
3  facing installation issue photosnap cam setup ...  
5     tell photosnap cam warranty also available red  
6                    malfunction stopped working day  


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/krizzkamaliyagmail.com/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krizzkamaliyagmail.com/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/krizzkamaliyagmail.com/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['clean_text'] = df_clean['ticket_text'].apply(preprocess_text)


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

# Vectorize text using TF-IDF
tfidf = TfidfVectorizer(max_features=1000)  # You can change this number
X_tfidf = tfidf.fit_transform(df_clean['clean_text'])

# Add ticket length
df_clean['ticket_length'] = df_clean['clean_text'].apply(lambda x: len(x.split()))

# Add sentiment score
df_clean['sentiment'] = df_clean['ticket_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Print sample features
print(df_clean[['ticket_length', 'sentiment']].head())
print("TF-IDF shape:", X_tfidf.shape)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['ticket_length'] = df_clean['clean_text'].apply(lambda x: len(x.split()))


   ticket_length  sentiment
0              6        0.0
2              8        0.0
3              8       -0.5
5              7        0.3
6              4        0.0
TF-IDF shape: (826, 105)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['sentiment'] = df_clean['ticket_text'].apply(lambda x: TextBlob(x).sentiment.polarity)


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import hstack

# Features: Combine TF-IDF with numeric features
import numpy as np
X_combined = hstack([X_tfidf, 
                     np.array(df_clean[['ticket_length', 'sentiment']])])

# Labels
y_issue = df_clean['issue_type']
y_urgency = df_clean['urgency_level']

# Split datasets
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X_combined, y_issue, test_size=0.2, random_state=42)
X_train_u, X_test_u, y_train_u, y_test_u = train_test_split(X_combined, y_urgency, test_size=0.2, random_state=42)

# Train Issue Type Classifier
model_issue = LogisticRegression(max_iter=1000)
model_issue.fit(X_train_i, y_train_i)
y_pred_i = model_issue.predict(X_test_i)
print("Issue Type Classification Report:\n", classification_report(y_test_i, y_pred_i))

# Train Urgency Level Classifier
model_urgency = LogisticRegression(max_iter=1000)
model_urgency.fit(X_train_u, y_train_u)
y_pred_u = model_urgency.predict(X_test_u)
print("Urgency Level Classification Report:\n", classification_report(y_test_u, y_pred_u))


Issue Type Classification Report:
                     precision    recall  f1-score   support

    Account Access       1.00      1.00      1.00        23
   Billing Problem       1.00      1.00      1.00        19
   General Inquiry       1.00      1.00      1.00        25
Installation Issue       1.00      1.00      1.00        29
     Late Delivery       1.00      1.00      1.00        17
    Product Defect       1.00      1.00      1.00        30
        Wrong Item       1.00      1.00      1.00        23

          accuracy                           1.00       166
         macro avg       1.00      1.00      1.00       166
      weighted avg       1.00      1.00      1.00       166

Urgency Level Classification Report:
               precision    recall  f1-score   support

        High       0.40      0.35      0.37        66
         Low       0.29      0.33      0.31        43
      Medium       0.32      0.33      0.32        57

    accuracy                           0.34   

In [12]:
# Expanded complaint keywords list
complaint_keywords = [
    "broken", "late", "not working", "malfunction", "error", "damaged", 
    "defective", "defect", "faulty", "delay", "issue", "problem", "failed", "missing"
]

# Refined date regex (more natural phrasing, avoids single digits or order #s)
date_pattern = r'\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}-\d{2}-\d{2}|\w+\s+\d{1,2}(?:st|nd|rd|th)?|\d+\s+days?|yesterday|today|tomorrow)\b'

def extract_entities(text):
    text_lower = text.lower()
    
    # Extract products
    products_found = [p for p in product_list_lower if p in text_lower]
    
    # Extract complaint keywords
    complaints_found = [word for word in complaint_keywords if word in text_lower]
    
    # Extract dates
    dates_found = re.findall(date_pattern, text_lower)
    
    return {
        "products": products_found,
        "complaint_keywords": complaints_found,
        "dates": dates_found
    }

# Test again on same sample
sample_text = "Payment issue for my SmartWatch V2. I was underbilled for order #29224."
print("Sample Text:", sample_text)
print("Extracted Entities:", extract_entities(sample_text))


Sample Text: Payment issue for my SmartWatch V2. I was underbilled for order #29224.


NameError: name 'product_list_lower' is not defined

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Reinitialize and fit the vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=100)
X = tfidf_vectorizer.fit_transform(df_clean['clean_text'])

# Labels
y_issue = df_clean['issue_type']
y_urgency = df_clean['urgency_level']


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Train/test split
X_train, X_test, y_issue_train, y_issue_test = train_test_split(X, y_issue, test_size=0.2, random_state=42)
_, _, y_urgency_train, y_urgency_test = train_test_split(X, y_urgency, test_size=0.2, random_state=42)

# Train models
issue_model = LogisticRegression(max_iter=1000)
issue_model.fit(X_train, y_issue_train)

urgency_model = LogisticRegression(max_iter=1000)
urgency_model.fit(X_train, y_urgency_train)


In [13]:
sample_input = "Payment issue for my SmartWatch V2. I was underbilled for order #29224."
result = predict_ticket_info(sample_input, issue_model, urgency_model, tfidf_vectorizer)
print(result)


NameError: name 'predict_ticket_info' is not defined