<a href="https://colab.research.google.com/github/Kedarlimbalkar/ticket_pred/blob/main/ticket_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

# --- 1. Define Project Data and Categories ---
# The target variable is 'Category'[cite: 21].
TICKET_CATEGORIES = [
    "Bug Report", "Feature Request", "Technical Issue",
    "Billing Inquiry", "Account Management"
] # Based on Key Ticket Categories [cite: 11, 13, 14, 16, 19]

# Simulate a dataset structure (as per specification [cite: 21])
def create_sample_data(num_samples=1000):
    data = {
        'Ticket ID': range(1, num_samples + 1),
        'Subject': [f"Issue summary {i}" for i in range(num_samples)],
        'Description': [f"Detailed problem description requiring {random.choice(TICKET_CATEGORIES)} assistance." for i in range(num_samples)],
        'Category': [random.choice(TICKET_CATEGORIES) for _ in range(num_samples)],
        'Priority': [random.choice(['High', 'Medium', 'Low']) for _ in range(num_samples)],
        'Timestamp': pd.to_datetime('2023-01-01') + pd.to_timedelta(np.arange(num_samples), unit='h')
    }
    return pd.DataFrame(data)

df = create_sample_data()
print("Sample Data Head:\n", df[['Subject', 'Category']].head())

# Split the data into training and testing sets
X = df['Description'] # Using 'Description' (Long Text) as the primary input feature [cite: 21]
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Sample Data Head:
            Subject            Category
0  Issue summary 0  Account Management
1  Issue summary 1          Bug Report
2  Issue summary 2          Bug Report
3  Issue summary 3     Billing Inquiry
4  Issue summary 4  Account Management


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Download necessary NLTK resources (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase and remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Tokenization
    words = nltk.word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

# Apply preprocessing to the training data
X_train_processed = X_train.apply(preprocess_text)
X_test_processed = X_test.apply(preprocess_text)

print("\nSample Processed Text:\n", X_train_processed.iloc[0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Sample Processed Text:
 detailed problem description requiring technical issue assistance


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# --- 3a. Build and Train the Model (scikit-learn) ---
# Use a Pipeline to combine vectorizer and classifier
text_classifier = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(random_state=42, max_iter=1000))
])

print("\nTraining the model...")
text_classifier.fit(X_train_processed, y_train)

# --- 3b. Prediction and Evaluation ---
y_pred = text_classifier.predict(X_test_processed)

# Calculate the required metrics (Evaluation Framework )
accuracy = accuracy_score(y_test, y_pred)
precision, recall, fscore, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted', zero_division=0
)

print("\n--- Model Evaluation Results ---")
# Metrics specified in the Evaluation Framework [cite: 38, 40, 44]
print(f"Accuracy (40% weight): {accuracy:.4f}")
print(f"Precision (part of 30% weight): {precision:.4f}")
print(f"Recall (part of 30% weight): {recall:.4f}")
print(f"F1-Score (20% weight): {fscore:.4f}")

# Latency (10% weight) would require a real-time test (simulated here)
# In a real API, this would be measured using timing tools.
import time
start_time = time.time()
text_classifier.predict(X_test_processed.iloc[0:1]) # Classify a single ticket
end_time = time.time()
latency = (end_time - start_time) * 1000 # in milliseconds
print(f"Simulated Latency for single classification (10% weight): {latency:.2f} ms")


Training the model...

--- Model Evaluation Results ---
Accuracy (40% weight): 0.2100
Precision (part of 30% weight): 0.1222
Recall (part of 30% weight): 0.2100
F1-Score (20% weight): 0.1490
Simulated Latency for single classification (10% weight): 3.34 ms


In [6]:
# --- 4. Conceptual API Endpoint ---
# In a real environment, this model would be saved (e.g., using joblib/pickle)
# and loaded by a Flask/FastAPI application.

def classify_ticket_api(new_ticket_description):
    """
    Conceptual function for the API Endpoint[cite: 30].
    """
    # Preprocess the new ticket description
    processed_text = preprocess_text(new_ticket_description)

    # Classify the ticket using the trained model
    predicted_category = text_classifier.predict([processed_text])[0]

    return {"predicted_category": predicted_category}

# Example usage for the API
new_ticket = "The app crashes every time I try to save my profile changes. I think this is a bug."
prediction = classify_ticket_api(new_ticket)
print("\nNew Ticket Classification:", prediction)


New Ticket Classification: {'predicted_category': 'Account Management'}
