# 3. Model Training for IT Support Tickets

This notebook trains and evaluates regression models and saves the best model.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import xgboost as xgb
import joblib
import os
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Set up input and output paths
DATA_PATH = '../data/processed'
MODEL_PATH = '../models'
os.makedirs(MODEL_PATH, exist_ok=True)

# Configure logging
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

In [7]:
# Load processed data
logging.info("Loading processed data...")
df = pd.read_csv(os.path.join(DATA_PATH, 'processed_tickets.csv'))

# Select features
logging.info("Preparing features...")
numerical_features = ['priority_critical', 'priority_high', 'priority_low', 
                     'priority_medium', 'priority_very_low']
categorical_features = ['type', 'queue', 'language']

# Load TF-IDF features
logging.info("Loading TF-IDF features...")
tfidf_matrix = joblib.load(os.path.join(DATA_PATH, 'tfidf_features.joblib')).toarray()

# Create feature arrays
X = np.column_stack([
    df[numerical_features].values,  # Numerical features
    pd.get_dummies(df[categorical_features]).values,  # One-hot encoded categorical features
    tfidf_matrix  # TF-IDF text features
])

# Target variable
y = df['queue']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

logging.info(f"Training data shape: {X_train.shape}")
logging.info(f"Testing data shape: {X_test.shape}")
logging.info(f"Number of classes: {len(y.unique())}")
logging.info("\nClass distribution:")
print(y.value_counts(normalize=True))

2025-09-30 20:07:24,816 - INFO - Loading processed data...
2025-09-30 20:07:28,219 - INFO - Preparing features...
2025-09-30 20:07:28,221 - INFO - Loading TF-IDF features...
2025-09-30 20:07:28,219 - INFO - Preparing features...
2025-09-30 20:07:28,221 - INFO - Loading TF-IDF features...
2025-09-30 20:07:38,880 - INFO - Training data shape: (108624, 1077)
2025-09-30 20:07:38,925 - INFO - Testing data shape: (27156, 1077)
2025-09-30 20:07:38,955 - INFO - Number of classes: 62
2025-09-30 20:07:38,880 - INFO - Training data shape: (108624, 1077)
2025-09-30 20:07:38,925 - INFO - Testing data shape: (27156, 1077)
2025-09-30 20:07:38,955 - INFO - Number of classes: 62
2025-09-30 20:07:38,957 - INFO - 
Class distribution:
2025-09-30 20:07:38,957 - INFO - 
Class distribution:


queue
Technical Support                    0.228355
Product Support                      0.142142
Customer Service                     0.118530
IT Support                           0.090882
Billing and Payments                 0.076771
                                       ...   
Account & Billing Management         0.003211
Security Operations                  0.003152
User Experience & Design Feedback    0.002754
Customer Onboarding                  0.002666
Software Development                 0.002504
Name: proportion, Length: 62, dtype: float64


In [13]:
print("\nAvailable columns:")
print(df.columns.tolist())


Available columns:
['subject', 'body', 'answer', 'queue', 'language', 'version', 'tag_1', 'tag_2', 'tag_3', 'tag_4', 'tag_5', 'tag_6', 'tag_7', 'tag_8', 'tag_9', 'priority_critical', 'priority_high', 'priority_low', 'priority_medium', 'priority_very_low', 'subject_length', 'subject_urgency_keywords', 'subject_tfidf_0', 'subject_tfidf_1', 'subject_tfidf_2', 'subject_tfidf_3', 'subject_tfidf_4', 'subject_tfidf_5', 'subject_tfidf_6', 'subject_tfidf_7', 'subject_tfidf_8', 'subject_tfidf_9', 'subject_tfidf_10', 'subject_tfidf_11', 'subject_tfidf_12', 'subject_tfidf_13', 'subject_tfidf_14', 'subject_tfidf_15', 'subject_tfidf_16', 'subject_tfidf_17', 'subject_tfidf_18', 'subject_tfidf_19', 'subject_tfidf_20', 'subject_tfidf_21', 'subject_tfidf_22', 'subject_tfidf_23', 'subject_tfidf_24', 'subject_tfidf_25', 'subject_tfidf_26', 'subject_tfidf_27', 'subject_tfidf_28', 'subject_tfidf_29', 'subject_tfidf_30', 'subject_tfidf_31', 'subject_tfidf_32', 'subject_tfidf_33', 'subject_tfidf_34', 'subjec

In [8]:
# Check columns
print("\nAvailable columns:")
for col in df.columns:
    print(f"- {col}")


Available columns:
- subject
- body
- answer
- queue
- language
- version
- tag_1
- tag_2
- tag_3
- tag_4
- tag_5
- tag_6
- tag_7
- tag_8
- tag_9
- priority_critical
- priority_high
- priority_low
- priority_medium
- priority_very_low
- subject_length
- subject_urgency_keywords
- subject_tfidf_0
- subject_tfidf_1
- subject_tfidf_2
- subject_tfidf_3
- subject_tfidf_4
- subject_tfidf_5
- subject_tfidf_6
- subject_tfidf_7
- subject_tfidf_8
- subject_tfidf_9
- subject_tfidf_10
- subject_tfidf_11
- subject_tfidf_12
- subject_tfidf_13
- subject_tfidf_14
- subject_tfidf_15
- subject_tfidf_16
- subject_tfidf_17
- subject_tfidf_18
- subject_tfidf_19
- subject_tfidf_20
- subject_tfidf_21
- subject_tfidf_22
- subject_tfidf_23
- subject_tfidf_24
- subject_tfidf_25
- subject_tfidf_26
- subject_tfidf_27
- subject_tfidf_28
- subject_tfidf_29
- subject_tfidf_30
- subject_tfidf_31
- subject_tfidf_32
- subject_tfidf_33
- subject_tfidf_34
- subject_tfidf_35
- subject_tfidf_36
- subject_tfidf_37
- subjec

In [None]:
# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Save the label encoder
joblib.dump(label_encoder, os.path.join(MODEL_PATH, 'label_encoder.joblib'))
logging.info("Saved label encoder")

# Initialize XGBoost classifier with optimized parameters
model = xgb.XGBClassifier(
    max_depth=8,
    learning_rate=0.1,
    n_estimators=200,
    objective='multi:softmax',
    num_class=len(label_encoder.classes_),
    tree_method='hist',
    random_state=42,
    n_jobs=-1  # Use all CPU cores
)

# Train the model
logging.info("Training XGBoost model...")
model.fit(X_train, y_train_encoded)

# Save the model
model_path = os.path.join(MODEL_PATH, 'ticket_classifier.json')
model.save_model(model_path)
logging.info(f"Saved model to {model_path}")

# Make predictions
y_pred = model.predict(X_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_labels))

2025-09-30 20:08:11,857 - INFO - Saved label encoder
2025-09-30 20:08:11,864 - INFO - Training XGBoost model...
2025-09-30 20:08:11,864 - INFO - Training XGBoost model...


In [None]:
# Train and evaluate models
results = {}

def eval_and_log(name, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    results[name] = {'r2': r2, 'mae': mae, 'rmse': rmse, 'model': model}
    print(f"{name} -> R2: {r2:.3f}, MAE: {mae:.3f}, RMSE: {rmse:.3f}")

eval_and_log('LinearRegression', LinearRegression())
eval_and_log('XGBoost', XGBRegressor(learning_rate=0.1, n_estimators=200, random_state=42, n_jobs=-1))

rf = RandomForestRegressor(random_state=42, n_jobs=-1)
params = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5]}
grid = GridSearchCV(rf, params, cv=3, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
rf_r2 = r2_score(y_test, grid.predict(X_test))
rf_mae = mean_absolute_error(y_test, grid.predict(X_test))
rf_rmse = mean_squared_error(y_test, grid.predict(X_test), squared=False)
results['RandomForest'] = {'r2': rf_r2, 'mae': rf_mae, 'rmse': rf_rmse, 'model': grid.best_estimator_}
print(f"RandomForest -> R2: {rf_r2:.3f}, MAE: {rf_mae:.3f}, RMSE: {rf_rmse:.3f}")


In [9]:
# Select best model and save
best_name = max(results.items(), key=lambda kv: kv[1]['r2'])[0]
best_model = results[best_name]['model']
print('Best model:', best_name, 'R2:', results[best_name]['r2'])

if results[best_name]['r2'] < 0.8:
    df_retry = df[df['resolution_time_hours'] < df['resolution_time_hours'].quantile(0.95)]
    X = df_retry.drop(['resolution_time_hours', 'ticket_id', 'creation_timestamp', 'closure_timestamp', 'description'], axis=1, errors='ignore')
    y = df_retry['resolution_time_hours']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    rf_retry = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=200)
    rf_retry.fit(X_train, y_train)
    best_model = rf_retry

joblib.dump(best_model, os.path.join(MODELS_DIR, 'reg_model.pkl'))

plt.figure(figsize=(6,6))
plt.scatter(y_test, best_model.predict(X_test), alpha=0.5)
plt.xlabel('Actual Resolution Time (h)')
plt.ylabel('Predicted Resolution Time (h)')
plt.title('Predicted vs Actual')
plt.tight_layout()
plt.savefig(os.path.join('..', 'outputs', 'pred_vs_actual.png'))
plt.close()

print('Training complete. Best model saved to models/reg_model.pkl')


NameError: name 'results' is not defined