# ðŸ§  Customer Satisfaction Prediction â€“ Hyperparameter Tuning
# ========================================================

ðŸŽ¯ Purpose: Fine-tune your XGBoost model using GridSearchCV

# Import libraries and load dataset

In [21]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import joblib

In [2]:
# Load data

data = pd.read_csv(r"\Users\Kaushlendra P Singh\OneDrive\Desktop\Internship\Project-1\customer_support_tickets.csv")

In [3]:
# Drop missing target rows
data = data[data['Customer Satisfaction Rating'].notna()]
data['Customer Satisfaction Rating'] = data['Customer Satisfaction Rating'].astype(int)

print("Dataset shape:", data.shape)
data.head()

Dataset shape: (2769, 17)


Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,2023-06-01 11:14:38,2023-06-01 18:05:38,3
3,4,Christina Dillon,bradleyolson@example.org,27,Female,Microsoft Office,2020-11-13,Billing inquiry,Account access,I'm having an issue with the {product_purchase...,Closed,Try capital clearly never color toward story.,Low,Social media,2023-06-01 07:29:40,2023-06-01 01:57:40,3
4,5,Alexander Carroll,bradleymark@example.com,67,Female,Autodesk AutoCAD,2020-02-04,Billing inquiry,Data loss,I'm having an issue with the {product_purchase...,Closed,West decision evidence bit.,Low,Email,2023-06-01 00:12:42,2023-06-01 19:53:42,1
10,11,Joseph Moreno,mbrown@example.org,48,Male,Nintendo Switch,2021-01-19,Cancellation request,Data loss,I'm having an issue with the {product_purchase...,Closed,Measure tonight surface feel forward.,High,Phone,2023-06-01 17:46:49,2023-05-31 23:51:49,1
11,12,Brandon Arnold,davisjohn@example.net,51,Male,Microsoft Xbox Controller,2021-10-24,Product inquiry,Software bug,I'm having an issue with the {product_purchase...,Closed,Measure there house management pick knowledge ...,High,Chat,2023-06-01 12:05:51,2023-06-01 09:27:51,1


# Prepare features and preprocessing pipeline


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2769 entries, 2 to 8467
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Ticket ID                     2769 non-null   int64 
 1   Customer Name                 2769 non-null   object
 2   Customer Email                2769 non-null   object
 3   Customer Age                  2769 non-null   int64 
 4   Customer Gender               2769 non-null   object
 5   Product Purchased             2769 non-null   object
 6   Date of Purchase              2769 non-null   object
 7   Ticket Type                   2769 non-null   object
 8   Ticket Subject                2769 non-null   object
 9   Ticket Description            2769 non-null   object
 10  Ticket Status                 2769 non-null   object
 11  Resolution                    2769 non-null   object
 12  Ticket Priority               2769 non-null   object
 13  Ticket Channel         

In [4]:
# Map 5-class -> 3-class
def map_rating(x):
    if x <= 2:
        return 0
    elif x == 3:
        return 1
    else:
        return 2

data['Satisfaction_Class'] = data['Customer Satisfaction Rating'].apply(map_rating)
print('3-class distribution:')
print(data['Satisfaction_Class'].value_counts(normalize=True))

3-class distribution:
Satisfaction_Class
0    0.397978
2    0.392560
1    0.209462
Name: proportion, dtype: float64


In [5]:
# Separate features and target
X = data.drop(columns=['Customer Satisfaction Rating', 'Ticket ID', 'Customer Email', 'Customer Name'])
y = data['Satisfaction_Class']

In [6]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [26]:
# Identify numeric and categorical columns
numeric_features = X_train.select_dtypes(include=['int64','float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

In [27]:
# Build transformers
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])

In [28]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Define pipeline and parameter grid

In [10]:
# Base XGBoost pipeline
pipeline_xgb = Pipeline([
    ('preproc', preprocessor),
    ('clf', xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=5,
        eval_metric='mlogloss',
        random_state=42
    ))
])

In [11]:
# Parameter grid for tuning
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 5, 7],
    'clf__learning_rate': [0.05, 0.1],
    'clf__subsample': [0.8, 1.0],
    'clf__colsample_bytree': [0.8, 1.0]
}

# Run GridSearchCV

In [12]:
print("Running GridSearchCV... (this can take several minutes)")

grid_search = GridSearchCV(
    estimator=pipeline_xgb,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

Running GridSearchCV... (this can take several minutes)


In [13]:
# Use a smaller sample for speed (optional)
X_sample = X_train.sample(n=min(1000, X_train.shape[0]), random_state=42)
y_sample = y_train.loc[X_sample.index]


In [14]:
grid_search.fit(X_sample, y_sample)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [15]:
print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

Best Parameters: {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.05, 'clf__max_depth': 3, 'clf__n_estimators': 100, 'clf__subsample': 0.8}
Best CV Accuracy: 1.0


# Evaluate tuned model

In [16]:
# Best model
best_model = grid_search.best_estimator_

In [30]:
# Predict
y_pred_best = best_model.predict(X_test)

In [18]:
# Evaluate
print("Tuned XGBoost Accuracy:", accuracy_score(y_test, y_pred_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))

Tuned XGBoost Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       221
           1       1.00      1.00      1.00       116
           2       1.00      1.00      1.00       217

    accuracy                           1.00       554
   macro avg       1.00      1.00      1.00       554
weighted avg       1.00      1.00      1.00       554



In [19]:
# Save model
joblib.dump(best_model, '../Project-1/csat_xgb_pipeline_tuned.joblib')
print("âœ… Tuned model saved to /mnt/data/csat_xgb_pipeline_tuned.joblib")

âœ… Tuned model saved to /mnt/data/csat_xgb_pipeline_tuned.joblib
