In [None]:
# Prints a greeting; this cell has no effect on the analysis below.
print('Radhe Radhe')

Radhe Radhe


In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [None]:
import numpy as np
import pandas as pd

from category_encoders import TargetEncoder
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)

## Data Cleaning

In [None]:
# Location of the raw CSV. NOTE(review): '/content' looks like a Colab working
# directory - adjust when running elsewhere.
DATA_PATH = '/content/Customer_support_data.csv'

# Load the raw customer-support tickets into a DataFrame.
Customer_support_data = pd.read_csv(DATA_PATH)

In [None]:
# Normalise column names: spaces become underscores, parentheses are removed.
# Other punctuation (e.g. the hyphen in 'Sub-category') is left untouched.
_column_name_table = str.maketrans({' ': '_', '(': None, ')': None})
Customer_support_data.columns = [
    name.translate(_column_name_table) for name in Customer_support_data.columns
]

In [None]:
# Show dtypes and non-null counts per column to see which columns have missing values.
Customer_support_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85907 entries, 0 to 85906
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unique_id                85907 non-null  object 
 1   channel_name             85907 non-null  object 
 2   category                 85907 non-null  object 
 3   Sub-category             85907 non-null  object 
 4   Customer_Remarks         28742 non-null  object 
 5   Order_id                 67675 non-null  object 
 6   order_date_time          17214 non-null  object 
 7   Issue_reported_at        85907 non-null  object 
 8   issue_responded          85907 non-null  object 
 9   Survey_response_Date     85907 non-null  object 
 10  Customer_City            17079 non-null  object 
 11  Product_category         17196 non-null  object 
 12  Item_price               17206 non-null  float64
 13  connected_handling_time  242 non-null    float64
 14  Agent_name            

In [None]:
# Per-column replacements for missing values:
#   - city / product category: explicit 'Unknown' label
#   - order_date_time: forward-filled from the previous row (rows before the first
#     non-null value stay missing)
#   - Item_price: mean of the observed prices
missing_value_fill = {
    'Customer_City': 'Unknown',
    'Product_category': 'Unknown',
    'order_date_time': Customer_support_data['order_date_time'].ffill(),
    'Item_price': Customer_support_data['Item_price'].mean(),
}
Customer_support_data = Customer_support_data.fillna(missing_value_fill)

In [None]:
# Discard 'connected_handling_time'; the info() output above shows it is
# populated for only 242 rows.
Customer_support_data = Customer_support_data.drop(columns=['connected_handling_time'])

In [None]:
# Keep only rows with no missing value in any remaining column.
Customer_support_data = Customer_support_data.dropna(axis=0, how='any')

In [None]:
# Timestamp columns and the text format each one is stored in.
DATETIME_FORMATS = {
    'order_date_time': '%d/%m/%Y %H:%M',
    'Issue_reported_at': '%d/%m/%Y %H:%M',
    'issue_responded': '%d/%m/%Y %H:%M',
    'Survey_response_Date': '%d-%b-%y',
}

# Convert each column from string to datetime using its explicit format.
for column_name, column_format in DATETIME_FORMATS.items():
    Customer_support_data[column_name] = pd.to_datetime(
        Customer_support_data[column_name], format=column_format
    )

In [None]:
# Names of all datetime-typed columns; used below to derive date parts and then
# to drop the raw timestamps.
datetime_frame = Customer_support_data.select_dtypes(include='datetime64[ns]')
dataset_date = list(datetime_frame.columns)

In [None]:
# For every datetime column add year / month / day / month-name columns,
# named '<column>_<part>'.
for col in dataset_date:
    stamp = Customer_support_data[col].dt
    date_parts = {
        '_year': stamp.year,
        '_month': stamp.month,
        '_day': stamp.day,
        '_month_name': stamp.month_name(),
    }
    for suffix, values in date_parts.items():
        Customer_support_data[col + suffix] = values

In [None]:
# === Feature engineering ===
SECONDS_PER_HOUR = 3600

reported_at = Customer_support_data['Issue_reported_at']
time_to_response = Customer_support_data['issue_responded'] - reported_at
time_to_report = reported_at - Customer_support_data['order_date_time']

# Hours between the issue being reported and the response.
Customer_support_data['Response_Time'] = time_to_response.dt.total_seconds() / SECONDS_PER_HOUR
# Hours between the order and the issue being reported.
Customer_support_data['Report_Delay'] = time_to_report.dt.total_seconds() / SECONDS_PER_HOUR
# Number of (non-null) orders handled by the same agent in the current frame.
Customer_support_data['Agent_Workload'] = (
    Customer_support_data.groupby('Agent_name')['Order_id'].transform('count')
)
# Copy of the survey month derived in the date-part step.
Customer_support_data['Survey_Month'] = Customer_support_data['Survey_response_Date_month']

In [None]:
# The raw timestamp columns are no longer needed once the derived features exist.
Customer_support_data = Customer_support_data.drop(dataset_date, axis='columns')

In [None]:
# Work on an independent copy so the modelling steps do not touch the cleaned frame.
df = Customer_support_data.copy(deep=True)

In [None]:
# Confirm the cleaned frame's columns, dtypes and non-null counts before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22637 entries, 11 to 85904
Data columns (total 35 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Unique_id                        22637 non-null  object 
 1   channel_name                     22637 non-null  object 
 2   category                         22637 non-null  object 
 3   Sub-category                     22637 non-null  object 
 4   Customer_Remarks                 22637 non-null  object 
 5   Order_id                         22637 non-null  object 
 6   Customer_City                    22637 non-null  object 
 7   Product_category                 22637 non-null  object 
 8   Item_price                       22637 non-null  float64
 9   Agent_name                       22637 non-null  object 
 10  Supervisor                       22637 non-null  object 
 11  Manager                          22637 non-null  object 
 12  Tenure_Bucket         

## Model

### Encoding

In [None]:
# Share of each CSAT score in percent, to show how imbalanced the target is.
csat_share_pct = df['CSAT_Score'].value_counts(normalize=True) * 100
print(csat_share_pct.round(2))

CSAT_Score
5    67.84
1    19.15
4     9.00
3     2.61
2     1.41
Name: proportion, dtype: float64


In [None]:
# === Drop identifier columns ===
# Only the identifiers that are actually present are dropped.
ID_COLUMNS = ('Unique_id', 'Order_id')
drop_ids = [name for name in ID_COLUMNS if name in df.columns]
df = df.drop(drop_ids, axis='columns')

In [None]:
# Map the CSAT scores onto consecutive integer labels starting at 0
# (original labels 1..5 -> 0..4), as expected by the sklearn/xgboost wrappers.
le = LabelEncoder().fit(df['CSAT_Score'])
df['_y'] = le.transform(df['CSAT_Score'])

In [None]:
# === Define features X and target y ===
# (The train/test split itself is done in a later cell.)
y = df['_y']   # integer labels 0..4
X = df.drop(['CSAT_Score', '_y'], axis='columns')

In [None]:
# === Remove constant columns ===
# A column with exactly one distinct (non-null) value carries no signal.
constant_cols = [name for name, n_unique in X.nunique().items() if n_unique == 1]
print("Constant columns removed:", constant_cols)
X = X.drop(constant_cols, axis='columns')

Constant columns removed: ['Issue_reported_at_year', 'issue_responded_year', 'issue_responded_month', 'issue_responded_month_name', 'Survey_response_Date_year', 'Survey_response_Date_month', 'Survey_response_Date_month_name', 'Survey_Month']


In [None]:
# Object-typed columns are treated as categorical features.
cat_cols = list(X.select_dtypes(include=['object']).columns)
print("Categorical Columns:", cat_cols)

# === Split categoricals into high- and low-cardinality groups ===
CARDINALITY_THRESHOLD = 20   # more distinct values than this => high-cardinality
cardinality = X[cat_cols].nunique()
high_card_cols = [col for col in cat_cols if cardinality[col] > CARDINALITY_THRESHOLD]
low_card_cols = [col for col in cat_cols if cardinality[col] <= CARDINALITY_THRESHOLD]

print("High-cardinality:", high_card_cols)
print("Low-cardinality:", low_card_cols)

Categorical Columns: ['channel_name', 'category', 'Sub-category', 'Customer_Remarks', 'Customer_City', 'Product_category', 'Agent_name', 'Supervisor', 'Manager', 'Tenure_Bucket', 'Agent_Shift', 'order_date_time_month_name', 'Issue_reported_at_month_name']
High-cardinality: ['Sub-category', 'Customer_Remarks', 'Customer_City', 'Agent_name', 'Supervisor']
Low-cardinality: ['channel_name', 'category', 'Product_category', 'Manager', 'Tenure_Bucket', 'Agent_Shift', 'order_date_time_month_name', 'Issue_reported_at_month_name']


In [None]:
# Encode categorical features without leaking held-out labels into the features.
#
# A target encoder replaces each category by a statistic of `y`. Fitting it on
# every row before the train/test split lets test-set labels flow into the test
# features (severe for near-unique columns such as 'Customer_Remarks') and
# inflates the reported metrics. It is therefore fitted on training rows only.
X_encoded = X.copy()

# Reproduce the row partition of the train/test split performed in the next
# cell: with the same test_size, stratify and random_state the selected rows
# are identical, whatever array is being split.
# NOTE: keep these arguments in sync with that train_test_split call.
train_positions, _ = train_test_split(
    np.arange(len(X)), test_size=0.20, stratify=y, random_state=42
)

# Target encoding for high-cardinality columns: fit on training rows, then
# transform all rows. Categories not seen during fitting are handled by the
# encoder's default handle_unknown='value' behaviour.
if high_card_cols:
    te = TargetEncoder(cols=high_card_cols)
    te.fit(X.iloc[train_positions][high_card_cols], y.iloc[train_positions])
    X_encoded[high_card_cols] = te.transform(X[high_card_cols])

# Label encoding for low-cardinality columns. This does not use the target, so
# it is fitted on all rows (which also avoids unseen-category errors).
# One encoder per column is kept under its own name so that `le`, the target's
# LabelEncoder, is not overwritten and can still decode predictions.
if low_card_cols:
    feature_label_encoders = {}
    for col in low_card_cols:
        feature_label_encoders[col] = LabelEncoder()
        X_encoded[col] = feature_label_encoders[col].fit_transform(X[col])

### Train

In [None]:
# Hold out 20% of the rows for testing; stratify so every CSAT class keeps its
# share in both parts, and fix the seed for reproducibility.
split_options = {'test_size': 0.20, 'stratify': y, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

In [None]:
# Class counts in the training split (before any resampling).
y_train.value_counts()

Unnamed: 0_level_0,count
_y,Unnamed: 1_level_1
4,12284
0,3467
3,1630
2,472
1,256


In [None]:
# Disabled alternative: under-sample the majority class with Tomek links.
# Left commented out; the SMOTE cell below is what produces X_train_res / y_train_res.
# tl = TomekLinks(sampling_strategy='majority')
# X_train_res, y_train_res = tl.fit_resample(X_train, y_train)

In [None]:
# Disabled alternative: combined SMOTE over-sampling + Tomek-link cleaning.
# === 5) SMOTETomek Resampling (train only) ===
# smote_tomek = SMOTETomek(sampling_strategy='auto', random_state=42)

# X_train_res, y_train_res = smote_tomek.fit_resample(X_train, y_train)

In [None]:
# Over-sample the minority classes of the training split only; the test split
# is left untouched.
sm = SMOTE(random_state=42)
resampled_train = sm.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled_train

In [None]:
# Sanity-check the resampled training data for missing or infinite values.
nan_in_features = X_train_res.isna().sum().sum()
nan_in_target = pd.Series(y_train_res).isna().sum()
inf_in_features = np.isinf(X_train_res).sum().sum()

print("NaN in X_train_res:", nan_in_features)
print("NaN in y_train_res:", nan_in_target)
print("Inf in X_train_res:", inf_in_features)

NaN in X_train_res: 0
NaN in y_train_res: 0
Inf in X_train_res: 0


In [None]:
# === Configure the XGBoost classifier ===
xgb_params = dict(
    objective='multi:softprob',   # multi-class, outputs class probabilities
    num_class=5,                  # _y has 5 classes (0..4)
    eval_metric='mlogloss',
    random_state=42,
    n_estimators=700,
    learning_rate=0.1,
    colsample_bytree=1.0,
    max_depth=7,
    reg_alpha=0.01,
    reg_lambda=1.5,
    subsample=0.8,
    min_child_weight=1,
)
xgb_model = XGBClassifier(**xgb_params)

# === Train on the resampled training data ===
xgb_model.fit(X_train_res, y_train_res)

In [None]:
# === Predict on the held-out test set ===
y_pred = xgb_model.predict(X_test)

# === Evaluate ===
# Per-class precision / recall / F1 are averaged weighted by class support.
weighted_avg = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, **weighted_avg)
precision = precision_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)

print(
    "XGBoost Classification Metrics:",
    f"Accuracy : {accuracy:.4f}",
    f"F1 Score : {f1:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    sep="\n",
)

XGBoost Classification Metrics:
Accuracy : 0.9329
F1 Score : 0.9293
Precision: 0.9304
Recall: 0.9329


In [None]:
# Class-probability predictions for the test set, shape (n_samples, n_classes).
y_proba = xgb_model.predict_proba(X_test)

# One-vs-rest multi-class ROC-AUC, averaged weighted by class support.
roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=y_proba,
    average='weighted',
    multi_class='ovr',
)

print("Weighted Multi-class ROC-AUC:", round(roc_auc, 4))

Weighted Multi-class ROC-AUC: 0.9695


In [None]:
# Hyper-parameter search for XGBoost with over-sampling done inside each CV fold.
#
# Cross-validating on data that was already SMOTE-resampled puts synthetic
# points, interpolated from rows of the training folds, into the validation
# folds, which makes the CV score optimistically biased. SMOTE is therefore a
# pipeline step: it is re-fitted on the training part of every fold and each
# validation fold contains only original rows.

# Parameters are addressed through the pipeline step name ('xgb__<param>').
param_grid = {
    'xgb__n_estimators': [500, 700],      # number of trees
    'xgb__max_depth': [5, 7],             # depth of each tree
    'xgb__reg_alpha': [0, 0.01],          # L1 regularization
    'xgb__reg_lambda': [1, 1.5],          # L2 regularization
    'xgb__min_child_weight': [1, 3]       # minimum sum of instance weight needed in a child
}

xgb_model_cv = XGBClassifier(
    objective='multi:softprob',
    num_class=5,
    eval_metric='mlogloss',
    gamma=0,
    subsample = 0.8,
    learning_rate = 0.1,
    colsample_bytree = 1.0,
    random_state=42
)

# imblearn's Pipeline applies samplers during fit only; predict / predict_proba
# go straight to the classifier.
smote_xgb_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('xgb', xgb_model_cv),
])

grid_search = GridSearchCV(
    estimator=smote_xgb_pipeline,
    param_grid=param_grid,
    scoring='f1_weighted',   # weighted F1 for imbalanced data
    cv=5,
    verbose=2
)

# Fit on the original (un-resampled) training split; resampling happens per fold.
# grid_search.best_estimator_ is the refitted pipeline and still exposes
# predict / predict_proba.
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best weighted F1 score:", grid_search.best_score_)


Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END max_depth=5, min_child_weight=1, n_estimators=500, reg_alpha=0, reg_lambda=1; total time=  28.6s
[CV] END max_depth=5, min_child_weight=1, n_estimators=500, reg_alpha=0, reg_lambda=1; total time=  24.4s
[CV] END max_depth=5, min_child_weight=1, n_estimators=500, reg_alpha=0, reg_lambda=1; total time=  24.6s
[CV] END max_depth=5, min_child_weight=1, n_estimators=500, reg_alpha=0, reg_lambda=1; total time=  22.1s
[CV] END max_depth=5, min_child_weight=1, n_estimators=500, reg_alpha=0, reg_lambda=1; total time=  25.9s
[CV] END max_depth=5, min_child_weight=1, n_estimators=500, reg_alpha=0, reg_lambda=1.5; total time=  24.7s
[CV] END max_depth=5, min_child_weight=1, n_estimators=500, reg_alpha=0, reg_lambda=1.5; total time=  24.4s
[CV] END max_depth=5, min_child_weight=1, n_estimators=500, reg_alpha=0, reg_lambda=1.5; total time=  24.4s
[CV] END max_depth=5, min_child_weight=1, n_estimators=500, reg_alpha=0, reg_lambda=

In [None]:
# === Predict with the best estimator found by the grid search ===
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# === Evaluate on the held-out test set (support-weighted averages) ===
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
precision = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
recall = recall_score(y_true=y_test, y_pred=y_pred, average='weighted')

report_lines = [
    "XGBoost Classification Metrics:",
    f"Accuracy : {accuracy:.4f}",
    f"F1 Score : {f1:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
]
print("\n".join(report_lines))

XGBoost Classification Metrics:
Accuracy : 0.9342
F1 Score : 0.9306
Precision: 0.9319
Recall: 0.9342


In [None]:
# Class-probability predictions of the tuned model, shape (n_samples, n_classes).
y_proba = best_model.predict_proba(X_test)

# One-vs-rest multi-class ROC-AUC, averaged weighted by class support.
roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=y_proba,
    average='weighted',
    multi_class='ovr',
)

print("Weighted Multi-class ROC-AUC:", round(roc_auc, 4))

Weighted Multi-class ROC-AUC: 0.9695
