In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

  from IPython.core.display import display, HTML


In [2]:
import pandas as pd
df_original = pd.read_parquet('Package_ML.parquet.gzip')

In [3]:
# XGBoos Model gives importance results for ServiceTypeCode, MailClassCode

import shap
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

data = df_original.copy()

# Handle missing values for weather data
imputer = SimpleImputer(strategy='mean')
data[['TMIN_O', 'TMAX_O', 'PRCP_O', 'SNOW_O', 'TMIN_D', 'TMAX_D', 'PRCP_D', 'SNOW_D']] = imputer.fit_transform(data[['TMIN_O', 'TMAX_O', 'PRCP_O', 'SNOW_O', 'TMIN_D', 'TMAX_D', 'PRCP_D', 'SNOW_D']])

# Encode categorical variables
label_encoder_service = LabelEncoder()
label_encoder_mail = LabelEncoder()

data['ServiceTypeCode'] = label_encoder_service.fit_transform(data['ServiceTypeCode'])
data['MailClassCode'] = label_encoder_mail.fit_transform(data['MailClassCode'])

# Mapping from encoded values to original values
encoded_to_original_service_map = {i: label for i, label in enumerate(label_encoder_service.classes_)}
encoded_to_original_mail_map = {i: label for i, label in enumerate(label_encoder_mail.classes_)}

# Encode ZIP codes
data['Zip_O'] = label_encoder_service.fit_transform(data['Zip_O'])
data['Zip_D'] = label_encoder_service.fit_transform(data['Zip_D'])

# Separate features and target variable
X = data.drop('late', axis=1)
y = data['late']

# Split the data for cross-validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost classifier
model = XGBClassifier(eval_metric='logloss')

# Hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters and best score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_}')

# Fit the model on the entire training set with best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Predictions
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# AUC
auc = roc_auc_score(y_test, y_pred_proba)
print(f'AUC: {auc}')

# Classification report
class_report = classification_report(y_test, y_pred)
print(f'Classification Report:\n{class_report}')

# Feature importance
feature_importance = best_model.feature_importances_
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

print(importance_df)

# SHAP Analysis for ServiceTypeCode and MailClassCode
# Initialize and setup the SHAP explainer with the model
explainer = shap.TreeExplainer(best_model)

# SHAP values for the test set
shap_values = explainer.shap_values(X_test)

# SHAP values for the ServiceTypeCode feature
servicetype_shap_values = shap_values[:, list(X.columns).index('ServiceTypeCode')]

# DataFrame for the ServiceTypeCode SHAP values
servicetype_shap_df = pd.DataFrame({
    'ServiceTypeCode': X_test['ServiceTypeCode'],
    'SHAP Value': servicetype_shap_values
})

# Map back to original ServiceTypeCode values
servicetype_shap_df['ServiceTypeCode'] = servicetype_shap_df['ServiceTypeCode'].map(encoded_to_original_service_map)

# Mean SHAP value for each ServiceTypeCode
mean_shap_values_service = servicetype_shap_df.groupby('ServiceTypeCode')['SHAP Value'].mean().sort_values(ascending=False)
print("Mean SHAP values for ServiceTypeCode:")
print(mean_shap_values_service)

# Get the SHAP values for the MailClassCode feature
mailclass_shap_values = shap_values[:, list(X.columns).index('MailClassCode')]

# DataFrame for the MailClassCode SHAP values
mailclass_shap_df = pd.DataFrame({
    'MailClassCode': X_test['MailClassCode'],
    'SHAP Value': mailclass_shap_values
})

# Map back to original MailClassCode values
mailclass_shap_df['MailClassCode'] = mailclass_shap_df['MailClassCode'].map(encoded_to_original_mail_map)

# Mean SHAP value for each MailClassCode
mean_shap_values_mail = mailclass_shap_df.groupby('MailClassCode')['SHAP Value'].mean().sort_values(ascending=False)
print("Mean SHAP values for MailClassCode:")
print(mean_shap_values_mail)


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.9; total time=   0.4s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.7; total time=   0.2s
[CV] END

[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.7; total time=   0.4s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=0.7; total time=   0.4s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=200, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estima

[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.9; total time=   0.3s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=0.9; total time=   0.4s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=200, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estima

[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.7; total time=   0.4s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.7; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.9; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=100,

[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.7; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.9; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=200, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=300,

[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=200, subsample=0.7; total time=   0.4s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=200, subsample=0.9; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=300, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=300, subsample=0.9; total time=   0.6s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=100,

[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=200, subsample=0.7; total time=   0.4s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=200, subsample=0.9; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=300, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=300, subsample=0.9; total time=   0.6s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.7; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=200,

Accuracy: 0.8940302770636961
AUC: 0.9326720180435
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      2804
           1       0.82      0.60      0.69       697

    accuracy                           0.89      3501
   macro avg       0.86      0.78      0.81      3501
weighted avg       0.89      0.89      0.89      3501

                 Feature  Importance
12                PRCP_D    0.128770
1          MailClassCode    0.127190
3     time_delta_minutes    0.116278
0        ServiceTypeCode    0.089193
5                 TMIN_O    0.087774
10                TMIN_D    0.085041
7                 PRCP_O    0.067663
9                  Zip_D    0.061041
11                TMAX_D    0.059322
4                  Zip_O    0.050921
8                 SNOW_O    0.046543
13                SNOW_D    0.040666
6                 TMAX_O    0.039598
2   Distinct_event_scans    0.000000
Mean SHAP values for ServiceTypeCode:
Service