In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import xgboost as xgb

In [4]:
from sklearn.model_selection import train_test_split,GridSearchCV

In [5]:
from sklearn.preprocessing import StandardScaler

In [7]:
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score

In [13]:
import matplotlib.pyplot as plt

In [15]:
%matplotlib inline

In [17]:
import seaborn as sns

In [163]:
def load_and_preprocess_data(file_path):
    
    df = pd.read_csv(file_path)

    
    numeric_cols = df.select_dtypes(include=['number']).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

    
    categorical_cols = ['Customer demographics','Product type']
    #for col in categorical_cols:
        #if col in df.columns:
          #  df[col].fillna('Unkown',inplace=True)
    print("Availability unique:", df['Availability'].unique())
    print("Stock levels min:", df['Stock levels'].min())
    print("Lead times median:", df['Lead times'].median())
    print("Lead times sample values:", df['Lead times'].sample(5))

        
    
    if 'Availability' in df.columns and 'Stock levels' in df.columns and 'Lead times' in df.columns:
        df['disruption'] = ((df['Availability'] <20) & 
                            (df['Stock levels'] < 20) & 
                            (df['Lead times'] > df['Lead times'].median())).astype(int)
    else:
        raise ValueError("Required columns for target generation are missing.")

    
    df = pd.get_dummies(df, drop_first=True)
    bool_cols = df.select_dtypes(include='bool').columns
    df[bool_cols] = df[bool_cols].astype(int)


    return df


In [165]:
"""def load_and_preprocess_data(file_path):
    df=pd.read_csv(file_path)
    df.fillna(df.median(),inplace=True)
    df['Customer demographics'].fillna('Unknown',inplace=True)
    df['disruption']= ((df['Availability']==0)&
                       (df['Stock levels']<20)&
                       (df['Lead times']>df['Lead times'].median())).astype(int)
    df=pd.get_dummies(df,columns =['Product type','SKU','Cusomer demographics'],drop_first=True)
    return df"""


"def load_and_preprocess_data(file_path):\n    df=pd.read_csv(file_path)\n    df.fillna(df.median(),inplace=True)\n    df['Customer demographics'].fillna('Unknown',inplace=True)\n    df['disruption']= ((df['Availability']==0)&\n                       (df['Stock levels']<20)&\n                       (df['Lead times']>df['Lead times'].median())).astype(int)\n    df=pd.get_dummies(df,columns =['Product type','SKU','Cusomer demographics'],drop_first=True)\n    return df"

In [167]:
def train_model(X_train,Y_train):
    X_train = X_train.astype(np.float32)
    Y_train = Y_train.astype(int)
    print("Train:", Y_train.value_counts())
    
    #print("Y_train distribution:\n", Y_train.value_counts())

    print(np.unique(Y_train))

    model = xgb.XGBClassifier(
        objective = 'binary:logistic',
        eval_metric='logloss',
        max_depth=6,
        learning_rate=0.05,
        n_estimators=100,
        n_jobs=-1,
        verbosity=0,
        base_score=0.5
    
    )

    param_grid = {
        'max_depth':[3,5,7],
        'learning_rate':[0.01,0.05,0.1],
        'n_estimators':[100,200],
        'subsample':[0.8,1.0],
        'colsample_bytree':[0.8,1.0]
    }
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        n_jobs=-1,
        verbose=1,
        scoring='accuracy',
        error_score='raise'
    )
    
    grid_search.fit(X_train,Y_train)

    best_model = grid_search.best_estimator_
    return best_model

In [169]:
def evaluate_model(model,X_test,Y_test):
    y_pred =model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:,1]
    print("Test:", Y_test.value_counts())

    cm = confusion_matrix(Y_test,y_pred)
    print("Confusion Matrix :")
    print(cm)

    print("\n Classification Report :")
    print(classification_report(Y_test,y_pred))

    roc_auc = roc_auc_score(Y_test,y_pred_prob)
    print(f"\n ROC AUC Score:{roc_auc:.4f}")

    return cm,roc_auc

In [173]:
def plot_feature_importance(model):
    xgb.plot_importance(model,importance_type='weight')
    plt.title("Feature Importance for Supply Chain Disruption Prediction")
    plt.tight_layout()
    plt.show()
    importance = model.get_booster().get_score(importance_type='weight')
    if not importance:
        print("Still no importance found â€” check data quality again.")
    else:
        print("Feature importances:", importance)
    

In [179]:
def main(file_path):
    df = load_and_preprocess_data(file_path)
    #print(df.dtypes)
   # print(df.isnull().sum())
    print("Target value counts:\n", df['disruption'].value_counts())
    
    feature_cols = [col for col in df.columns if col not in['disruption']]

    x=df[feature_cols]
    y=df['disruption']
    assert all(np.issubdtype(x[col].dtype, np.number) for col in x.columns), "Non-numeric data found!"
    print("Unique values in target:", np.unique(y))
    X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)
    scaler= StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=x.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=x.columns)

    model=train_model(X_train,Y_train)

    cm,roc_auc =evaluate_model(model,X_test,Y_test)

    plot_feature_importance(model)

    return model,cm,roc_auc

In [181]:
if __name__=='__main__':
    file_path = '/Users/keerthika/Downloads/suply_chain.csv'
    model,cm,roc_auc=main(file_path)

    print("Model Trained and Executed Successfully")
    

Availability unique: [ 55  95  34  68  26  87  48  59  78  35  11  41   5  94  74  82  23 100
  22  60  30  32  73   9  42  12   3  10  28  43  63  96  75  97  98   6
   1  93  19  91  61  16  90  65  81  89  72  52  29  62  14  88  64  50
  56  13  99  83  18  24  58  44  17]
Stock levels min: 0
Lead times median: 17.0
Lead times sample values: 35    27
79    25
33    17
37     8
25    11
Name: Lead times, dtype: int64
Target value counts:
 disruption
0    100
Name: count, dtype: int64
Unique values in target: [0]
Train: disruption
0    80
Name: count, dtype: int64
[0]
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Test: disruption
0    20
Name: count, dtype: int64
Confusion Matrix :
[[20]]

 Classification Report :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1



ValueError: Booster.get_score() results in empty.  This maybe caused by having all trees as decision dumps.