In [1]:
# Import ML libraries
import pycaret
import xgboost
import numpy as np

# Snowpark for Python
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import datediff, to_date, col, expr

# Import Misc
import json
import pandas as pd
from pycaret.classification import setup, compare_models, tune_model, plot_model, interpret_model, create_model

In [2]:
# Create Snowflake Session object
connection_parameters = json.load(open("connection.json"))
session = Session.builder.configs(connection_parameters).create()

In [3]:
# Connecting to the correct table
tableName = 'PURCHASE_ORDER_HISTORY'
dataframe = session.table(tableName)

# Calculation to find the lag between Planned Delivery from Actual Delivery
dataframe = dataframe.withColumn("target_feature",
                                    datediff('day', 
                                            col("DELIVERY_DATE_ML"), 
                                            col("FIRST_GR_POSTING_DATE_ML")))


# Example: Selecting specific columns
# This selects only a subset of columns. Adjust the column names as needed.
filtered_dataframe = dataframe.select(
    col("PURCHASE_DOCUMENT_ITEM_ID"), # ID for purchase order
    col("CREATE_DATE_ML"),            # day purchase order was created
    col("COMPANY_CODE_ID"),           # copmany w/in INVISTA making purchase
    col("VENDOR_ID"),                 # ID of the vendor "we" are purchasing from
    col("POSTAL_CD"),                 # postal code associated w company code ID
    col("MATERIAL_ID"),               # ID of material being purchase
    col("SUB_COMMODITY_DESC"),        # description of sub commodity
    col("MRP_TYPE_ID"),               # determined if material is reordered manually or automatically
    col("PLANT_ID"),                  # ID of plant making purchase
    col("REQUESTED_DELIVERY_DATE_ML"),# delivery date from requisition
    col("INBOUND_DELIVERY_ID"),       # ID for delivery
    col("INBOUND_DELIVERY_ITEM_ID"),  # ID of item w/in delivery
    col("PLANNED_DELIVERY_DAYS"),     # Amount of days expected to take
    col("FIRST_GR_POSTING_DATE_ML"),  # expected delivery date        
    col("target_feature")             # Lag between Planned Delivery from Actual Delivery 
)


# Print a sample of the filtered dataframe to standard output.
filtered_dataframe.show()

# Optionally, you might want to filter rows based on some conditions
# Example: Filtering out rows where FIRST_GR_POSTING_DATE_ML is NULL
filtered_dataframe = filtered_dataframe.filter(col("FIRST_GR_POSTING_DATE_ML").is_not_null())

# Show the DataFrame after filtering
filtered_dataframe.show()

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"PURCHASE_DOCUMENT_ITEM_ID"  |"CREATE_DATE_ML"  |"COMPANY_CODE_ID"  |"VENDOR_ID"  |"POSTAL_CD"  |"MATERIAL_ID"  |"SUB_COMMODITY_DESC"                    |"MRP_TYPE_ID"  |"PLANT_ID"  |"REQUESTED_DELIVERY_DATE_ML"  |"INBOUND_DELIVERY_ID"  |"INBOUND_DELIVERY_ITEM_ID"  |"PLANNED_DELIVERY_DAYS"  |"FIRST_GR_POSTING_DATE_ML"  |"TARGET_FEATURE"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [4]:
# Assuming 'filtered_dataframe' is the DataFrame you've prepared in Snowflake
# Convert the Snowpark DataFrame to a Pandas DataFrame with consideration for NULL values

# Convert DataFrame to Pandas, handling NULL values by allowing float conversion
df = filtered_dataframe.fillna(0).to_pandas()  # This replaces NULL with 0 before conversion

Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "CREATE_DATE_ML", Type: DateType(), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "COMPANY_CODE_ID", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "VENDOR_ID", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "POSTAL_CD", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "SUB_COMMODITY_DESC", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. 

In [5]:
df.head()

Unnamed: 0,PURCHASE_DOCUMENT_ITEM_ID,CREATE_DATE_ML,COMPANY_CODE_ID,VENDOR_ID,POSTAL_CD,MATERIAL_ID,SUB_COMMODITY_DESC,MRP_TYPE_ID,PLANT_ID,REQUESTED_DELIVERY_DATE_ML,INBOUND_DELIVERY_ID,INBOUND_DELIVERY_ITEM_ID,PLANNED_DELIVERY_DAYS,FIRST_GR_POSTING_DATE_ML,TARGET_FEATURE
0,10,2021-02-10,US39,8010000984,60045-5202,0,Tools,0,4120,2021-02-15,0,0,0.0,2021-02-11,-4
1,20,2021-02-10,US10,8010000984,60045-5202,0,Machinery & Equipment,0,4014,2021-02-14,185520728,20,1.0,2021-02-12,-2
2,20,2021-02-10,CA10,V4138,19973,2100032775,Tolling,1,4007,2021-02-16,185529650,10,2.0,2021-02-16,0
3,10,2021-02-10,CA10,8010002419,H9J 4A1,1100119629,Valves,1,4036,2021-02-24,0,0,14.0,2021-04-06,41
4,30,2021-02-10,CA10,8010002454,K7L 4Y5,3100006053,Tubes & Cores,1,4036,2021-02-09,0,0,1.0,2021-02-09,0


In [6]:
# Transform 'target_feature' to binary: 0 if value is positive, 1 if value is negative
df['binary_target'] = df['TARGET_FEATURE'].apply(lambda x: 0 if x > 0 else 1)

In [7]:
# Remove the original 'target_feature' column from the DataFrame
df = df.drop(columns=['TARGET_FEATURE'])


In [8]:
# Setup the PyCaret environment for classification with the binary target
clf_setup = setup(data=df, target='binary_target', session_id=123, use_gpu=True, n_jobs=10)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,binary_target
2,Target type,Binary
3,Original data shape,"(1009116, 15)"
4,Transformed data shape,"(1009116, 15)"
5,Transformed train set shape,"(706381, 15)"
6,Transformed test set shape,"(302735, 15)"
7,Numeric features,6
8,Categorical features,8
9,Rows with missing values,22.6%


In [9]:
# Correctly creating a Random Forest model
rf_model = create_model('rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8752,0.951,0.8986,0.8804,0.8894,0.7461,0.7463
1,0.8729,0.9516,0.9013,0.875,0.8879,0.7412,0.7417
2,0.8728,0.9516,0.8988,0.8766,0.8876,0.7412,0.7415
3,0.8753,0.9523,0.9004,0.8793,0.8897,0.7463,0.7466
4,0.8723,0.9509,0.9009,0.8743,0.8874,0.7399,0.7404
5,0.873,0.9513,0.9004,0.8758,0.8879,0.7415,0.7419
6,0.8739,0.951,0.8989,0.8783,0.8885,0.7436,0.7438
7,0.8728,0.9503,0.8995,0.8762,0.8877,0.7412,0.7416
8,0.8743,0.9521,0.9003,0.8779,0.889,0.7443,0.7446
9,0.8743,0.9519,0.9004,0.8777,0.8889,0.7442,0.7445


In [10]:
# Tuning the Random Forest model
tuned_rf = tune_model(rf_model, optimize='Accuracy', n_iter=50)

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 50 candidates, totalling 500 fits


In [None]:
# Feature Importance
plot_model(tuned_rf, plot='feature')

# SHAP Values
plot_model(tuned_rf, plot='shap')

# Confusion Matrix
plot_model(tuned_rf, plot='confusion_matrix')

# AUC
plot_model(tuned_rf, plot='auc')

# Precision-Recall Curve
plot_model(tuned_rf, plot='pr')

# Global Interpretation
interpret_model(tuned_rf)

# Local Interpretation for a specific observation
interpret_model(tuned_rf, observation=10)  # Ensure you have a specific observation to pass here