In [5]:
# Import ML libraries
import pycaret
import xgboost

# Snowpark for Python
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import datediff, to_date, col, expr

# Import Misc
import json
import pandas as pd
from pycaret.classification import setup, compare_models


In [6]:
# Create Snowflake Session object
connection_parameters = json.load(open("connection.json"))
session = Session.builder.configs(connection_parameters).create()

In [10]:
# Connecting to the correct table
tableName = 'PURCHASE_ORDER_HISTORY'
dataframe = session.table(tableName)

# Calculation to find the lag between Planned Delivery from Actual Delivery
dataframe = dataframe.withColumn("target_feature",
                                    datediff('day', 
                                            col("DELIVERY_DATE_ML"), 
                                            col("FIRST_GR_POSTING_DATE_ML")))


# Example: Selecting specific columns
# This selects only a subset of columns. Adjust the column names as needed.
filtered_dataframe = dataframe.select(
    col("PURCHASE_DOCUMENT_ITEM_ID"), # ID for purchase order
    col("CREATE_DATE_ML"),            # day purchase order was created
    col("COMPANY_CODE_ID"),           # copmany w/in INVISTA making purchase
    col("VENDOR_ID"),                 # ID of the vendor "we" are purchasing from
    col("POSTAL_CD"),                 # postal code associated w company code ID
    col("MATERIAL_ID"),               # ID of material being purchase
    col("SUB_COMMODITY_DESC"),        # description of sub commodity
    col("MRP_TYPE_ID"),               # determined if material is reordered manually or automatically
    col("PLANT_ID"),                  # ID of plant making purchase
    col("REQUESTED_DELIVERY_DATE_ML"),# delivery date from requisition
    col("INBOUND_DELIVERY_ID"),       # ID for delivery
    col("INBOUND_DELIVERY_ITEM_ID"),  # ID of item w/in delivery
    col("PLANNED_DELIVERY_DAYS"),     # Amount of days expected to take
    col("FIRST_GR_POSTING_DATE_ML"),  # expected delivery date        
    col("target_feature")             # Lag between Planned Delivery from Actual Delivery 
)


# Print a sample of the filtered dataframe to standard output.
filtered_dataframe.show()

# Optionally, you might want to filter rows based on some conditions
# Example: Filtering out rows where FIRST_GR_POSTING_DATE_ML is NULL
filtered_dataframe = filtered_dataframe.filter(col("FIRST_GR_POSTING_DATE_ML").is_not_null())

# Show the DataFrame after filtering
filtered_dataframe.show()

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"PURCHASE_DOCUMENT_ITEM_ID"  |"CREATE_DATE_ML"  |"COMPANY_CODE_ID"  |"VENDOR_ID"  |"POSTAL_CD"  |"MATERIAL_ID"  |"SUB_COMMODITY_DESC"                    |"MRP_TYPE_ID"  |"PLANT_ID"  |"REQUESTED_DELIVERY_DATE_ML"  |"INBOUND_DELIVERY_ID"  |"INBOUND_DELIVERY_ITEM_ID"  |"PLANNED_DELIVERY_DAYS"  |"FIRST_GR_POSTING_DATE_ML"  |"TARGET_FEATURE"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
# Assuming 'filtered_dataframe' is the DataFrame you've prepared in Snowflake
# Convert the Snowpark DataFrame to a Pandas DataFrame with consideration for NULL values

# Convert DataFrame to Pandas, handling NULL values by allowing float conversion
df = filtered_dataframe.fillna(0).to_pandas()  # This replaces NULL with 0 before conversion

Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "CREATE_DATE_ML", Type: DateType(), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "COMPANY_CODE_ID", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "VENDOR_ID", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "POSTAL_CD", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "SUB_COMMODITY_DESC", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. 

In [13]:
df.head()

Unnamed: 0,PURCHASE_DOCUMENT_ITEM_ID,CREATE_DATE_ML,COMPANY_CODE_ID,VENDOR_ID,POSTAL_CD,MATERIAL_ID,SUB_COMMODITY_DESC,MRP_TYPE_ID,PLANT_ID,REQUESTED_DELIVERY_DATE_ML,INBOUND_DELIVERY_ID,INBOUND_DELIVERY_ITEM_ID,PLANNED_DELIVERY_DAYS,FIRST_GR_POSTING_DATE_ML,TARGET_FEATURE
0,10,2018-09-07,CN15,V4014,29078,2100007708,Custom Manufacturing,1,1016,2018-11-16,183615169,900001,52.0,2018-11-22,6
1,10,2018-09-07,CA10,8010003146,L6L 6R2,1100125572,Piping & Tubing,1,4036,2018-10-01,0,0,24.0,2018-12-05,23
2,180,2018-09-07,CA10,8010005836,N2C 0B7,0,Material Handling,0,4036,2018-09-08,0,0,0.0,2018-09-18,10
3,60,2018-09-07,CA10,8010005836,N2C 0B7,0,Material Handling,0,4036,2018-09-08,0,0,0.0,2018-09-18,10
4,20,2018-09-07,CA10,8010003152,L5W 0A1,1100181639,Safety Supplies,1,4036,2018-09-21,0,0,11.0,2018-09-13,-8


In [16]:
# Transform 'target_feature' to binary: 0 if value is positive, 1 if value is negative
df['binary_target'] = df['TARGET_FEATURE'].apply(lambda x: 0 if x > 0 else 1)

In [17]:
df['binary_target']

0          0
1          0
2          0
3          0
4          1
          ..
1009111    0
1009112    1
1009113    0
1009114    0
1009115    0
Name: binary_target, Length: 1009116, dtype: int64

In [19]:
# Remove the original 'target_feature' column from the DataFrame
df = df.drop(columns=['TARGET_FEATURE'])


In [21]:
# Setup the PyCaret environment for classification with the binary target
clf_setup = setup(data=df, target='binary_target', session_id=123)

# Proceed with comparing models, model training, and evaluation
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,binary_target
2,Target type,Binary
3,Original data shape,"(1009116, 15)"
4,Transformed data shape,"(1009116, 15)"
5,Transformed train set shape,"(706381, 15)"
6,Transformed test set shape,"(302735, 15)"
7,Numeric features,6
8,Categorical features,8
9,Rows with missing values,22.6%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8739,0.9516,0.8998,0.8775,0.8885,0.7433,0.7436,42.333
dt,Decision Tree Classifier,0.8441,0.8416,0.8629,0.8586,0.8608,0.6835,0.6836,1.97
knn,K Neighbors Classifier,0.7361,0.8129,0.7805,0.7553,0.7677,0.4624,0.4628,6.972
gbc,Gradient Boosting Classifier,0.7175,0.796,0.7692,0.7368,0.7526,0.4238,0.4243,41.303
ada,Ada Boost Classifier,0.7001,0.7722,0.768,0.7159,0.741,0.3859,0.3873,10.953
ridge,Ridge Classifier,0.6984,0.0,0.7848,0.7073,0.744,0.3793,0.3823,0.699
lda,Linear Discriminant Analysis,0.6984,0.7655,0.784,0.7076,0.7439,0.3794,0.3824,1.029
qda,Quadratic Discriminant Analysis,0.6521,0.738,0.5368,0.7708,0.6328,0.3218,0.341,2.564
nb,Naive Bayes,0.583,0.5412,0.9424,0.5777,0.7163,0.077,0.1232,0.688
lr,Logistic Regression,0.5761,0.5581,0.8534,0.5823,0.6923,0.0838,0.1013,3.652


Processing:   0%|          | 0/65 [00:00<?, ?it/s]