In [None]:
# Import ML libraries
import pycaret
import xgboost
import numpy as np

# Snowpark for Python
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import datediff, to_date, col, expr

# Import Misc
import json
import pandas as pd
from pycaret.classification import setup, compare_models, tune_model, plot_model, interpret_model, create_model


In [None]:
# Create Snowflake Session object
# The connection settings are read inside a context manager so the file
# handle is closed as soon as the JSON has been parsed (a bare
# json.load(open(...)) leaves the handle open until garbage collection).
with open("connection.json") as connection_file:
    connection_parameters = json.load(connection_file)

# Build the Snowpark session from the parsed settings.
session = Session.builder.configs(connection_parameters).create()

In [None]:
# Point a Snowpark DataFrame at the purchase-order history table.
tableName = 'PURCHASE_ORDER_HISTORY'
dataframe = session.table(tableName)

# Label: number of days from DELIVERY_DATE_ML (planned delivery) to
# FIRST_GR_POSTING_DATE_ML (treated here as the actual delivery), i.e. the
# second date minus the first, following Snowflake DATEDIFF semantics.
delivery_lag_days = datediff('day',
                             col("DELIVERY_DATE_ML"),
                             col("FIRST_GR_POSTING_DATE_ML"))
dataframe = dataframe.withColumn("target_feature", delivery_lag_days)

# Columns carried forward into modelling. Adjust the names as needed.
selected_column_names = [
    "PURCHASE_DOCUMENT_ITEM_ID",   # ID for purchase order
    "CREATE_DATE_ML",              # day purchase order was created
    "COMPANY_CODE_ID",             # company w/in INVISTA making purchase
    "VENDOR_ID",                   # ID of the vendor "we" are purchasing from
    "POSTAL_CD",                   # postal code associated w company code ID
    "MATERIAL_ID",                 # ID of material being purchased
    "SUB_COMMODITY_DESC",          # description of sub commodity
    "MRP_TYPE_ID",                 # determines if material is reordered manually or automatically
    "PLANT_ID",                    # ID of plant making purchase
    "REQUESTED_DELIVERY_DATE_ML",  # delivery date from requisition
    "INBOUND_DELIVERY_ID",         # ID for delivery
    "INBOUND_DELIVERY_ITEM_ID",    # ID of item w/in delivery
    "PLANNED_DELIVERY_DAYS",       # amount of days expected to take
    "FIRST_GR_POSTING_DATE_ML",    # first goods-receipt posting date (used above as the actual delivery date)
    "target_feature",              # lag between planned delivery and actual delivery
]
filtered_dataframe = dataframe.select([col(name) for name in selected_column_names])

# Print a sample of the selected columns to standard output.
filtered_dataframe.show()

# Keep only rows that have a goods-receipt date; without one the lag
# cannot be computed.
has_goods_receipt = col("FIRST_GR_POSTING_DATE_ML").is_not_null()
filtered_dataframe = filtered_dataframe.filter(has_goods_receipt)

# Print a sample again, now that rows without a receipt date are gone.
filtered_dataframe.show()

In [None]:
# Assuming 'filtered_dataframe' is the Snowpark DataFrame prepared in Snowflake.
# Convert it to a pandas DataFrame, taking care of NULL values first.

# Drop rows whose label is NULL (the day difference is NULL when either
# date is missing). Without this step the fillna(0) below would turn an
# unknown lag into a "0 days" label and silently corrupt the target.
labelled_dataframe = filtered_dataframe.filter(col("target_feature").is_not_null())

# Replace the remaining NULLs with 0 before pulling the data into pandas.
# NOTE(review): per the Snowpark docs a scalar fill value is only applied to
# columns of a matching (numeric) type, so NULL strings/dates are left
# untouched - confirm this is the intended handling for those columns.
df = labelled_dataframe.fillna(0).to_pandas()

In [None]:
# Preview the first rows of the pandas DataFrame pulled from Snowflake.
df.head()

In [None]:
# Keep only rows whose lag lies within -100..100 days (both ends inclusive).
# .copy() makes df_filtered an independent frame rather than a slice of df,
# so adding columns to it later neither raises SettingWithCopyWarning nor
# depends on pandas' view/copy rules.
df_filtered = df[(df['TARGET_FEATURE'] <= 100) & (df['TARGET_FEATURE'] >= -100)].copy()


In [None]:
# Preview the rows that remain after restricting the lag range.
df_filtered.head()

In [None]:
# Bin edges every 10 days from -100 to 100, giving twenty 10-day bins.
bins = list(range(-100, 101, 10))

# One label per bin, named after the whole-day values it holds. Bins are
# left-closed (right=False below), so the bin starting at i covers i .. i+9.
labels = [f'{i} to {i+9}' for i in bins[:-1]]  # the last edge only closes the final bin

# Categorize the lag into bins. assign() returns a new frame instead of
# writing a column into a filtered slice, which avoids SettingWithCopyWarning.
# Values outside [-100, 100) - including a lag of exactly 100 - fall in no
# bin and receive NaN.
df_filtered = df_filtered.assign(
    time_slot=pd.cut(df_filtered['TARGET_FEATURE'], bins=bins, labels=labels, right=False)
)

In [None]:
# Discard rows that were not assigned a time_slot (NaN in that column).
df_filtered = df_filtered[df_filtered['time_slot'].notna()]

In [None]:
# Show how many rows fall in each time_slot bin (class balance of the label).
df_filtered['time_slot'].value_counts()

In [None]:
# Preview the frame again, now including the time_slot column.
df_filtered.head()

In [None]:
# The numeric lag is now represented by 'time_slot'; drop the raw
# 'TARGET_FEATURE' column so it cannot be used as a model input.
df_filtered = df_filtered.drop('TARGET_FEATURE', axis='columns')


In [None]:
# Set up the PyCaret environment for classification on the binned lag.
# FIRST_GR_POSTING_DATE_ML is the actual receipt date the label was computed
# from; it is not known until the delivery has happened, so leaving it in as
# a feature leaks the outcome into the model. It is excluded here.
# NOTE(review): the ID columns (e.g. PURCHASE_DOCUMENT_ITEM_ID,
# INBOUND_DELIVERY_ID) may also be unsuitable as features - confirm.
clf_setup = setup(
    data=df_filtered,
    target='time_slot',
    ignore_features=['FIRST_GR_POSTING_DATE_ML'],
    session_id=123,   # fixed seed for reproducible splits and models
    use_gpu=True,
    n_jobs=10,
)


In [None]:
# Candidate estimators, by PyCaret model ID:
#   'rf' - Random Forest
#   'et' - Extra Trees
#   'dt' - Decision Tree
model_ids = ['rf', 'et', 'dt']

# Cross-validate only the candidates above (5 folds, scores rounded to
# 4 decimals) and keep what compare_models returns as the best one.
best_model = compare_models(
    include=model_ids,
    fold=5,
    round=4,
)

In [1]:
# Train each candidate on its own, in the order rf -> et -> dt, so that every
# fitted model is available under its own name.
rf_model, et_model, dt_model = (
    create_model(model_id) for model_id in ('rf', 'et', 'dt')
)

In [None]:
# Draw the confusion matrix for each fitted model (rf, then et, then dt).
for fitted_model in (rf_model, et_model, dt_model):
    plot_model(fitted_model, plot='confusion_matrix')

In [None]:
# Confusion matrices for the three fitted models.
# NOTE(review): this cell draws the same three confusion-matrix plots as the
# cell before it - confirm whether a different plot type was intended here
# or whether the cell can be removed.
plot_model(rf_model, plot='confusion_matrix')
plot_model(et_model, plot='confusion_matrix')
plot_model(dt_model, plot='confusion_matrix')