In [31]:
# Import ML libraries
import pycaret
import xgboost

# Snowpark for Python
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import datediff, to_date, col, expr, sproc

# Import Misc
import json
import pandas as pd
from pycaret.regression import setup, compare_models


In [32]:
import psutil
def cpu_count_patched(logical=False):
    import os
    return os.cpu_count()
psutil.cpu_count = cpu_count_patched

In [33]:
# Create Snowflake Session object
connection_parameters = json.load(open("connection.json"))

session = Session.builder.configs(connection_parameters).create()

In [34]:

# Connecting to the correct table
tableName = 'PURCHASE_ORDER_HISTORY'
dataframe = session.table(tableName)

# Calculation to find the lag between Planned Delivery from Actual Delivery
dataframe = dataframe.withColumn("target_feature",
                                    datediff('day', 
                                            col("DELIVERY_DATE_ML"), 
                                            col("FIRST_GR_POSTING_DATE_ML")))

# Example: Selecting specific columns
# This selects only a subset of columns. Adjust the column names as needed.
filtered_dataframe = dataframe.select(
    col("PURCHASE_DOCUMENT_ITEM_ID"), # ID for purchase order
    col("VENDOR_ID"),                 # ID of the vendor "we" are purchasing from
    col("POSTAL_CD"),                 # postal code associated w company code ID
    col("PLANT_ID"),                  # ID of plant making purchase
    col("MATERIAL_ID"),               # ID of material being purchase
    col("MRP_TYPE_ID"),               # determined if material is reordered manually or automatically
    col("COMPANY_CODE_ID"),           # copmany w/in INVISTA making purchase
    col("SUB_COMMODITY_DESC"),        # description of sub commodity
    col("INBOUND_DELIVERY_ID"),       # ID for delivery
    col("INBOUND_DELIVERY_ITEM_ID"),  # ID of item w/in delivery
    col("CREATE_DATE_ML"),            # date product was made
    col("FIRST_GR_POSTING_DATE_ML"),  # expected delivery date
    col("REQUESTED_DELIVERY_DATE_ML"),# delivery date from requisition 
    col("PLANNED_DELIVERY_DAYS"),     # expected days to be delivered
    col("target_feature")             # Lag between Planned Delivery from Actual Delivery 
)

# Print a sample of the filtered dataframe to standard output.
filtered_dataframe.show()

# # Optionally, you might want to filter rows based on some conditions
# # Example: Filtering out rows where FIRST_GR_POSTING_DATE_ML is NULL
# filtered_dataframe = filtered_dataframe.filter(col("FIRST_GR_POSTING_DATE_ML").is_not_null())

# Show the DataFrame after filtering
filtered_dataframe.show()

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"PURCHASE_DOCUMENT_ITEM_ID"  |"VENDOR_ID"  |"POSTAL_CD"  |"PLANT_ID"  |"MATERIAL_ID"  |"MRP_TYPE_ID"  |"COMPANY_CODE_ID"  |"SUB_COMMODITY_DESC"                    |"INBOUND_DELIVERY_ID"  |"INBOUND_DELIVERY_ITEM_ID"  |"CREATE_DATE_ML"  |"FIRST_GR_POSTING_DATE_ML"  |"REQUESTED_DELIVERY_DATE_ML"  |"PLANNED_DELIVERY_DAYS"  |"TARGET_FEATURE"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [35]:
#  Assuming 'filtered_dataframe' is the DataFrame you've prepared in Snowflake
# Convert the Snowpark DataFrame to a Pandas DataFrame with consideration for NULL values

# Convert DataFrame to Pandas, handling NULL values by allowing float conversion
df = filtered_dataframe.fillna(0).to_pandas()  # This replaces NULL with 0 before conversion

Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "VENDOR_ID", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "POSTAL_CD", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "COMPANY_CODE_ID", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "SUB_COMMODITY_DESC", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "CREATE_DATE_ML", Type: DateType(), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. 

Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "PLANNED_DELIVERY_DAYS", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>


In [36]:
df.head()

Unnamed: 0,PURCHASE_DOCUMENT_ITEM_ID,VENDOR_ID,POSTAL_CD,PLANT_ID,MATERIAL_ID,MRP_TYPE_ID,COMPANY_CODE_ID,SUB_COMMODITY_DESC,INBOUND_DELIVERY_ID,INBOUND_DELIVERY_ITEM_ID,CREATE_DATE_ML,FIRST_GR_POSTING_DATE_ML,REQUESTED_DELIVERY_DATE_ML,PLANNED_DELIVERY_DAYS,TARGET_FEATURE
0,20,,,4007,2100021412,1,CA10,Tolling,185610163,20,2021-03-30,2021-03-30,2021-03-31,0.0,-1
1,10,8010094262.0,201799.0,1032,0,0,CN20,Telecommunications media services,0,0,2021-03-30,,2021-03-31,0.0,0
2,20,8010098163.0,200333.0,1032,0,0,CN20,Maintenance Services,0,0,2021-03-30,2021-05-11,2021-04-06,0.0,35
3,10,8010019798.0,201604.0,1026,0,0,CN16,Power Generation Equipment,0,0,2021-03-30,2021-04-09,2021-04-06,0.0,3
4,10,8010099718.0,30165.0,4016,2300006415,1,US10,Custom Manufacturing,185639199,10,2021-03-30,2021-04-13,2021-04-09,5.0,4


In [37]:
# Setup the PyCaret environment for classification with the binary target
clf_setup = setup(data=df, target='TARGET_FEATURE', session_id=123, use_gpu=True)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
stage_name = 'Carret'

_ = session.sql('CREATE STAGE IF NOT EXISTS '+ stage_name).collect()

In [None]:
@sproc(name='forecast_order_sproc', 
       stage_location='@Carret', 
       packages=['snowflake-snowpark-python','pycaret','pandas', 'xgboost'],
       is_permanent=True, 
       replace=True, 
       session=session,
       execute_as='caller')
def forecast_order(session: Session, forecast_id: str) -> None:
   from pycaret.time_series import RegressionExperiment
   import pandas as pd
   import os

   # Create a Snowpark DataFrame, filter and convert to pandas series
   #  snowpark_df = session.table('PURCHASE_ORDER_HISTORY').filter(col('TIME_SERIES') == 'store_1_item_1')
   #  pandas_df = snowpark_df.to_pandas()
   #  pandas_df['DATE'] = pd.to_datetime(pandas_df['DATE'])

   #  #setup and run the experiment
   #  all_ts = pandas_df['TIME_SERIES'].unique()

   all_ts = filtered_dataframe.select('TIME_SERIES').distinct().collect()

   for i in all_ts:
      df_subset = pandas_df[pandas_df['TIME_SERIES'] == i]      
      #setup and run the experiment
      exp = RegressionExperiment()
      s = exp.setup(df_subset, target = 'TARGET_FEATURE', train_size = 0.95,
                     data_split_shuffle = False, fold_strategy = 'timeseries', fold = 3,
                     # ignore_features = ['DATE', 'TIME_SERIES'],
                     # numeric_features = ['DAY_OF_YEAR', 'YEAR'],
                     # categorical_features = ['MONTH', 'DAY_OF_WEEK'],
                     preprocess=True,
                     verbose = True,
                     use_gpu=False,
                     log_experiment = False, experiment_name = i, log_plots = False)
      
      best_model = exp.compare_models(sort = 'MAE', verbose=True)       
           
      #save best model
      exp.save_model(best_model, f'forecast_model_{i}_{forecast_id}')
      
      session.file.put(
         f"/tmp/forecast_model_{i}_{forecast_id}.pkl",
         "@PYCARET_TEST_MODELS",
         auto_compress=False,
         overwrite=True
      )

      # save experiment results to Snwoflake table
      exp_results = exp.pull()
      session.write_pandas(exp_results, table_name=f'EXP_RESULTS_{i}_{forecast_id}', auto_create_table=True, overwrite=False)
      
return

Package 'xgboost' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.


TypeError: cannot pickle '_thread.lock' object: you might have to save the unpicklable object in the local environment first, add it to the UDF with session.add_import(), and read it from the UDF.