In [78]:
# Import ML libraries
import pycaret
import xgboost

# Snowpark for Python
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import datediff, to_date, col, expr

# Import Misc
import json
import pandas as pd
# from pycaret.classification import setup, compare_models


In [79]:
# Create Snowflake Session object
connection_parameters = json.load(open("connection.json"))
session = Session.builder.configs(connection_parameters).create()

In [80]:
# Connecting to the correct table
tableName = 'PURCHASE_ORDER_HISTORY'
dataframe = session.table(tableName)

# Calculation to find the lag between Planned Delivery from Actual Delivery
dataframe = dataframe.withColumn("target_feature",
                                    datediff('day', 
                                            col("DELIVERY_DATE_ML"), 
                                            col("FIRST_GR_POSTING_DATE_ML")))


# Example: Selecting specific columns
# This selects only a subset of columns. Adjust the column names as needed.
filtered_dataframe = dataframe.select(
    col("PURCHASE_DOCUMENT_ITEM_ID"), # ID for purchase order
    col("CREATE_DATE"),            # day purchase order was created
    col("COMPANY_CODE_ID"),           # copmany w/in INVISTA making purchase
    col("VENDOR_ID"),                 # ID of the vendor "we" are purchasing from
    col("POSTAL_CD"),                 # postal code associated w company code ID
    col("MATERIAL_ID"),               # ID of material being purchase
    col("SUB_COMMODITY_DESC"),        # description of sub commodity
    col("MRP_TYPE_ID"),               # determined if material is reordered manually or automatically
    col("PLANT_ID"),                  # ID of plant making purchase
    col("REQUESTED_DELIVERY_DATE"),# delivery date from requisition
    col("INBOUND_DELIVERY_ID"),       # ID for delivery
    col("INBOUND_DELIVERY_ITEM_ID"),  # ID of item w/in delivery
    col("PLANNED_DELIVERY_DAYS"),     # Amount of days expected to take
    col("FIRST_GR_POSTING_DATE"),  # expected delivery date        
    col("target_feature")             # Lag between Planned Delivery from Actual Delivery 
)


# Print a sample of the filtered dataframe to standard output.
filtered_dataframe.show()

# Optionally, you might want to filter rows based on some conditions
# Example: Filtering out rows where FIRST_GR_POSTING_DATE_ML is NULL
filtered_dataframe = filtered_dataframe.filter(col("FIRST_GR_POSTING_DATE").is_not_null())

# filtered_dataframe = filtered_dataframe[filtered_dataframe['PLANNED_DELIVERY_DAYS'] < 6]

# Show the DataFrame after filtering
filtered_dataframe.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"PURCHASE_DOCUMENT_ITEM_ID"  |"CREATE_DATE"  |"COMPANY_CODE_ID"  |"VENDOR_ID"  |"POSTAL_CD"  |"MATERIAL_ID"  |"SUB_COMMODITY_DESC"                    |"MRP_TYPE_ID"  |"PLANT_ID"  |"REQUESTED_DELIVERY_DATE"  |"INBOUND_DELIVERY_ID"  |"INBOUND_DELIVERY_ITEM_ID"  |"PLANNED_DELIVERY_DAYS"  |"FIRST_GR_POSTING_DATE"  |"TARGET_FEATURE"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [81]:
# Assuming 'filtered_dataframe' is the DataFrame you've prepared in Snowflake
# Convert the Snowpark DataFrame to a Pandas DataFrame with consideration for NULL values

# Convert DataFrame to Pandas, handling NULL values by allowing float conversion
df = filtered_dataframe.fillna(0).to_pandas()  # This replaces NULL with 0 before conversion

Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "COMPANY_CODE_ID", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "VENDOR_ID", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "POSTAL_CD", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "SUB_COMMODITY_DESC", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "PLANNED_DELIVERY_DAYS", Type: StringType(16777216), Input Value: 0, Type: <class 'int'>


In [82]:
df.head()

Unnamed: 0,PURCHASE_DOCUMENT_ITEM_ID,CREATE_DATE,COMPANY_CODE_ID,VENDOR_ID,POSTAL_CD,MATERIAL_ID,SUB_COMMODITY_DESC,MRP_TYPE_ID,PLANT_ID,REQUESTED_DELIVERY_DATE,INBOUND_DELIVERY_ID,INBOUND_DELIVERY_ITEM_ID,PLANNED_DELIVERY_DAYS,FIRST_GR_POSTING_DATE,TARGET_FEATURE
0,10,20180907,CN15,V4014,29078,2100007708,Custom Manufacturing,1,1016,20181116,183615169,900001,52.0,20181122,6
1,20,20180907,CN20,8010095928,201206,0,Tanks and Process Equipment,0,1032,20180928,0,0,0.0,0,0
2,10,20180907,CA10,8010003146,L6L 6R2,1100125572,Piping & Tubing,1,4036,20181001,0,0,24.0,20181205,23
3,180,20180907,CA10,8010005836,N2C 0B7,0,Material Handling,0,4036,20180908,0,0,0.0,20180918,10
4,60,20180907,CA10,8010005836,N2C 0B7,0,Material Handling,0,4036,20180908,0,0,0.0,20180918,10


In [83]:
df['TARGET_FEATURE']

0           6
1           0
2          23
3          10
4          10
           ..
1139387    -2
1139388    -1
1139389     0
1139390    -1
1139391   -27
Name: TARGET_FEATURE, Length: 1139392, dtype: int32

In [84]:
import numpy as np
import torch


In [85]:

# Handle missing values
# data.fillna(0, inplace=True)  

# Impute missing SUB_COMMODITY_DESC
df['SUB_COMMODITY_DESC'].fillna('Unknown', inplace=True)

# Convert categorical columns to one-hot encoding
df = pd.get_dummies(df, columns=['SUB_COMMODITY_DESC'])

# from sklearn.preprocessing import OneHotEncoder

# encoder = OneHotEncoder(sparse=False)  # Dense output
# encoded_data = encoder.fit_transform(df[['SUB_COMMODITY_DESC']])


In [86]:
import pandas as pd
import re

def clean_delivery_days(value):
    if isinstance(value, str):
        # Remove leading/trailing whitespace
        value = value.strip() 

        # Check for timestamp format and handle separately
        if re.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}", value):  
            return ""  # Replace timestamps with NA or another placeholder
        else:
            return value
    else:
        return value

df['PLANNED_DELIVERY_DAYS'] = df['PLANNED_DELIVERY_DAYS'].apply(clean_delivery_days)


In [87]:
# Hash Alphanumeric columns
df['VENDOR_ID'] = df['VENDOR_ID'].apply(hash)
df['POSTAL_CD'] = df['POSTAL_CD'].apply(hash)
df['COMPANY_CODE_ID'] = df['COMPANY_CODE_ID'].apply(hash)




In [88]:
def remove_decimal(value):
    return value.split('.')[0]  # Split by the decimal and keep the integer part

df['PLANNED_DELIVERY_DAYS'] = df['PLANNED_DELIVERY_DAYS'].apply(remove_decimal)

for value in df['PLANNED_DELIVERY_DAYS'].unique():
    print(value, type(value), repr(value)) 



52 <class 'str'> '52'
0 <class 'str'> '0'
24 <class 'str'> '24'
11 <class 'str'> '11'
1 <class 'str'> '1'
36 <class 'str'> '36'
3 <class 'str'> '3'
10 <class 'str'> '10'
5 <class 'str'> '5'
14 <class 'str'> '14'
4 <class 'str'> '4'
60 <class 'str'> '60'
45 <class 'str'> '45'
32 <class 'str'> '32'
40 <class 'str'> '40'
31 <class 'str'> '31'
7 <class 'str'> '7'
126 <class 'str'> '126'
28 <class 'str'> '28'
15 <class 'str'> '15'
42 <class 'str'> '42'
30 <class 'str'> '30'
6 <class 'str'> '6'
150 <class 'str'> '150'
90 <class 'str'> '90'
70 <class 'str'> '70'
8 <class 'str'> '8'
35 <class 'str'> '35'
75 <class 'str'> '75'
50 <class 'str'> '50'
59 <class 'str'> '59'
21 <class 'str'> '21'
12 <class 'str'> '12'
33 <class 'str'> '33'
16 <class 'str'> '16'
2 <class 'str'> '2'
9 <class 'str'> '9'
18 <class 'str'> '18'
29 <class 'str'> '29'
106 <class 'str'> '106'
13 <class 'str'> '13'
209 <class 'str'> '209'
120 <class 'str'> '120'
20 <class 'str'> '20'
65 <class 'str'> '65'
27 <class 'str'> '27

In [89]:
def convert_to_numeric(col):
    try:
        df[col] = pd.to_numeric(df[col])
    except ValueError:
        print(f"Error converting column '{col}': Contains non-numeric values")

convert_to_numeric('PURCHASE_DOCUMENT_ITEM_ID')
convert_to_numeric('CREATE_DATE')
convert_to_numeric('VENDOR_ID')
convert_to_numeric('POSTAL_CD')
convert_to_numeric('MATERIAL_ID')
convert_to_numeric('MRP_TYPE_ID')
convert_to_numeric('PLANT_ID')
convert_to_numeric('INBOUND_DELIVERY_ID')
convert_to_numeric('INBOUND_DELIVERY_ITEM_ID')
convert_to_numeric('PLANNED_DELIVERY_DAYS')
convert_to_numeric('FIRST_GR_POSTING_DATE')


In [90]:
# Display the dataframe
df.head()


Unnamed: 0,PURCHASE_DOCUMENT_ITEM_ID,CREATE_DATE,COMPANY_CODE_ID,VENDOR_ID,POSTAL_CD,MATERIAL_ID,MRP_TYPE_ID,PLANT_ID,REQUESTED_DELIVERY_DATE,INBOUND_DELIVERY_ID,...,SUB_COMMODITY_DESC_Tools,SUB_COMMODITY_DESC_Transport operations services,"SUB_COMMODITY_DESC_Transportation, Storage, Mail Services",SUB_COMMODITY_DESC_Travel Services,SUB_COMMODITY_DESC_Tubes & Cores,SUB_COMMODITY_DESC_Unknown,SUB_COMMODITY_DESC_Valves,SUB_COMMODITY_DESC_Vehicles,SUB_COMMODITY_DESC_Waste Disposal Services,SUB_COMMODITY_DESC_Water Treatment Chemicals
0,10,20180907,3703839810055977782,-5099027605412357075,7642908023448878140,2100007708,1,1016,20181116,183615169,...,0,0,0,0,0,0,0,0,0,0
1,20,20180907,-985305713257791738,-7106197524279199263,-7844425448173683110,0,0,1032,20180928,0,...,0,0,0,0,0,0,0,0,0,0
2,10,20180907,3848063693701144670,-3663116255602633744,7681583353488874575,1100125572,1,4036,20181001,0,...,0,0,0,0,0,0,0,0,0,0
3,180,20180907,3848063693701144670,6882345628143072556,1700578447253300114,0,0,4036,20180908,0,...,0,0,0,0,0,0,0,0,0,0
4,60,20180907,3848063693701144670,6882345628143072556,1700578447253300114,0,0,4036,20180908,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:

# Show types of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1139392 entries, 0 to 1139391
Data columns (total 87 columns):
 #   Column                                                     Non-Null Count    Dtype  
---  ------                                                     --------------    -----  
 0   PURCHASE_DOCUMENT_ITEM_ID                                  1139392 non-null  int16  
 1   CREATE_DATE                                                1139392 non-null  int32  
 2   COMPANY_CODE_ID                                            1139392 non-null  int64  
 3   VENDOR_ID                                                  1139392 non-null  int64  
 4   POSTAL_CD                                                  1139392 non-null  int64  
 5   MATERIAL_ID                                                1139392 non-null  int64  
 6   MRP_TYPE_ID                                                1139392 non-null  int8   
 7   PLANT_ID                                                   1139392 non-n

In [92]:

# Create feature & target tensors on GPU
features = df.drop('TARGET_FEATURE', axis=1)
targets = df['TARGET_FEATURE']
X = torch.tensor(features.values.astype(np.float32))
y = torch.tensor(targets.values.astype(np.float32))



# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Create a model
# from torch import nn
# import torch.nn.functional as F

# class Net(nn.Module):
#     def __init__(self):
#         super(Net, self).__init__()
#         self.fc1 = nn.Linear(23, 10)  # 23 input features, 10 output features
#         self.fc2 = nn.Linear(10, 1)   # 10 input features, 1 output feature

#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = self.fc2(x)
#         return x
    
# model = Net()


X

tensor([[ 1.0000e+01,  2.0181e+07,  3.7038e+18,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 2.0000e+01,  2.0181e+07, -9.8531e+17,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.0000e+01,  2.0181e+07,  3.8481e+18,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [ 1.0000e+01,  2.0211e+07, -6.8996e+18,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.0000e+01,  2.0211e+07,  3.7246e+18,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.0000e+01,  2.0211e+07, -6.8996e+18,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]])

In [93]:
y

tensor([  6.,   0.,  23.,  ...,   0.,  -1., -27.])

In [94]:
import torch
import torch.nn as nn
import torch.nn.functional as F



In [95]:
# define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

X = X.to(device)
y = y.to(device)

In [96]:


class RegressionNetwork(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.hidden_size1 = 128  
        self.hidden_size2 = 64  

        self.linear1 = nn.Linear(input_size, self.hidden_size1)
        self.linear2 = nn.Linear(self.hidden_size1, self.hidden_size2)

        # Output layer for regression (no activation)
        self.output_layer = nn.Linear(self.hidden_size2, 1) 

    def forward(self, x):
        x = F.relu(self.linear1(x))  
        x = F.relu(self.linear2(x))
        x = self.output_layer(x)
        return x 


In [103]:
import torch
import torch.nn as nn

# Hyperparameters
num_epochs = 100
batch_size = 64
learning_rate = 0.001
input_size = X.shape[1]  # Number of features


model = RegressionNetwork(input_size) 
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.MAE()  # Mean Absolute Error (MAE) loss



AttributeError: module 'torch.nn' has no attribute 'l1_loss'

In [None]:


# Data Preparation (If not using a DataLoader)
train_dataset = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(y))
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# port to GPU
model.to(device)
loss_fn.to(device)





MSELoss()

In [None]:
# Train the model on GPU
model.to(device)

# Train the model in batches
for epoch in range(num_epochs):
    for i, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


Epoch [1/100], Step [100/17803], Loss: nan
Epoch [1/100], Step [200/17803], Loss: nan
Epoch [1/100], Step [300/17803], Loss: nan
Epoch [1/100], Step [400/17803], Loss: nan
Epoch [1/100], Step [500/17803], Loss: nan
Epoch [1/100], Step [600/17803], Loss: nan
Epoch [1/100], Step [700/17803], Loss: nan
Epoch [1/100], Step [800/17803], Loss: nan
Epoch [1/100], Step [900/17803], Loss: nan
Epoch [1/100], Step [1000/17803], Loss: nan
Epoch [1/100], Step [1100/17803], Loss: nan
Epoch [1/100], Step [1200/17803], Loss: nan
Epoch [1/100], Step [1300/17803], Loss: nan
Epoch [1/100], Step [1400/17803], Loss: nan
Epoch [1/100], Step [1500/17803], Loss: nan
Epoch [1/100], Step [1600/17803], Loss: nan
Epoch [1/100], Step [1700/17803], Loss: nan
Epoch [1/100], Step [1800/17803], Loss: nan
Epoch [1/100], Step [1900/17803], Loss: nan
Epoch [1/100], Step [2000/17803], Loss: nan
Epoch [1/100], Step [2100/17803], Loss: nan
Epoch [1/100], Step [2200/17803], Loss: nan
Epoch [1/100], Step [2300/17803], Loss: n

KeyboardInterrupt: 

In [None]:

# Save the model
torch.save(model.state_dict(), 'model.pth')


In [None]:

# evaluate the model
model.eval()
with torch.no_grad():
    y_pred = model(X)
    loss = loss_fn(y_pred, y.view(-1, 1))
    print(f'Loss: {loss.item():.4f}')
    
    # Print the first 5 predictions
    y_pred = y_pred.cpu()
    y = y.cpu()
    print(y_pred[:5].numpy().flatten())
    print(y[:5].numpy())

Loss: nan
[nan nan nan nan nan]
[ 6.  0. 23. 10. 10.]
