In [32]:
### Dependencies ###

# Snowpark for Python (session, version) and Snowpark ML (metrics, pipeline, preprocessing)
from snowflake.snowpark import Session
from snowflake.snowpark.version import VERSION
from snowflake.ml.modeling.metrics.correlation import correlation
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import OneHotEncoder, MinMaxScaler

# Snowpark for Python: column functions and data types
from snowflake.snowpark.functions import col, count, when, mean, lit, corr
from snowflake.snowpark.types import StringType, LongType, DecimalType

# Workflow
import json
import joblib

In [23]:
### Secured connection to Snowflake ###
# Connection parameters are read from a local JSON file instead of being written in the notebook.
# The context manager closes the file handle as soon as the JSON has been parsed.
with open('connection.json', encoding='utf-8') as connection_file:
    connection_parameters = json.load(connection_file)
session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

# Fully qualified name of the input table: <database>.<schema>.<table>.
database = 'FRAUD_DETECT_DB'
schema = 'FRAUD_DETECT_SM'
table = 'FRAUD_DATA_CLEANED'
input_tbl = f"{database}.{schema}.{table}"
fraud_data = session.table(input_tbl)
#fraud_data.show()

Let's start with a Logistic Regression Model

In [36]:
### Categorical and Numerical cols
# Partition the table's columns by Snowpark datatype:
#   - num_cols: columns whose datatype is LongType
#   - cat_cols: columns whose datatype is neither LongType nor DecimalType
# NOTE(review): DecimalType columns end up in neither list, so they are not
# encoded or scaled by anything built from these lists — confirm this is intended.
fraud_fields = fraud_data.schema.fields
num_cols = [fld.name for fld in fraud_fields if isinstance(fld.datatype, LongType)]
cat_cols = [
    fld.name
    for fld in fraud_fields
    if not isinstance(fld.datatype, (LongType, DecimalType))
]

# Output column names: the input name plus a suffix identifying the transform.
one_hot_output_cols = [f"{name}_encoded" for name in cat_cols]
min_max_output_cols = [f"{name}_scaled" for name in num_cols]

In [43]:
### Pipeline for data preprocessing ###
# Two steps, in order: one-hot encode the columns listed in cat_cols,
# then min-max scale the columns listed in num_cols.
log_reg_pipeline = Pipeline(steps=
                [(
                    "OneHotEncoder",
                    OneHotEncoder(
                        input_cols = cat_cols,
                        output_cols = one_hot_output_cols
                    )
                ),
                (
                    "MinMaxScale",
                    MinMaxScaler(
                        input_cols = num_cols,
                        output_cols = min_max_output_cols
                    )
                )])

# Serialize the pipeline to a local file, then upload that file to the stage.
# NOTE(review): the fit()/transform() call further down is commented out, so the
# object written here has not been fitted — confirm the consumer fits it after loading.
PIPELINE_FILE = 'log_reg_pipeline.joblib'
# Derive the stage location from the same database/schema variables used for
# input_tbl so the two cannot drift apart.
PIPELINE_STAGE = f"@{database}.{schema}.INTERNAL_FRAUD_STG"
joblib.dump(log_reg_pipeline, PIPELINE_FILE) # to serialize job
put_result = session.file.put(PIPELINE_FILE, PIPELINE_STAGE, overwrite=True) # job staged to SF


#transformed_fraud_data = log_reg_pipeline.fit(fraud_data).transform(fraud_data)
#transformed_fraud_data.show()

# Display the upload status as the cell's output.
put_result

File log_reg_pipeline.joblib exists locally.
<module 'posixpath' (frozen)>
File upload result: [PutResult(source='log_reg_pipeline.joblib', target='log_reg_pipeline.joblib.gz', source_size=2640, target_size=1296, source_compression='NONE', target_compression='GZIP', status='UPLOADED', message='')]


In [39]:
### Train Test Split ###


[PutResult(source='log_reg_pipeline.joblib', target='log_reg_pipeline.joblib.gz', source_size=2640, target_size=1296, source_compression='NONE', target_compression='GZIP', status='UPLOADED', message='')]