In [1]:
### Dependencies ###

# Snowpark for Python
from snowflake.snowpark import Session
from snowflake.snowpark.version import VERSION
from snowflake.snowpark.functions import col, count, when, mean, lit, corr
from snowflake.snowpark.types import StringType, LongType, DecimalType


# Snowflake ML
from snowflake.ml.modeling.metrics.correlation import correlation
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import OneHotEncoder, MinMaxScaler
from snowflake.ml.modeling.linear_model import LogisticRegression, LogisticRegressionCV

# Workflow
import json
import joblib

In [2]:
### Secured connection to Snowflake ###
# Connection settings are read from an external JSON file rather than being
# written into the notebook. The file is opened through a context manager so
# the handle is closed as soon as the settings have been parsed.
with open('connection.json') as connection_file:
    connection_parameters = json.load(connection_file)

session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

# Fully qualified name (database.schema.table) of the input table.
database = 'FRAUD_DETECT_DB'
schema = 'FRAUD_DETECT_SM'
table = 'FRAUD_DATA_CLEANED'
input_tbl = f"{database}.{schema}.{table}"

# Snowpark DataFrame referencing the input table.
fraud_data = session.table(input_tbl)
#fraud_data.show()

In [3]:
# Display the fields (name, Snowpark data type, nullability) of the input table.
fraud_data.schema.fields

[StructField('FRAUD_BOOL', LongType(), nullable=True),
 StructField('INCOME', LongType(), nullable=True),
 StructField('NAME_EMAIL_SIMILARITY', LongType(), nullable=True),
 StructField('CUSTOMER_AGE', LongType(), nullable=True),
 StructField('DAYS_SINCE_REQUEST', LongType(), nullable=True),
 StructField('INTENDED_BALCON_AMOUNT', LongType(), nullable=True),
 StructField('PAYMENT_TYPE', StringType(250), nullable=True),
 StructField('ZIP_COUNT_4W', LongType(), nullable=True),
 StructField('VELOCITY_6H', LongType(), nullable=True),
 StructField('VELOCITY_24H', LongType(), nullable=True),
 StructField('VELOCITY_4W', LongType(), nullable=True),
 StructField('BANK_BRANCH_COUNT_8W', LongType(), nullable=True),
 StructField('DATE_OF_BIRTH_DISTINCT_EMAILS_4W', LongType(), nullable=True),
 StructField('EMPLOYMENT_STATUS', StringType(250), nullable=True),
 StructField('CREDIT_RISK_SCORE', LongType(), nullable=True),
 StructField('EMAIL_IS_FREE', LongType(), nullable=True),
 StructField('HOUSING_ST

Let's start with a Logistic Regression Model

In [4]:
### Categorical and Numerical cols
# Partition the input columns by Snowpark data type:
#   - LongType columns are treated as numerical,
#   - every column that is neither LongType nor DecimalType is treated as categorical.
# DecimalType columns therefore end up in neither list.
cat_cols, num_cols = [], []
for field in fraud_data.schema.fields:
    if isinstance(field.datatype, LongType):
        num_cols.append(field.name)
    elif not isinstance(field.datatype, DecimalType):
        cat_cols.append(field.name)

# Names of the columns produced by the preprocessing steps.
one_hot_output_cols = [name + "_encoded" for name in cat_cols]
min_max_output_cols = [name + "_scaled" for name in num_cols]

In [27]:
### Pipeline for data preprocessing ###
# Two preprocessing steps: one-hot encoding of the categorical columns,
# then min-max scaling of the numerical columns.
preprocessing_steps = [
    ("OneHotEncoder", OneHotEncoder(input_cols=cat_cols, output_cols=one_hot_output_cols)),
    ("MinMaxScale", MinMaxScaler(input_cols=num_cols, output_cols=min_max_output_cols)),
]
log_reg_pipeline = Pipeline(steps=preprocessing_steps)

# Serialize the pipeline object to a local joblib file.
# Note: no fit() has been called on the pipeline at this point in the cell.
PIPELINE_FILE = 'log_reg_pipeline.joblib'
joblib.dump(log_reg_pipeline, PIPELINE_FILE)

# Upload the serialized file to the Snowflake stage, replacing any existing copy.
put_result = session.file.put(
    PIPELINE_FILE,
    '@FRAUD_DETECT_DB.FRAUD_DETECT_SM.INTERNAL_FRAUD_STG',
    overwrite=True,
)

  snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
  snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)


In [36]:
### Train Test Split ###
# Random 80/20 split of the input table with a fixed seed.
train_df, test_df = fraud_data.random_split(weights=[0.8, 0.2], seed=42)

# The raw input columns are dropped after preprocessing so that only the
# encoded / scaled columns remain.
columns_to_remove = cat_cols + num_cols

# Fit the preprocessing pipeline on the training split only, then apply it to
# both splits.
fitted_pipeline = log_reg_pipeline.fit(train_df)
train_df = fitted_pipeline.transform(train_df).drop(columns_to_remove)
test_df = log_reg_pipeline.transform(test_df).drop(columns_to_remove)

  snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
  snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
  snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)


In [38]:
# Display the fields of the preprocessed training split (raw columns have been dropped).
train_df.schema.fields

[StructField('FRAUD_BOOL_SCALED', DoubleType(), nullable=True),
 StructField('INCOME_SCALED', DoubleType(), nullable=True),
 StructField('NAME_EMAIL_SIMILARITY_SCALED', DoubleType(), nullable=True),
 StructField('CUSTOMER_AGE_SCALED', DoubleType(), nullable=True),
 StructField('DAYS_SINCE_REQUEST_SCALED', DoubleType(), nullable=True),
 StructField('INTENDED_BALCON_AMOUNT_SCALED', DoubleType(), nullable=True),
 StructField('ZIP_COUNT_4W_SCALED', DoubleType(), nullable=True),
 StructField('VELOCITY_6H_SCALED', DoubleType(), nullable=True),
 StructField('VELOCITY_24H_SCALED', DoubleType(), nullable=True),
 StructField('VELOCITY_4W_SCALED', DoubleType(), nullable=True),
 StructField('BANK_BRANCH_COUNT_8W_SCALED', DoubleType(), nullable=True),
 StructField('DATE_OF_BIRTH_DISTINCT_EMAILS_4W_SCALED', DoubleType(), nullable=True),
 StructField('CREDIT_RISK_SCORE_SCALED', DoubleType(), nullable=True),
 StructField('EMAIL_IS_FREE_SCALED', DoubleType(), nullable=True),
 StructField('PHONE_HOME_VA

Recall that the local training used the 'newton-cholesky' solver. For the cloud training we keep the same penalty (l2) and the same random state; note that the cell below sets the solver to 'lbfgs'.

In [41]:
### Model Definition ###
# Logistic Regression
# NOTE(review): the label is the min-max scaled copy of FRAUD_BOOL, because
# FRAUD_BOOL is a LongType column and was therefore included in the scaled
# numerical columns — confirm this is intended.
label_cols = 'FRAUD_BOOL_SCALED'

# Every column except the label is a feature.
# list.remove() mutates the list in place and returns None, so assigning its
# result would leave feature_cols set to None; build the list explicitly.
feature_cols = [column for column in train_df.columns if column != label_cols]
output_cols = ['PREDICTED_FRAUD']

log_reg_model = LogisticRegression(random_state=42,
                                   solver='lbfgs',
                                   input_cols=feature_cols,
                                   label_cols=label_cols,
                                   output_cols=output_cols) # default penalty : 'l2'

# Train on the preprocessed training split, then score the held-out test split.
log_reg_model.fit(train_df)
predictions = log_reg_model.predict(test_df)


Got error object of type 'NoneType' has no len() when trying to read default values from function: <function SnowparkModelTrainer._build_fit_wrapper_sproc.<locals>.fit_wrapper_function at 0x7f026b9d91b0>. Proceeding without creating optional arguments
The version of package 'snowflake-snowpark-python' in the local environment is 1.22.1, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.


<snowflake.ml.modeling.linear_model.logistic_regression.LogisticRegression at 0x7f026b277b80>