In [11]:
### Dependencies ###

# Snowpark for Python
from snowflake.snowpark import Session
from snowflake.snowpark.version import VERSION
from snowflake.snowpark.functions import col, count, when, mean, lit, corr
from snowflake.snowpark.types import StringType, LongType, DecimalType


# Snowflake ML
from snowflake.ml.modeling.metrics.correlation import correlation
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import OneHotEncoder, MinMaxScaler
from snowflake.ml.modeling.linear_model import LogisticRegression, LogisticRegressionCV
from snowflake.ml.modeling.metrics import accuracy_score, precision_score

# Workflow
import json
import joblib

In [2]:
### Secured connection to Snowflake ###
connection_parameters = json.load(open('connection.json'))
session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

database = 'FRAUD_DETECT_DB'
schema = 'FRAUD_DETECT_SM'
table = 'FRAUD_DATA_CLEANED'
input_tbl = f"{database}.{schema}.{table}"
fraud_data = session.table(input_tbl)

Let's start with a Logistic Regression Model

In [4]:
### Categorical and Numerical cols ###
cat_cols = [field.name for field in fraud_data.schema.fields if not isinstance(field.datatype,(LongType,DecimalType))]
num_cols = [field.name for field in fraud_data.schema.fields if isinstance(field.datatype,LongType)]
one_hot_output_cols = [f"{col}_encoded" for col in cat_cols ]
min_max_output_cols = [f"{col}_scaled" for col in num_cols]

In [5]:
### Pipeline for data preprocessing ###
log_reg_pipeline = Pipeline(steps=
                [(
                    "OneHotEncoder",
                    OneHotEncoder(
                        input_cols = cat_cols,
                        output_cols = one_hot_output_cols
                    )
                ),
                (
                    "MinMaxScale",
                    MinMaxScaler(
                        input_cols = num_cols,
                        output_cols = min_max_output_cols
                    )
                )])

PIPELINE_FILE = 'log_reg_pipeline.joblib'
joblib.dump(log_reg_pipeline,PIPELINE_FILE) # to serialize job
put_result = session.file.put(PIPELINE_FILE, '@FRAUD_DETECT_DB.FRAUD_DETECT_SM.INTERNAL_FRAUD_STG', overwrite=True) # job staged to SF

In [6]:
### Train Test Split ###
train_df,test_df = fraud_data.random_split(weights=[0.8,0.2],seed=42)
columns_to_remove = cat_cols + num_cols # keeping preprocessed columns only
train_df = log_reg_pipeline.fit(train_df).transform(train_df).drop(columns_to_remove)
test_df = log_reg_pipeline.transform(test_df).drop(columns_to_remove)

  snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
  snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
  snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)


Remember we were using the 'newton_cholesky' method in the local training, let's use the same penalty (l2) for the cloud training, still using the same random state

In [22]:
### Model Definition ###
# Logistic Regression
feature_cols = train_df.columns.remove('FRAUD_BOOL_SCALED')
label_cols = 'FRAUD_BOOL_SCALED'
output_cols = ['PREDICTED_FRAUD']

log_reg_model = LogisticRegression(random_state=42,
                                   solver='newton-cholesky',
                                   input_cols = feature_cols,
                                   label_cols =label_cols,
                                   output_cols =output_cols) # default penalty : 'l2'


log_reg_model.fit(train_df)
predictions = log_reg_model.predict(test_df) # test_df with predictions as extra column


Got error object of type 'NoneType' has no len() when trying to read default values from function: <function SnowparkModelTrainer._build_fit_wrapper_sproc.<locals>.fit_wrapper_function at 0x70d5eb5601f0>. Proceeding without creating optional arguments
The version of package 'snowflake-snowpark-python' in the local environment is 1.22.1, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.


In [23]:
accuracy_score(df=predictions,y_true_col_names='FRAUD_BOOL_SCALED',y_pred_col_names='PREDICTED_FRAUD') # 79%, approximately same accuracy as before (with the same random state) !!
precision_score(df=predictions,y_true_col_names='FRAUD_BOOL_SCALED',y_pred_col_names='PREDICTED_FRAUD')

0.791009

In [27]:
### Let's go further and train a cross validated model ###
# log_reg_model_cv = LogisticRegressionCV(random_state=42,
#                                      solver='newton-cholesky',
#                                      input_cols= feature_cols,
#                                      label_cols = label_cols,
#                                      output_cols = output_cols,
#                                      cv=10)

# log_reg_model_cv.fit(train_df)
# predictions_cv = log_reg_model.predict(test_df) # test_df with predictions as extra column # TOO LONG ... MORE THAN 5 MINS