# Model Building, Scoring & Evaluating (Logistic Regression)

In [24]:
# Imports
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F

from snowflake.ml.modeling.linear_model import LogisticRegression
from snowflake.ml.modeling.metrics import *

import json
import pandas as pd
import seaborn as sns

In [25]:
# Open a Snowpark session from the connection parameters stored in creds.json,
# so that no credentials are written into the notebook itself.
with open('creds.json') as creds_file:
    connection_parameters = json.load(creds_file)

session = Session.builder.configs(connection_parameters).create()

# Echo the active context so the reader can see which database, schema and
# warehouse the cells below run against.
current_schema = session.get_fully_qualified_current_schema()
current_warehouse = session.get_current_warehouse()
print(f"Current Database and schema: {current_schema}")
print(f"Current Warehouse: {current_warehouse}")

Current Database and schema: "CC_DB"."PUBLIC"
Current Warehouse: "CC_WH"


### Model Building

In [26]:
# Reference the prepared, balanced credit-risk table and display the number of
# rows per TARGET value.
application_record_balanced_sdf = session.table('CREDIT_RISK_PREPARED_BALANCED')
(
    application_record_balanced_sdf
    .group_by('TARGET')
    .count()
    .show()
)

----------------------
|"TARGET"  |"COUNT"  |
----------------------
|0         |29819    |
|1         |29819    |
----------------------



In [27]:
# Stratified sampling for the training set.
# sample_by samples on a single column and takes the fraction to keep for each
# value of that column; here 80% of each TARGET class is requested.
# cache_result() is applied directly to the sample so that train_sdf refers to
# the cached result rather than to the sampling query.
train_sdf = (
    application_record_balanced_sdf
    .sample_by("TARGET", {1: 0.8, 0: 0.8})
    .cache_result()
)

# Display the per-class row counts of the sampled training data.
train_sdf.group_by('TARGET').count().show()

----------------------
|"TARGET"  |"COUNT"  |
----------------------
|1         |23937    |
|0         |23751    |
----------------------



In [28]:
# The test set is whatever remains of the balanced data once the training rows
# have been excluded with minus().
# NOTE(review): the recorded train and test counts add up to fewer rows than
# the source table holds. This is consistent with minus() being a set
# difference that also drops duplicate rows - confirm that this is intended.
test_sdf = application_record_balanced_sdf.minus(train_sdf)

# Display the per-class row counts of the test data.
(
    test_sdf
    .group_by('TARGET')
    .count()
    .show()
)

----------------------
|"TARGET"  |"COUNT"  |
----------------------
|0         |6068     |
|1         |4846     |
----------------------



In [29]:
# Persist both splits as tables (replacing any earlier version) and then point
# train_sdf / test_sdf at the saved tables instead of the in-session results.
train_table_name = 'CREDIT_RISK_PREPARED_BALANCED_TRAIN'
test_table_name = 'CREDIT_RISK_PREPARED_BALANCED_TEST'

train_sdf.write.save_as_table(table_name=train_table_name, mode='overwrite')
test_sdf.write.save_as_table(table_name=test_table_name, mode='overwrite')

train_sdf = session.table(train_table_name)
test_sdf = session.table(test_table_name)

In [30]:
# Train a LogisticRegression with snowpark-ml on the saved training table.
# Every column except the label (TARGET) and the ID column is used as a feature.
# NOTE(review): the recorded run of this cell ended in a RuntimeError stating
# that lightgbm==4.6.0 is not supported in the Snowflake conda channel for
# Python 3.9 - presumably a package-version mismatch in the local environment;
# confirm and re-run so that the cells below operate on a fitted model.
target_col = 'TARGET'
feature_cols = list(train_sdf.columns)
for excluded_col in ('TARGET', 'ID'):
    feature_cols.remove(excluded_col)

lm = LogisticRegression(
    input_cols=feature_cols,
    label_cols=target_col,
    output_cols=['PREDICTION'],
    solver='lbfgs',
    C=0.8,
    random_state=0,
)
lm.fit(train_sdf)

RuntimeError: Package lightgbm==4.6.0 is not supported in snowflake conda channel for python runtime 3.9.

In [8]:
# The fitted model can be retrieved as a scikit-learn object
# NOTE(review): this is only meaningful once lm.fit(...) has completed. The
# recorded run of the following cell reports that the object has no `coef_`,
# which suggests the conversion was done on a model that had not been fitted -
# confirm and re-run after the fit succeeds.

lm_local = lm.to_sklearn()
lm_local

In [9]:
# Horizontal bar chart of the model coefficients, one bar per feature, sorted
# in ascending order.
# coef_ is transposed so that each feature becomes a row (labelled with
# feature_names_in_) in a single 'Coefficient' column.
feature_coefficients = pd.DataFrame(
    data=lm_local.coef_.T,
    index=lm_local.feature_names_in_,
    columns=['Coefficient'],
)
sorted_coefficients = feature_coefficients.sort_values(by='Coefficient')
sorted_coefficients.plot.barh(y='Coefficient', figsize=(5, 15))

AttributeError: 'LogisticRegression' object has no attribute 'coef_'