# Loading Tables to Snowflake environment
This notebook loads the feature tables into a Snowflake environment, for example when porting over to a new account instance.
The tables in focus come from the Home Credit dataset:
- application_train (uploaded as `APPLICATION_TRAIN`)
- bureau_feature_table
- previous_loan_features (uploaded as `PREVIOUS_LOAN_FEATURES_TABLE`)
- static_feature_table

The following files must exist at these locations for the code to work:
- data/bureau_feature_table.parquet
- data/previous_loan_features.parquet
- data/static_feature_table.parquet
- data/application_train.csv

In [6]:
# All library installations

%pip install feast-azure-provider
%pip install azure-cli
%pip install snowflake-connector-python==2.7.4
%pip install pyarrow==6.0.1

Collecting feast-azure-provider
  Downloading feast_azure_provider-0.3.0-py3-none-any.whl (14 kB)
Collecting feast[redis]==0.18.1
  Downloading feast-0.18.1-py3-none-any.whl (275 kB)
[K     |████████████████████████████████| 275 kB 35.0 MB/s eta 0:00:01
Collecting grpcio-reflection>=1.34.0
  Downloading grpcio_reflection-1.45.0-py3-none-any.whl (14 kB)
Collecting pandavro==1.5.*
  Downloading pandavro-1.5.2.tar.gz (3.8 kB)
Collecting fastavro>=1.1.0
  Downloading fastavro-1.4.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 49.3 MB/s eta 0:00:01
Collecting googleapis-common-protos==1.52.*
  Downloading googleapis_common_protos-1.52.0-py2.py3-none-any.whl (100 kB)
[K     |████████████████████████████████| 100 kB 11.2 MB/s ta 0:00:01
Collecting proto-plus<1.19.7
  Downloading proto_plus-1.19.6-py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 4.1 MB/s  eta 0:00:01
[?25hCollecting pyarrow>=4.0

In [3]:
import datetime
import getpass
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
# import featuretools as ft
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
# matplotlib and seaborn for plotting
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
warnings.filterwarnings("ignore")

In [2]:
from azureml.core import Workspace
from azureml.core import Keyvault
import os

# Load the Azure ML workspace from a config.json file; the recorded error below
# shows it is looked up in the notebook directory and its parent directories.
ws = Workspace.from_config()
# Key Vault handle obtained from the workspace; later cells read secrets from it.
keyvault = ws.get_default_keyvault()

# Store the value of the "registrytoken" secret in the REGISTRY_BLOB_KEY environment variable.
os.environ["REGISTRY_BLOB_KEY"] = keyvault.get_secret("registrytoken")

UserErrorException: UserErrorException:
	Message: The workspace configuration file config.json, could not be found in /Users/ftt.sakthivel.b/Experiments/feast-snowflake/feature-store/notebooks or its parent directories. Please check whether the workspace configuration file exists, or provide the full path to the configuration file as an argument. You can download a configuration file for your workspace, via http://ml.azure.com and clicking on the name of your workspace in the right top.
	InnerException None
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "message": "The workspace configuration file config.json, could not be found in /Users/ftt.sakthivel.b/Experiments/feast-snowflake/feature-store/notebooks or its parent directories. Please check whether the workspace configuration file exists, or provide the full path to the configuration file as an argument. You can download a configuration file for your workspace, via http://ml.azure.com and clicking on the name of your workspace in the right top."
    }
}

In [None]:
# Login details for Snowflake.
# Snowflake user name passed to the connector.
username = "evan912"
# Password is read from the Key Vault secret "evansftestkey"; this needs the
# `keyvault` object created by the workspace cell, so that cell must have succeeded.
pwd = keyvault.get_secret("evansftestkey")

In [4]:
# Save the registry token in the REGISTRY_BLOB_KEY environment variable.
#
# SECURITY: secrets must never be written into the notebook. The SAS token and
# the Snowflake password that used to be hardcoded in this cell have been
# exposed in the file history and should be rotated.
#
# A value that is already present (exported in the shell, or set by the
# Key Vault cell) is kept; otherwise the user is prompted without echo.
if not os.environ.get("REGISTRY_BLOB_KEY"):
    os.environ["REGISTRY_BLOB_KEY"] = getpass.getpass("Registry blob SAS token: ")

# Snowflake credentials: taken from SNOWFLAKE_USER / SNOWFLAKE_PASSWORD when
# set, with an interactive prompt as the fallback for the password.
username = os.environ.get("SNOWFLAKE_USER", "evan912")
pwd = os.environ.get("SNOWFLAKE_PASSWORD") or getpass.getpass(
    "Snowflake password for " + username + ": "
)

In [6]:
import snowflake.connector as snow
from snowflake.connector.pandas_tools import write_pandas

# Open the Snowflake connection that write_pandas uses to load the DataFrames
# into tables. The settings are collected first, then passed as keyword arguments.
connection_settings = {
    "user": username,
    "password": pwd,
    "account": "pn93812.southeast-asia.azure",
    "warehouse": "COMPUTE_WH",
    "database": "TEST",
    "schema": "PUBLIC",
}
conn = snow.connect(**connection_settings)

def fix_date_cols(df, tz = 'UTC'):
    """Attach a timezone to the datetime columns of ``df``, in place.

    Every column picked by ``select_dtypes(include=['datetime64[ns]'])`` is
    replaced by its ``tz_localize(tz)`` result.

    Parameters:
        df: pandas DataFrame to modify; the caller's object is changed.
        tz: timezone passed to ``tz_localize`` (default ``'UTC'``).

    Returns:
        None - the result is written back into ``df``.
    """
    naive_datetime_columns = df.select_dtypes(include=['datetime64[ns]']).columns
    for column_name in naive_datetime_columns:
        localized = df[column_name].dt.tz_localize(tz)
        df[column_name] = localized

def process_send(data, table_title):
    """Clean the column names of ``data`` and upload it to a Snowflake table.

    The frame is modified in place: its column names are rewritten and its
    datetime columns are passed through ``fix_date_cols``. The upload uses the
    module-level ``conn`` connection and lets ``write_pandas`` create the table
    when needed (``auto_create_table=True``).

    Parameters:
        data: pandas DataFrame sent to Snowflake.
        table_title: name of the Snowflake table to write to.

    Returns:
        None. A one-line summary is printed on success; on failure the error
        is printed and the function returns without printing "Done.".
    """
    # Remove special characters from the column names.
    # regex=True is explicit because pandas 2.0 changed the default of
    # str.replace to regex=False, which would match these character classes
    # literally and leave every column name untouched.
    # Quote, space, comma and hyphen become underscores ...
    data.columns = data.columns.str.replace(r"[' ,-]", '_', regex=True)
    # ... and parentheses are dropped (commas/spaces are already gone by now).
    data.columns = data.columns.str.replace(r"[()]", '', regex=True)

    # Add timezone to any datetime type data
    fix_date_cols(data)

    # Send data to Snowflake
    try:
        success, nchunks, nrows, _ = write_pandas(
            conn, data, table_title, quote_identifiers=False, auto_create_table=True
        )
    except Exception as e:
        # Deliberately best-effort so the remaining uploads still run, but the
        # failure is reported with the table name and is not followed by "Done.".
        print("Upload to " + str(table_title) + " failed: " + repr(e))
        return

    print("Success: " + str(success) + ', Chunks: ' + str(nchunks) + ', Rows uploaded: ' + str(nrows))
    print("Done.")


In [14]:
# Read the three feature tables from their parquet files, in the order
# bureau / previous-loan / static.
bureau, prev, stat = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "data/bureau_feature_table.parquet",
        "data/previous_loan_features.parquet",
        "data/static_feature_table.parquet",
    )
)
# The application table is read from CSV rather than parquet.
application_train = pd.read_csv("data/application_train.csv")

In [17]:
# Stamp all three feature tables with one shared creation time.
# process_send -> fix_date_cols labels naive datetimes as UTC, so the naive
# value stored here must already be UTC wall-clock time. A plain
# datetime.now() returns local time and would be mislabelled on any machine
# that is not running in UTC.
created_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
bureau['CREATED_TIMESTAMP'] = created_at
prev['CREATED_TIMESTAMP'] = created_at
stat['CREATED_TIMESTAMP'] = created_at

In [18]:
# Upload the frame read from data/application_train.csv to the APPLICATION_TRAIN table.
process_send(application_train, "APPLICATION_TRAIN" )

Success: True, Chunks: 1, Rows uploaded: 307511
Done.


In [19]:
# Upload the bureau features.
# NOTE(review): the table name is lower-case here but upper-case in the other uploads;
# process_send passes quote_identifiers=False, so this is presumably case-insensitive - confirm.
process_send(bureau, "bureau_feature_table" )

Success: True, Chunks: 1, Rows uploaded: 305811
Done.


In [20]:
# Upload the previous-loan features.
# NOTE(review): the source file is previous_loan_features.parquet but the table is named
# PREVIOUS_LOAN_FEATURES_TABLE - confirm which name downstream consumers expect.
process_send(prev, "PREVIOUS_LOAN_FEATURES_TABLE" )

Success: True, Chunks: 1, Rows uploaded: 339587
Done.


In [21]:
# Upload the static features to the static_feature_table table.
process_send(stat, "static_feature_table" )

Success: True, Chunks: 1, Rows uploaded: 307511
Done.
