In [None]:
# !pip install "snowflake-connector-python[pandas]"

In [1]:
import os
from datetime import datetime
from snowflake.snowpark import Session
from snowflake.connector.pandas_tools import write_pandas
import pandas as pd
import re
import json
# Add the project root to the Python path
import sys
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

# Now you can import from src
from preprocessing_helper import load_data_dictionary,clean_numeric_columns,apply_fill_method,subset_train_data

In [2]:

def create_snowflake_session(secret_file_path):
    with open(secret_file_path, 'r') as file:
        connection_parameters = json.load(file)
    session = Session.builder.configs(connection_parameters).create()
    return session

def execute_sql(session, sql_query):
    result = session.sql(sql_query).collect()
    df = pd.DataFrame([row.as_dict() for row in result])
    return df

def upload_to_snowflake(session, df, table_name):


    # Ensure all columns are properly formatted
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str)
        elif pd.api.types.is_numeric_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], errors='coerce')
    result = session.write_pandas( 
                                df = df, 
                                table_name = table_name, 
                                schema='TRUSTED',
                                overwrite=True,
                                auto_create_table = True,
                                table_type = 'transient'
                                )
    return result

secret_file_path = '../.secret'
session = create_snowflake_session(secret_file_path)
get_raw_data_sql = 'select * from train'
df_train_raw = execute_sql(session = session, sql_query= get_raw_data_sql)
df_train_raw.to_csv('../data/train_raw.csv')

In [3]:


# config
data_dictionary_path = '../data/fundtap-data-dictionary.csv'
train_data_path = '../data/train_raw.csv'

# Load data dictionary
data_dict = load_data_dictionary(data_dictionary_path)

# Load training data
train_data = pd.read_csv(train_data_path,index_col= 0)
train_data.columns = train_data.columns.str.lower()

# Clean numeric columns
train_data = clean_numeric_columns(train_data, data_dict)

# Apply fill methods for missing values
train_data = apply_fill_method(train_data, data_dict)

# Subset data for training
train_data_subset = subset_train_data(train_data, data_dict)


  train_data = pd.read_csv(train_data_path,index_col= 0)


Applying fill methods...
Filling missing values in consumer_credit_judgements_guar1 with 0
Filling missing values in consumer_credit_insolvency_notices_guar1 with 0
Filling missing values in consumer_credit_credit_defaults_guar1 with 0
Filling missing values in consumer_credit_company_affiliations_guar1 with 0
Filling missing values in consumer_credit_file_activity_guar1 with 0
Filling missing values in consumer_credit_score_guar1 with 0
Filling missing values in consumer_credit_risk_odds_guar1 with 0
Filling missing values in consumer_credit_judgements_guar2 with 0
Filling missing values in consumer_credit_insolvency_notices_guar2 with 0
Filling missing values in consumer_credit_credit_defaults_guar2 with 0
Filling missing values in consumer_credit_company_affiliations_guar2 with 0
Filling missing values in consumer_credit_file_activity_guar2 with 0
Filling missing values in consumer_credit_score_guar2 with 0
Filling missing values in consumer_credit_risk_odds_guar2 with 0
Filling mis

In [4]:

# # Upload full cleaned data to Snowflake
upload_to_snowflake(session, train_data, 'TRAIN_DATA_CLEANED')

# # Upload subset of training columns to Snowflake
upload_to_snowflake(session, train_data_subset, 'TRAIN_DATA_SUBSET')


  success, nchunks, nrows, ci_output = write_pandas(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFr

<snowflake.snowpark.table.Table at 0x3080f7f90>

In [6]:
train_downloaded = execute_sql(session = session, sql_query= 'SELECT * FROM FUNDTAP.TRUSTED.TRAIN_DATA_SUBSET')
train_downloaded.to_csv('../data/train.csv')