In [1]:
from snowflake.snowpark.session import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark.types import *

import pandas as pd

from sklearn import linear_model

import matplotlib.pyplot as plt

%matplotlib inline
import datetime as dt
import numpy as np
import seaborn as sns

#Snowflake connection info is saved in config.py
from config import snowflake_conn_prop


# Let's import some transformation functions
from snowflake.snowpark.functions import udf, col, lit, translate, is_null, iff

In [2]:
# Print the installed Snowpark client version tuple for reference.
from snowflake.snowpark import version
print(version.VERSION)

(0, 7, 0)


Let's configure our Snowpark Session and initialize the database, warehouse, and schema that we will use for the remainder of the quickstart.

In [3]:
# Re-print the Snowpark client version so this cell's output records it too.
from snowflake.snowpark import version
print(version.VERSION)

# Open the Snowpark session from the connection properties kept in config.py.
session = Session.builder.configs(snowflake_conn_prop).create()

# session.sql() only builds a lazy DataFrame; each statement must be
# followed by .collect() for it to actually be sent to Snowflake.
session.sql("use role accountadmin").collect()
session.sql("create database if not exists  {}".format(snowflake_conn_prop['database'])).collect()
session.sql("use database {}".format(snowflake_conn_prop['database'])).collect()
session.sql("create schema if not exists {}".format(snowflake_conn_prop['schema'])).collect()
session.sql("use schema {}".format(snowflake_conn_prop['schema'])).collect()

# NOTE: "create or replace" rebuilds the warehouse every time this cell runs.
session.sql("create or replace warehouse {} with \
                WAREHOUSE_SIZE = XSMALL \
                AUTO_SUSPEND = 120 \
                AUTO_RESUME = TRUE".format(snowflake_conn_prop['warehouse'])).collect()

# Fix: the original omitted .collect() here, so this statement never ran and
# the warehouse was only current as a side effect of the CREATE above.
session.sql("use warehouse {}".format(snowflake_conn_prop['warehouse'])).collect()

# Confirm the context the rest of the notebook will run in.
print(session.sql('select current_warehouse(), current_database(), current_schema()').collect())

(0, 7, 0)
[Row(CURRENT_WAREHOUSE()='SNOWPARK_WH', CURRENT_DATABASE()='SNOWPARKDEMO_CHURN_DB', CURRENT_SCHEMA()='CHURN_CUSTOMER_SCHEMA')]


## Infer the file schema & load data into Snowflake

In [4]:
# Local parquet file to upload, the stage that receives it, and the table
# the raw rows are loaded into.
filename, stagename, rawtable = (
    "raw_telco_data.parquet",
    "rawdata",
    "RAW_PARQUET_DATA",
)

In [5]:
# (Re)create the internal stage with DIRECTORY enabled.
create_stage_sql = f"create or replace stage {stagename} DIRECTORY = (ENABLE = TRUE);"
session.sql(create_stage_sql).collect()

# Upload the local file; the PUT result is left as the cell's last
# expression so the notebook displays it.
session.file.put(filename, stagename)

[PutResult(source='raw_telco_data.parquet', target='raw_telco_data.parquet', source_size=3037540, target_size=3037552, source_compression='PARQUET', target_compression='PARQUET', status='UPLOADED', message='')]

In [6]:
# Register a named Parquet file format that INFER_SCHEMA refers to below.
parquet_format_sql = "CREATE OR REPLACE FILE FORMAT MY_PARQUET_FORMAT TYPE = PARQUET;"
session.sql(parquet_format_sql).collect()

# Build the raw table from a template: INFER_SCHEMA is run against the staged
# file and its result is aggregated into the column definition array.
infer_table_sql = f"CREATE OR REPLACE \
            TABLE {rawtable} USING TEMPLATE ( \
                SELECT ARRAY_AGG(OBJECT_CONSTRUCT(*)) \
                FROM \
                    TABLE( INFER_SCHEMA( \
                    LOCATION => '@{stagename}/{filename}', \
                    FILE_FORMAT => 'MY_PARQUET_FORMAT' \
                    ) \
                ) \
            );  "
session.sql(infer_table_sql).collect()

[Row(status='Table RAW_PARQUET_DATA successfully created.')]

## For incremental load 

For an incremental load, you might want to delete all existing rows and then load the new data into this table.

In [7]:
# Empty the raw table before reloading: delete() is called without a
# condition, so every existing row is removed.
raw_table_handle = session.table(rawtable)
dfClear = raw_table_handle.delete()

In [8]:
# Point a reader at the staged parquet file, declaring snappy compression.
staged_file = f"@{stagename}/{filename}"
parquet_reader = session.read.option("compression", "snappy")
dfRaw = parquet_reader.parquet(staged_file)

# Copy the staged rows into the raw table. FORCE=True is passed through as a
# COPY option so the file is loaded even if it was loaded before.
dfRaw.copy_into_table(rawtable, FORCE=True)

[Row(file='rawdata/raw_telco_data.parquet', status='LOADED', rows_parsed=100000, rows_loaded=100000, error_limit=1, errors_seen=0, first_error=None, first_error_line=None, first_error_character=None, first_error_column_name=None)]

In [9]:
# Take a five-row random sample of the loaded table and show it as a pandas
# DataFrame to sanity-check the load.
raw_snapshot = session.table(rawtable)
dfR = raw_snapshot.sample(n=5)
dfR.toPandas()

Unnamed: 0,COUNTRY,CITY,PHONE SERVICE,MULTIPLE LINES,LATITUDE,ONLINE SECURITY,SENIOR CITIZEN,MONTHLY CHARGES,STREAMING MOVIES,PAYMENT METHOD,...,CHURN SCORE,GENDER,LONGITUDE,ONLINE BACKUP,TOTAL CHARGES,CLTV,CHURN REASON,DEVICE PROTECTION,STATE,ZIP CODE
0,United States,Desert Hot Springs,No,No phone service,33.832799,Yes,False,47.6,No,Bank transfer (automatic),...,0,Female,-116.250973,Yes,3377.8,4795,do not know,Yes,California,92241
1,United States,Sheridan,Yes,Yes,38.984756,Yes,False,90.45,Yes,Mailed check,...,0,Female,-121.345074,Yes,5957.9,5509,do not know,Yes,California,95681
2,United States,Ventura,Yes,No,34.360261,No internet service,False,19.9,No internet service,Mailed check,...,0,Male,-119.306383,No internet service,791.15,4041,do not know,No internet service,California,93001
3,United States,Riverside,Yes,No,33.994676,No,False,81.45,No,Electronic check,...,0,Male,-117.372498,No,912.0,2916,do not know,No,California,92501
4,United States,Van Nuys,Yes,No,34.195685,No internet service,False,20.75,No internet service,Credit card (automatic),...,0,Male,-118.490752,No internet service,67.1,2406,do not know,No internet service,California,91406


In [10]:
# Close the Snowpark session that was opened earlier in the notebook.
session.close()