#  **SWT 2024 Evaluating your Machine Learning Models in Snowflake**
### Notebook 0 - Data Import
---
### What We'll Do:
In this notebook, we will use an External Access Integration to load data. Our data source is a GitHub repository, from which we'll fetch the data and store it directly in our Snowflake account. No S3 buckets or local downloads are needed.

Please remember to add your External Access in the Notebook. You can access this by clicking on the &#8942; dropdown &#8594; External Access &#8594; enable `GITHUB_EXTERNAL_ACCESS_INTEGRATION`

Our goal is to simplify the execution of this demo while showcasing the extensive capabilities of Snowflake!

## THIS WILL ONLY WORK ON A FULL SNOWFLAKE ACCOUNT, NOT ON A TRIAL ACCOUNT!

In [None]:
from snowflake.snowpark.context import get_active_session
import requests
import pandas as pd
from snowflake.snowpark import DataFrame
from io import StringIO

# Handle to the session reported as active by get_active_session(); the cells
# below issue all of their Snowflake calls through this object.
session = get_active_session()

In [None]:
from snowflake.snowpark.version import VERSION

# A single query returns one row holding (current user, Snowflake version).
snowflake_environment = session.sql('select current_user(), current_version()').collect()
current_user, snowflake_version = snowflake_environment[0][0], snowflake_environment[0][1]
snowpark_major, snowpark_minor, snowpark_patch = VERSION[0], VERSION[1], VERSION[2]

# Current Environment Details
print(f'User                        : {current_user}')
print(f'Role                        : {session.get_current_role()}')
print(f'Database                    : {session.get_current_database()}')
print(f'Schema                      : {session.get_current_schema()}')
print(f'Warehouse                   : {session.get_current_warehouse()}')
print(f'Snowflake version           : {snowflake_version}')
print(f'Snowpark for Python version : {snowpark_major}.{snowpark_minor}.{snowpark_patch}')

In [None]:
def fetch_dataset_from_github(url: str, timeout: float = 30.0) -> 'DataFrame':
    """Download a CSV file over HTTP and load it into a Snowpark DataFrame.

    Args:
        url: Address of the CSV file to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely and a stalled connection
            would hang the notebook cell.

    Returns:
        A Snowpark DataFrame created through the module-level ``session``
        from the parsed CSV contents.

    Raises:
        RuntimeError: If the server responds with any status other than 200.
            The message carries the status code and the response body.
        requests.exceptions.RequestException: Connection errors and timeouts
            raised by ``requests`` are propagated unchanged.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch CSV: {response.status_code} - {response.text}")

    # Decode the payload and parse it with pandas from an in-memory buffer,
    # so nothing has to be written to local disk.
    csv_data = response.content.decode('utf-8')
    pandas_df = pd.read_csv(StringIO(csv_data))

    # Convert the pandas DataFrame to a Snowpark DataFrame.
    return session.create_dataframe(pandas_df)

In [None]:
# Once an updated file is in the GitHub dataset repo, it's enough to re-run this cell to reload the new datasets.
from snowflake.snowpark.functions import col

# Schema that the loaded tables are written to (combined below with the
# session's current database and the table name). If your data should be
# saved somewhere else, change this to the right schema.
data_schema = 'DATA'

def rename_columns_to_uppercase(df):
    """Return ``df`` with every column aliased to its upper-cased name.

    The column order is preserved; each column in ``df.columns`` is selected
    once under the name ``<column>.upper()``.
    """
    uppercased = []
    for name in df.columns:
        uppercased.append(col(name).alias(name.upper()))
    return df.select(*uppercased)

# Base URL of the dataset folder in the GitHub repository, and the CSV files
# to load from it. Each file name is appended to url_base to build the
# download URL, and (without its .csv extension, upper-cased) also becomes
# the name of the target table.
url_base = 'https://github.com/MrHarBear/Evaluate_Snowflake_ML_Model/raw/main/datasets/'
url_files = [
    'claim_data.csv',
    'claim_data_new.csv',
    'customer_data.csv'
]

# Resolve the target database once; it does not change between iterations.
# get_current_database() returns None when the session has no database
# selected, so fail early with a clear message instead of an AttributeError.
current_database = session.get_current_database()
if current_database is None:
    raise RuntimeError(
        "No current database is set for this session; "
        "select a database before loading the datasets."
    )
database_name = current_database.strip('"')

# Load every listed CSV file into its own table
for url in url_files:
    # Get a Snowpark DataFrame from the file's download URL
    df = fetch_dataset_from_github(url_base + url)

    # Derive the table name from the file name: drop any path prefix and the
    # .csv extension, then upper-case it (claim_data.csv -> CLAIM_DATA)
    table_name = url.split('/')[-1].replace('.csv', '').upper()

    # Fully qualified name: <database>.<schema>.<table>
    full_path = f"{database_name}.{data_schema}.{table_name}"
    print(full_path)

    # Drop the table if it exists
    session.sql(f"DROP TABLE IF EXISTS {full_path}").collect()

    # Convert column names to uppercase
    df = rename_columns_to_uppercase(df)

    # Create the table and insert the data from the Snowpark DataFrame
    df.write.save_as_table(full_path, mode='overwrite')

    print(f"Table {table_name} created and data loaded successfully.")
    session.table(full_path).show(5)