In [19]:
import snowflake.snowpark.functions as F
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import StructType, StructField, FloatType
from snowflake.snowpark import Session
import os
import json

In [4]:
# Assuming you have environment variables set for the credentials
connection_parameters = {
    "account": os.getenv("SNOWFLAKE_ACCOUNT"),
    "user": os.getenv("SNOWFLAKE_USER"),
    "password": os.getenv("SNOWFLAKE_PASSWORD"),
    "schema": os.getenv("SNOWFLAKE_SCHEMA"),
    "database": os.getenv("SNOWFLAKE_DATABASE"),
    "role": os.getenv("SNOWFLAKE_ROLE"),
    "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
}

session = Session.builder.configs(connection_parameters).create()

ProgrammingError: 251005: User is empty

In [20]:
#with open(r'C:\Users\habdullayev\Documents\GitHub\SnowflakeML\Snowflake_ML_Intro\creds.json') as f:
    #connection_parameters = json.load(f)

#session = Session.builder.configs(connection_parameters).create()
#print(f"Current Database and schema: {session.get_fully_qualified_current_schema()}")
#print(f"Current Warehouse: {session.get_current_warehouse()}")

Current Database and schema: "MLOPS"."ADVERTISING"
Current Warehouse: "COMPUTE_WH"


In [21]:
ad_df = session.table("ADVERTISING")

In [18]:
ad_df.show()

-------------------------------------------
|"TV"   |"RADIO"  |"NEWSPAPER"  |"SALES"  |
-------------------------------------------
|230_1  |37_8     |69_2         |22_1     |
|44_5   |39_3     |45_1         |10_4     |
|17_2   |45_9     |69_3         |12_0     |
|151_5  |41_3     |58_5         |16_5     |
|180_8  |10_8     |58_4         |17_9     |
|8_7    |48_9     |75_0         |7_2      |
|57_5   |32_8     |23_5         |11_8     |
|120_2  |19_6     |11_6         |13_2     |
|8_6    |2_1      |1_0          |4_8      |
|199_8  |2_6      |21_2         |15_6     |
-------------------------------------------



In [27]:
def convert_to_float(column):
    return F.regexp_replace(F.col(column), '_', '.').cast(FloatType())

In [28]:
# Apply conversion to all columns
ad_df = ad_df.with_column("TV", convert_to_float("TV")) \
             .with_column("RADIO", convert_to_float("RADIO")) \
             .with_column("NEWSPAPER", convert_to_float("NEWSPAPER")) \
             .with_column("SALES", convert_to_float("SALES"))

In [30]:
missing_values = ad_df.select(
    F.sum(F.col("TV").is_null().cast("int")).alias("TV_missing"),
    F.sum(F.col("RADIO").is_null().cast("int")).alias("RADIO_missing"),
    F.sum(F.col("NEWSPAPER").is_null().cast("int")).alias("NEWSPAPER_missing"),
    F.sum(F.col("SALES").is_null().cast("int")).alias("SALES_missing")
).collect()

In [31]:
print("Missing Values:", missing_values)

Missing Values: [Row(TV_MISSING=0, RADIO_MISSING=0, NEWSPAPER_MISSING=0, SALES_MISSING=0)]


In [32]:
medians = ad_df.agg(
    F.median("TV").alias("TV_MEDIAN"),
    F.median("RADIO").alias("RADIO_MEDIAN"),
    F.median("NEWSPAPER").alias("NEWSPAPER_MEDIAN"),
    F.median("SALES").alias("SALES_MEDIAN")
).collect()

In [33]:
tv_median = medians[0]["TV_MEDIAN"]
radio_median = medians[0]["RADIO_MEDIAN"]
newspaper_median = medians[0]["NEWSPAPER_MEDIAN"]
sales_median = medians[0]["SALES_MEDIAN"]


In [34]:
ad_df = ad_df.na.fill({
    "TV": tv_median,
    "RADIO": radio_median,
    "NEWSPAPER": newspaper_median,
    "SALES": sales_median
})

In [36]:
ad_df.show()

-------------------------------------------
|"TV"   |"RADIO"  |"NEWSPAPER"  |"SALES"  |
-------------------------------------------
|230.1  |37.8     |69.2         |22.1     |
|44.5   |39.3     |45.1         |10.4     |
|17.2   |45.9     |69.3         |12.0     |
|151.5  |41.3     |58.5         |16.5     |
|180.8  |10.8     |58.4         |17.9     |
|8.7    |48.9     |75.0         |7.2      |
|57.5   |32.8     |23.5         |11.8     |
|120.2  |19.6     |11.6         |13.2     |
|8.6    |2.1      |1.0          |4.8      |
|199.8  |2.6      |21.2         |15.6     |
-------------------------------------------



In [37]:
ad_df.write.mode("overwrite").save_as_table("ADVERTISING_FEATURED")