In [1]:
# NOTE(review): disabled cell, kept for reference only. It downloaded a Visual Crossing
# timeline CSV over HTTP and parsed it with pandas. The API key that used to be
# hardcoded in the URL has been replaced by a placeholder below: rotate the exposed key
# and, if this cell is ever revived, read the key from the environment
# (e.g. os.environ["VISUAL_CROSSING_API_KEY"]) instead of committing it.
# `requests` and `io` are not imported in the cells shown here; import them first.
#api_url = f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/birdwood%2C%20south%20australia?unitGroup=metric&include=days&key={VISUAL_CROSSING_API_KEY}&contentType=csv"
#response = requests.get(api_url)
#csv_data = response.content
#df = pd.read_csv(io.BytesIO(csv_data))

In [2]:
# Snowpark for Python
import pandas as pd
import numpy as np
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import Variant
from snowflake.snowpark.functions import udf,sum,col,array_construct,month,year,call_udf,lit
from snowflake.snowpark.version import VERSION

# Snowpark ML
from snowflake.ml.modeling.compose import ColumnTransformer
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import PolynomialFeatures, StandardScaler
from snowflake.ml.modeling.linear_model import LinearRegression
from snowflake.ml.modeling.model_selection import GridSearchCV
import plotly.figure_factory as ff
# Misc
import json
import logging
import warnings

# Only let ERROR-and-above records through the Snowpark session logger.
logger = logging.getLogger("snowflake.snowpark.session")
logger.setLevel(logging.ERROR)

# Hide every Python warning for the rest of the notebook run.
warnings.filterwarnings("ignore")

In [3]:
# Create Snowflake Session object
# Read the connection settings inside a context manager so the file handle is
# closed as soon as the JSON has been parsed (a bare open() left it dangling).
# NOTE(review): connection.json presumably holds account credentials; confirm it
# is kept out of version control.
with open('connection.json') as connection_file:
    connection_parameters = json.load(connection_file)
session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

# One round-trip to fetch the current user and server version for the banner below.
snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

# Current Environment Details
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))

User                        : JAY_DTA_AWS
Role                        : "ACCOUNTADMIN"
Database                    : "DASH_DB"
Schema                      : "DASH_SCHEMA"
Warehouse                   : "DASH_L"
Snowflake version           : 7.27.1
Snowpark for Python version : 1.5.1


In [5]:
# Reference the 'melbourne' table and keep only the modelling columns:
# the EVAPORATION target followed by the four predictors.
df = session.table('melbourne').select(
    ['EVAPORATION','MINTEMP','RAINFALL','WINDGUST','HUMIDITY']
)

In [6]:
# Pull the Snowpark DataFrame into a local pandas DataFrame for scikit-learn.
# The guard keeps this cell re-runnable: once `df` is already a pandas frame,
# calling .to_pandas() on it again would raise AttributeError.
if not isinstance(df, pd.DataFrame):
    df = df.to_pandas()

In [7]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,EVAPORATION,MINTEMP,RAINFALL,WINDGUST,HUMIDITY
0,18,22,7,39,64
1,16,30,7,26,75
2,15,22,6,33,55
3,17,23,9,39,55
4,17,24,7,43,72


In [8]:
# Keep only rows whose wind gust is at most 60 and whose humidity is at least 40.
# NOTE(review): the thresholds look like outlier cut-offs; units are not shown here.
df = df.loc[(df['WINDGUST'] <= 60) & (df['HUMIDITY'] >= 40)]

## ML Modelling

In [9]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Predictors are every remaining column of `df`; the target is EVAPORATION.
X = df.drop('EVAPORATION', axis=1)
y = df['EVAPORATION']

# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 1.0 (use all features) replaces the string 'auto', which newer scikit-learn
    # releases no longer accept for RandomForestRegressor. For a regressor 'auto'
    # meant "all features", so the search space is unchanged.
    'max_features': [1.0, 'sqrt', 'log2']
}

# 100 random draws from the grid above, each scored with 5-fold cross-validation.
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# With the default refit=True the search retrains the winning configuration on
# all of X_train, so best_estimator_ is ready to use and needs no second fit.
random_search.fit(X_train, y_train)
best_regressor = random_search.best_estimator_

# Make predictions on the test data
y_pred = best_regressor.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Model Parameters:", best_regressor.get_params())
print("Mean Squared Error:", mse)
print("R-squared:", r2)



Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Model Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 10, 'max_features': 'log2', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
Mean Squared Error: 7.1547742824557385
R-squared: 0.6430580313429655


In [10]:
# Score one hand-written observation. The values are labelled explicitly and the
# columns are put in the same order as the training frame, so the mapping from
# value to feature no longer depends on remembering the positional order.
sample = pd.DataFrame(
    [{'MINTEMP': 23, 'RAINFALL': 7, 'WINDGUST': 60, 'HUMIDITY': 80}]
)[list(X_train.columns)]
pred = best_regressor.predict(sample)
ans = round(pred[0],2)
ans

15.44

In [11]:
import pickle

# Serialise the tuned regressor, wrapped in a dict under the "model" key.
data = {"model":best_regressor}
with open('vineyard.pkl','wb') as pkl_out:
    pkl_out.write(pickle.dumps(data))

# Read the bundle straight back to confirm it round-trips.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
with open('vineyard.pkl','rb') as pkl_in:
    data = pickle.loads(pkl_in.read())

# Run the restored model on the same hand-written sample used for the in-memory model.
melbourne_model = data['model']
melbourne_model.predict([[23,7,60,80]])

array([15.44178033])