# Build and Optimize Machine Learning Models in Snowflake Notebooks with Streamlit

In this notebook, we'll build and optimize machine learning models. We'll also sprinkle in UI interactivity with Streamlit widgets to allow users to experiment and play with the parameters and settings.

## Libraries used
- `streamlit` - build the frontend UI
- `pandas` - handle and wrangle data
- `numpy` - numerical computing
- `scikit-learn` - build machine learning models
- `altair` - data visualization

## Protocol
Here's a breakdown of what we'll be doing:
1. Load and prepare a dataset for modeling.
2. Perform grid search hyperparameter optimization using the radial basis function (RBF) kernel with the support vector machine (SVM) algorithm.
3. Visualize the hyperparameter optimization results via a heatmap and line charts.


## Build the ML Hyperparameter Optimization App using Streamlit

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import altair as alt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

st.title('ML Hyperparameter Optimization')

# Fetch the wine dataset shipped with scikit-learn and expose the pieces the
# rest of the script relies on under short names.
dataset = load_wine()
X, y = dataset.data, dataset.target
feature_names = dataset.feature_names

# Feature table with the class label appended; used for the preview below.
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# One metric tile per headline number, laid out side by side.
st.header('📖 Dataset Information')
class_count = len(dataset.target_names)
summary = (
    ("Number of features", len(feature_names)),
    ("Number of classes", class_count),
    ("Number of samples", len(y)),
)
for column, (label, value) in zip(st.columns(3), summary):
    with column:
        st.metric(label, value)

# List the classes as 1-based numbers (not the dataset's own target names).
formatted_classes = ", ".join(
    f"`{number}`" for number in range(1, class_count + 1)
)
st.write(f"Classes: {formatted_classes}")

# Collapsible preview of the first rows.
with st.expander("👀 See the dataset"):
    preview = df.head()
    st.write(preview)

# Hyperparameter controls: both C and gamma are chosen as exponents of 2.
st.header('⚙️ Hyperparameters')
st.subheader("Parameter Ranges (in powers of 2)")

# Even exponents from -10 to 10 offered by both range sliders.
powers = list(range(-10, 11, 2))


def _pick_power_range(column, label, help_text, symbol, options):
    """Draw a range slider over *options* in *column* and echo the choice.

    Returns the (low, high) exponent pair selected by the user. The info box
    shows the range both as powers of two and as decimal values.
    """
    with column:
        selected = st.select_slider(
            label,
            options=options,
            value=(-4, 4),
            help=help_text,
        )
        low, high = selected
        # Written with explicit escapes so the whitespace that is part of the
        # rendered text stays visible in the source.
        st.info(
            f"\n    {symbol} range: $2^{{{low}}}$ to $2^{{{high}}}$\n"
            "    \n"
            f"    {2**low:.6f} to {2**high:.6f}\n"
            "    "
        )
    return selected


c_range_col, gamma_range_col = st.columns(2)
C_power_range = _pick_power_range(
    c_range_col,
    'C (Regularization) range - powers of 2',
    'C = 2^value',
    'C',
    powers,
)
gamma_power_range = _pick_power_range(
    gamma_range_col,
    'γ range - powers of 2',
    'gamma = 2^value',
    'γ',
    powers,
)

# Exponent increments for the grid, plus the held-out test fraction.
st.subheader("Step Size for Grid Search")
c_step_col, gamma_step_col, test_size_col = st.columns(3)

with c_step_col:
    C_step = st.slider('C step size', min_value=0.1, max_value=2.0, value=0.5, step=0.1)
with gamma_step_col:
    gamma_step = st.slider('Gamma step size', min_value=0.1, max_value=2.0, value=0.5, step=0.1)
with test_size_col:
    test_size = st.slider('Test size', min_value=0.1, max_value=0.5, value=0.2)

st.divider()

# Hold out the fraction picked in the UI; the fixed seed keeps the split
# identical across Streamlit reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
)

# Standardize with statistics learned from the training rows only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Create parameter grid using powers of 2 with specified step sizes
def create_param_range(start_power, end_power, step):
    """Return ``2**p`` for exponents ``p`` from ``start_power`` up to ``end_power``.

    Exponents are ``start_power, start_power + step, ...`` and never exceed
    ``end_power``; ``end_power`` itself is included when it lies on the step
    lattice.

    Args:
        start_power: First exponent.
        end_power: Largest exponent allowed in the result.
        step: Positive spacing between consecutive exponents (int or float).

    Returns:
        A 1-D float ``numpy.ndarray`` of powers of two; empty when
        ``end_power < start_power``.

    Raises:
        ValueError: If ``step`` is not strictly positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive number")

    # Count whole steps explicitly instead of np.arange(start, end + step, step):
    # with a fractional step that form can emit an extra exponent beyond
    # end_power (e.g. 4.1 for the range -4..4 with step 0.3).
    # The small tolerance absorbs float round-off such as 79.9999999 -> 80.
    n_steps = int(np.floor((end_power - start_power) / step + 1e-9))
    powers = start_power + step * np.arange(n_steps + 1)

    # Float base so integer exponents (including negative ones) are accepted.
    return np.power(2.0, powers)

# The UI works in exponents; the estimator needs the actual 2**exponent values.
C_range = create_param_range(*C_power_range, C_step)
gamma_range = create_param_range(*gamma_power_range, gamma_step)

# 5-fold cross-validated search over every (C, gamma) combination.
param_grid = dict(C=C_range, gamma=gamma_range)
svm = SVC(kernel='rbf', random_state=42)
grid = GridSearchCV(svm, param_grid, cv=5)
grid.fit(X_train_scaled, y_train)

# Evaluate the fitted search object on the held-out rows.
y_pred = grid.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

# Winning parameters, both as raw values and as exponents of 2 for display.
best_C = grid.best_params_['C']
best_gamma = grid.best_params_['gamma']
best_C_power = np.log2(best_C)
best_gamma_power = np.log2(best_gamma)

# Three side-by-side panels: accuracy, best C, best gamma.
performance_col, best_c_col, best_gamma_col = st.columns(3)
with performance_col:
    st.header('Model Performance')
    st.metric("Accuracy", f"{accuracy:.2f}")
with best_c_col:
    st.header('Best Parameters')
    st.write("C")
    st.write(f"$2^{{{best_C_power:.1f}}}$ = {best_C:.6f}")
    st.write("")
with best_gamma_col:
    # NOTE(review): the header text consists of invisible characters only;
    # presumably a spacer so this column lines up with its neighbour - confirm.
    st.header('󠀠󠀠‎')
    st.write("γ")
    st.write(f"$2^{{{best_gamma_power:.1f}}}$ = {best_gamma:.6f}")

def _summarize_scores(scores, param):
    """Aggregate cross-validation scores for each distinct value of *param*.

    Args:
        scores: DataFrame with a ``'score'`` column and a column named *param*.
        param: Name of the column to group by (``'C'`` or ``'gamma'``).

    Returns:
        DataFrame with columns ``param, mean_score, std_score, count, stderr,
        ci_upper, ci_lower`` - one row per distinct value of *param*.
    """
    stats = (
        scores.groupby(param)['score']
        .agg(mean_score='mean', std_score='std', count='count')
        .reset_index()
    )
    # The spread here is across the *other* hyperparameter's values (each row
    # of `scores` is already a fold-averaged score), not across CV folds.
    stats['stderr'] = stats['std_score'] / np.sqrt(stats['count'])
    margin = 2 * stats['stderr']
    stats['ci_upper'] = stats['mean_score'] + margin
    stats['ci_lower'] = stats['mean_score'] - margin
    return stats


# One row per (C, gamma) candidate, expressed as exponents of 2.
results = pd.DataFrame(grid.cv_results_)
param_results = pd.DataFrame({
    # cv_results_ param_* columns can be object dtype (scikit-learn releases
    # before 1.5 always stored them that way), and np.log2 raises TypeError on
    # object data - cast to float first.
    'C': np.log2(results['param_C'].astype(float)),
    'gamma': np.log2(results['param_gamma'].astype(float)),
    'score': results['mean_test_score'],
})

# Per-parameter mean score with a +/- 2 standard-error band.
C_stats = _summarize_scores(param_results, 'C')
gamma_stats = _summarize_scores(param_results, 'gamma')

# Heatmap of the mean cross-validation score over the (C, gamma) grid.
st.header("Hyperparameter optimization")
color_schemes = ['yellowgreenblue', 'spectral', 'viridis', 'inferno', 'magma', 'plasma', 'turbo', 'greenblue', 'blues', 'reds', 'greens', 'purples', 'oranges']
selected_color = st.selectbox('Select heatmap color scheme:', color_schemes)

# Both axes are in exponents of 2 and pinned to the ranges chosen in the UI.
# NOTE(review): no encoding below references the row_number or fold outputs;
# the two transforms look removable - confirm before dropping them.
heatmap = alt.Chart(param_results).mark_rect().encode(
    x=alt.X('C:Q',
            title='C parameter',
            scale=alt.Scale(domain=[C_power_range[0], C_power_range[1]]),
            axis=alt.Axis(grid=True, gridDash=[5,5])),
    y=alt.Y('gamma:Q',
            title='γ parameter',
            scale=alt.Scale(domain=[gamma_power_range[0], gamma_power_range[1]]),
            axis=alt.Axis(grid=True, gridDash=[5,5])),
    color=alt.Color('score:Q',
                   title='Cross-validation Score',
                   scale=alt.Scale(scheme=selected_color)),
    tooltip=['C', 'gamma', alt.Tooltip('score:Q', format='.3f')]
).transform_window(
    row_number='row_number()'
).transform_fold(['score']
).properties(
    width=900,
    height=300,
)

# Faint rules at every sampled C and gamma value, drawn as a separate layer.
# Bound to `grid_lines` rather than `grid` so the fitted GridSearchCV object
# held in `grid` is not overwritten by a chart.
grid_lines = alt.Chart(param_results).mark_rule(color='darkgray', strokeOpacity=0.2).encode(
    x='C:Q'
).properties(
    width=900,
    height=300
) + alt.Chart(param_results).mark_rule(color='darkgray', strokeOpacity=0.2).encode(
    y='gamma:Q'
).properties(
    width=900,
    height=300
)

# Overlay the rules on the heatmap and render.
final_heatmap = (heatmap + grid_lines)
st.altair_chart(final_heatmap)

# Shared y-axis label for both per-parameter charts.
y_axis_title = 'Cross-validation Score'


def _score_profile(stats, field, axis_title, tooltip_title, power_range):
    """Build the layered mean-score chart for one hyperparameter.

    Layers, bottom to top: shaded ci_lower..ci_upper band, mean-score line,
    error bars over the same interval, and points carrying the tooltip.
    The x axis is the exponent column *field*, pinned to *power_range*.
    """
    base = alt.Chart(stats)
    x_field = f'{field}:Q'

    band = base.mark_area(opacity=0.3).encode(
        x=x_field,
        y=alt.Y('ci_lower:Q', title=y_axis_title),
        y2='ci_upper:Q',
    )

    line = base.mark_line().encode(
        x=alt.X(
            x_field,
            title=axis_title,
            scale=alt.Scale(domain=[power_range[0], power_range[1]]),
        ),
        y=alt.Y('mean_score:Q', title=y_axis_title, scale=alt.Scale(zero=False)),
    )

    errorbars = base.mark_errorbar().encode(
        x=x_field,
        y=alt.Y('ci_lower:Q', title=y_axis_title),
        y2='ci_upper:Q',
    )

    points = base.mark_point(size=50).encode(
        x=x_field,
        y=alt.Y('mean_score:Q', title=y_axis_title),
        tooltip=[
            alt.Tooltip(x_field, title=tooltip_title, format='.1f'),
            alt.Tooltip('mean_score:Q', title='Mean Score', format='.3f'),
            alt.Tooltip('std_score:Q', title='Std Dev', format='.3f'),
        ],
    )

    return (band + line + errorbars + points).properties(
        width=400,
        height=300,
    )


# Mean score as a function of each parameter's exponent.
c_plot = _score_profile(C_stats, 'C', 'C parameter', 'C', C_power_range)
gamma_plot = _score_profile(
    gamma_stats, 'gamma', 'γ parameter', 'Gamma', gamma_power_range
)

# Render the two charts next to each other.
left_panel, right_panel = st.columns(2)
with left_panel:
    st.altair_chart(c_plot)
with right_panel:
    st.altair_chart(gamma_plot)

## Resources

- An overview of [Snowflake Notebooks](https://www.snowflake.com/en/data-cloud/notebooks/) and its capabilities.
- About [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks) in the [Snowflake Documentation](https://docs.snowflake.com/).
- Further information on the use of Streamlit can be found at the [Streamlit Docs](https://docs.streamlit.io/).