In [None]:
!pip install streamlit

In [70]:
%%writefile FinalProj.py
#loading data into the data frame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
import openpyxl
from scipy.signal import find_peaks
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
import streamlit as st
import plotly.express as px

# Read the three spectral sheets (one per sample type) from the same workbook.
# header=None keeps the first spreadsheet row as an ordinary data row, so the
# later steps decide explicitly what to do with it.
spectra_workbook = "Sample no Filter.xlsx"
treated_waves, clean_waves, bw_waves = (
    pd.read_excel(spectra_workbook, sheet_name=sheet, header=None)
    for sheet in ("Samples", "Clean Water", "Blackwater")
)

# Group the replicate measurements of each sample, average them, and replace
# the replicate rows with a single averaged row per sample group.
def _average_replicates(raw, group_pattern, drop_first_row=True):
    """Collapse replicate spectra into one mean row per sample group.

    Parameters
    ----------
    raw : pandas.DataFrame
        Sheet read with ``header=None``: column 0 holds the sample names and
        the remaining columns hold the measurements.
    group_pattern : str
        Regular expression with a single capture group. The captured text of
        each sample name is used as its group key; rows whose name does not
        match get no key and are therefore left out by ``groupby``.
    drop_first_row : bool, default True
        Discard the first row of ``raw`` before grouping.

    Returns
    -------
    pandas.DataFrame
        One row per group: a ``Group`` column followed by the mean of every
        numeric column (``Col1`` ... ``ColN``).
    """
    frame = raw.iloc[1:].reset_index(drop=True) if drop_first_row else raw.copy()
    # First column = sample names, rest = numeric data
    frame.columns = ["Sample"] + [f"Col{i}" for i in range(1, frame.shape[1])]
    # astype(str) keeps .str.extract usable even when pandas did not read the
    # name column as text; expand=False returns the single capture as a Series.
    frame["Group"] = frame["Sample"].astype(str).str.extract(group_pattern, expand=False)
    return frame.groupby("Group").mean(numeric_only=True).reset_index()


##blackwater: group key is the BW<number> prefix (BW1, BW2, BW3...).
# The first row is not dropped here; it only survives if its name matches the pattern.
bw_waves = _average_replicates(bw_waves, r'^(BW\d+)', drop_first_row=False)

##treated_water: group key is the leading run of letters/digits in the sample name.
treated_waves = _average_replicates(treated_waves, r'^([A-Za-z0-9]+)')

##clean_water: same grouping rule as treated water.
clean_waves = _average_replicates(clean_waves, r'^([A-Za-z0-9]+)')

#merging related data (treated, then blackwater, then clean; original indices are kept)
tw_bw_clean_waves = pd.concat([treated_waves, bw_waves, clean_waves])


# Physiochemical measurements: exactly one column is pulled from each sheet.
# Order of reads: turbidity, total suspended solids, total solids.
physio_workbook = 'physiochemical data.xlsx'
turb, tss, ts = (
    pd.read_excel(physio_workbook, sheet_name=sheet, usecols=[column])
    for sheet, column in (
        ('Master Physiochem', 'Turbidity (NTU)'),
        ('TSS', 'TSS (% w/v)'),
        ('TS', '%TS (w/w)'),
    )
)

# Split the TSS frame into rows containing a missing value and complete rows.
tss_missing = tss[tss.isnull().any(axis=1)]
tss_not_missing = tss.dropna()

# Standardize the complete rows before KNN.
# `scaler` is a module-level name that impute_and_inverse_transform() reads when
# it is called, so it has to be fitted here, before that call.
scaler = StandardScaler()
tss_scaled = pd.DataFrame(scaler.fit_transform(tss_not_missing), columns = tss_not_missing.columns)

# Initialize the KNN imputer: 5 neighbors, weighted by distance.
imputer = KNNImputer(n_neighbors=5, weights ='distance')
# Fit on the standardized complete rows only, so the reference data the imputer
# learns from contains no missing values.
# Like `scaler`, `imputer` is read as a global by impute_and_inverse_transform().
# NOTE(review): the frame has a single column, so a row with a missing value has
# no other feature to measure neighbor distance on — confirm KNNImputer is not
# simply falling back to the column mean here.
imputer.fit(tss_scaled)

# function to impute and inverse transform the data
def impute_and_inverse_transform(data, fitted_scaler=None, fitted_imputer=None):
    """Impute missing values in ``data`` and return them in the original units.

    The frame is standardized, passed through the imputer, and transformed
    back, keeping the column names and index of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute (may contain NaN).
    fitted_scaler : object, optional
        Already-fitted object exposing ``transform`` and ``inverse_transform``.
        Defaults to the module-level ``scaler`` as bound at call time.
    fitted_imputer : object, optional
        Already-fitted object exposing ``transform``. Defaults to the
        module-level ``imputer`` as bound at call time.

    Returns
    -------
    pandas.DataFrame
        Same columns and index as ``data`` with the imputer's output mapped
        back through ``inverse_transform``.
    """
    # Passing the fitted objects explicitly avoids depending on which
    # scaler/imputer happens to be bound globally when the call is made;
    # omitting them keeps the previous behavior.
    active_scaler = scaler if fitted_scaler is None else fitted_scaler
    active_imputer = imputer if fitted_imputer is None else fitted_imputer

    # Ensure the scaled data is a DataFrame with the proper column names
    scaled_data = pd.DataFrame(active_scaler.transform(data), columns=data.columns, index=data.index)
    imputed_scaled = active_imputer.transform(scaled_data)
    return pd.DataFrame(
        active_scaler.inverse_transform(imputed_scaled),
        columns=data.columns,
        index=data.index,
    )

# Impute missing TSS values using the scaler/imputer fitted on TSS just above.
tss_imputed = impute_and_inverse_transform(tss)

# Split the turbidity frame into rows containing a missing value and complete rows.
turb_missing = turb[turb.isnull().any(axis=1)]
turb_not_missing = turb.dropna()

# Refit the standardizer on turbidity. This REBINDS the module-level `scaler`,
# so from here on impute_and_inverse_transform() uses the turbidity scaler.
scaler = StandardScaler()
turb_scaled = pd.DataFrame(scaler.fit_transform(turb_not_missing), columns = turb_not_missing.columns)

# Initialize and fit the KNN imputer for turbidity.
# (An earlier variant used KNNImputer(n_neighbors=5) without weighting.)
# Current settings: 3 neighbors, distance-weighted, so closer points contribute
# more to the imputed value. This also REBINDS the module-level `imputer`.
imputer = KNNImputer(n_neighbors=3, weights ='distance')
# Fit on the standardized complete rows only.
# NOTE(review): single-column input — confirm KNN has anything to measure
# distance on for rows whose only value is missing.
imputer.fit(turb_scaled)
# Impute missing turbidity values (must run after the refit above).
turb_imputed = impute_and_inverse_transform(turb)

# TSS and turbidity were imputed above; TS is used as read (no imputation applied).

# Positional slices: rows 0-95 go to the tw_* frames
# (presumably treated water — confirm against the workbook layout).
tw_tss = tss_imputed[0:96]
tw_ts = ts[0:96]
tw_turb = turb_imputed[0:96]

# Rows 96-105 go to the bw_* frames (presumably blackwater — confirm).
bw_tss = tss_imputed[96:106]
bw_ts = ts[96:106]
bw_turb = turb_imputed[96:106]

# Read the sample names from the master sheet.
sample_IDs = pd.read_excel('physiochemical data.xlsx', sheet_name='Master Physiochem', usecols=['Sample'])

# Merge related data: columns side by side (TSS, TS, turbidity) per group,
# then stack the tw rows on top of the bw rows.
tw_physio = pd.concat([tw_tss, tw_ts,tw_turb], axis=1)
bw_physio = pd.concat([bw_tss, bw_ts,bw_turb], axis=1)
tw_bw_physio = pd.concat([ tw_physio,bw_physio])

# Add the sample IDs as the first column. The IDs are assigned by position
# (.values), so the 'Sample' column of the master sheet must line up row-for-row
# with tw_bw_physio — NOTE(review): confirm the sheet has the same number of rows.
tw_bw_physio.insert(0, 'Sample', sample_IDs['Sample'].values)

# Names of the measurement columns (everything except 'Sample'), in table order.
value_cols = [col for col in tw_bw_physio.columns if col != 'Sample']

# Generate 7 random values per measurement, each drawn uniformly from a small
# range starting at 0. These rows are synthetic, not measured.
# NOTE(review): no random seed is set, so these values (and every statistic or
# model computed from them) change on each run of the script.
col1 = np.random.uniform(0, 0.00002, size=(7,))   # TSS (% w/v)
col2 = np.random.uniform(0, 0.00002, size=(7,))   # %TS (w/w)
col3 = np.random.uniform(0, 0.02, size=(7,))     # Turbidity (NTU)

# Stack the three vectors as columns. They are labelled with value_cols by
# position, so the per-line comments above only hold if value_cols is in the
# order TSS, %TS, Turbidity.
new_df = pd.DataFrame(
    np.column_stack([col1, col2, col3]),
    columns=value_cols
)

# Use the first 7 clean-water group names as the sample IDs (first column).
# NOTE(review): the 7 is hard-coded in both places; if clean_waves has fewer
# than 7 groups the extra rows get a missing 'Sample'.
new_df = pd.concat(
    [clean_waves[['Group']].rename(columns={'Group':'Sample'}).head(7), new_df],
    axis=1
)

# Append the synthetic rows below the measured ones and renumber the index.
tw_bw_clean_physio = pd.concat([tw_bw_physio, new_df], ignore_index=True)

# Assume your DataFrame is called df with columns:
# ['Sample', 'TSS (% w/v)', '%TS (w/w)', 'Turbidity (NTU)']

# Segregating samples according to sample name [treated water (A1, B2, etc), clean (Blank1, Blank2), blackwater feed (BW1, BW2) etc)
def classify_sample(sample):
    """Map a sample name to its sample-type label based on the name prefix.

    Names starting with "Blank" are clean water, names starting with "BW" are
    contaminated water, and every other name is treated water. The match is
    case-sensitive and ``sample`` must be a string.
    """
    prefix_labels = (
        ("Blank", "Clean Water"),
        ("BW", "Contaminated Water"),
    )
    for prefix, label in prefix_labels:
        if sample.startswith(prefix):
            return label
    return "Treated Water"

# Label every physiochemical row with its sample type, derived from the sample name.
tw_bw_clean_physio['Type'] = tw_bw_clean_physio['Sample'].apply(classify_sample)

# Full describe() table per sample type; page2() displays this `stats` frame.
stats = tw_bw_clean_physio.groupby('Type').describe()


# Mean and standard deviation of the three measurements per sample type.
# The result has two column levels: (measurement, statistic).
grouped_stats = tw_bw_clean_physio.groupby('Type')[['TSS (% w/v)', '%TS (w/w)', 'Turbidity (NTU)']].agg(['mean','std'])

# Split the two statistics into separate frames (rows = type, columns =
# measurement); page3() plots `means` as bars with `stds` as error bars.
means = grouped_stats.xs('mean', axis=1, level=1)
stds  = grouped_stats.xs('std', axis=1, level=1)

# Read only the first row of the "Samples" sheet and drop column 0, leaving one
# value per measurement column.
# NOTE(review): the first row is assumed to hold the wavelength of each column — confirm.
wavelengths = pd.read_excel("Sample no Filter.xlsx", sheet_name="Samples", nrows=1, header = None)
wavelengths = wavelengths.drop(columns=[0])

# Label every averaged spectrum with its sample type. The averaged frames have
# no 'Sample' column any more, so the type is derived from 'Group'.
tw_bw_clean_waves['Type'] = tw_bw_clean_waves['Group'].apply(classify_sample)

# Flatten the single-row frame into a 1-D array used as the x axis in the plots.
wavelengths_values = wavelengths.values.flatten()  # column 0 was already dropped above

# Compute the mean spectrum of each sample type (column-wise mean over the
# rows of that type, after removing the two label columns).
mean_spectra = {}
for sample_type, group in tw_bw_clean_waves.groupby('Type'):
    data = group.drop(columns=['Group','Type']).values
    mean_spectra[sample_type] = data.mean(axis=0)

# For every mean spectrum, locate local maxima and local minima.
# Minima are found as the peaks of the negated spectrum.
# The lists are ordered like mean_spectra, i.e. in groupby order of 'Type'.
maxima_sets = []
minima_sets = []
for mean in mean_spectra.values():
    maxima_idx, _ = find_peaks(mean)
    minima_idx, _ = find_peaks(-mean)
    maxima_sets.append(maxima_idx)
    minima_sets.append(minima_idx)

# Function to find common peaks within tolerance
def find_common_peaks(sets, tolerance=5):
    """Return the peak positions of the first set that recur in all other sets.

    Parameters
    ----------
    sets : sequence of iterables of int
        One collection of peak indices per spectrum. The first collection is
        the reference.
    tolerance : int, default 5
        A reference index counts as shared with another collection when that
        collection contains an index at most ``tolerance`` positions away.

    Returns
    -------
    numpy.ndarray
        Integer array of the reference indices found in every other
        collection, in reference order. Empty when nothing is shared or when
        ``sets`` itself is empty; with a single collection all of its indices
        are returned.
    """
    if len(sets) == 0:
        return np.array([], dtype=np.intp)

    reference, others = sets[0], sets[1:]
    common = [
        idx
        for idx in reference
        if all(
            any(abs(idx - other) <= tolerance for other in candidates)
            for candidates in others
        )
    ]
    # Force an integer dtype: np.array([]) would be float64, and a float array
    # (also after np.concatenate with an int array) cannot be used as an index.
    return np.array(common, dtype=np.intp)
# Peak / valley positions (column indices) shared by all sample types, within
# 5 positions. The first sample type in maxima_sets/minima_sets is the reference.
common_maxima = find_common_peaks(maxima_sets, tolerance=5)
common_minima = find_common_peaks(minima_sets, tolerance=5)

# Merge maxima and minima into one sorted index array and look up the value of
# the first `wavelengths` row at those positions.
# The indices are used positionally, so they must be integers.
selected_indices = np.sort(np.concatenate([common_maxima, common_minima]))
selected_wavelengths = wavelengths.iloc[0, selected_indices].values

# Build a table with one row per averaged spectrum, restricted to the selected
# (common peak / valley) positions.
rows = []
# First row: the literal "Sample_ID" followed by the selected wavelengths.
# NOTE(review): this row is stored as DATA (row 0 of compeak_sum), not as a
# header; the column names are assigned separately below. It only disappears
# later because the inner merge finds no sample called "Sample_ID".
header_row = ["Sample_ID"] + selected_wavelengths.tolist()
rows.append(header_row)

# Subsequent rows: group name + that spectrum's values at the selected positions.
for _, row in tw_bw_clean_waves.iterrows():
    sample_id = row['Group']
    values = row.drop(labels=['Group','Type']).values[selected_indices]
    rows.append([sample_id] + values.tolist())

# Convert to DataFrame (columns are numbered 0..k at this point).
compeak_sum = pd.DataFrame(rows)

# Name the columns: "Sample_ID", then "Wavelength_<value>" per selected wavelength.
# Later code selects some of these columns by their exact generated name.
compeak_sum.columns = ["Sample_ID"] + [f"Wavelength_{w}" for w in selected_wavelengths]
# Compute summary statistics of the selected wavelengths for each sample type.
# Result: one row per type with <wavelength>_mean/_std/_min/_max/_median columns.
summary_stats = []

for sample_type, group in tw_bw_clean_waves.groupby('Type'):
    # Rows of this type, restricted to the selected column positions.
    data = group.drop(columns=['Group','Type']).values[:, selected_indices]

    # Deliberately NOT named `stats`: that module-level name holds the
    # describe() table shown by page2() and must not be overwritten here.
    type_summary = {
        "Sample_Type": sample_type,
        "N_samples": data.shape[0]
    }

    # For each selected wavelength, summarize that column across the type's rows.
    for i, wl in enumerate(selected_wavelengths):
        column = data[:, i]
        type_summary[f"{wl}_mean"]   = column.mean()
        type_summary[f"{wl}_std"]    = column.std()   # numpy default (population) std
        type_summary[f"{wl}_min"]    = column.min()
        type_summary[f"{wl}_max"]    = column.max()
        type_summary[f"{wl}_median"] = np.median(column)

    summary_stats.append(type_summary)

# Convert to DataFrame; page3() displays this table.
compeak_summary = pd.DataFrame(summary_stats)
# Make sure the first column of both tables is called "Sample_ID" so they can
# be merged on it. For compeak_sum this is already the case; for the
# physiochemical table this renames its first column.
if compeak_sum.columns[0] != "Sample_ID":
    compeak_sum = compeak_sum.rename(columns={compeak_sum.columns[0]: "Sample_ID"})
if tw_bw_clean_physio.columns[0] != "Sample_ID":
    tw_bw_clean_physio = tw_bw_clean_physio.rename(columns={tw_bw_clean_physio.columns[0]: "Sample_ID"})

# Inner merge on Sample_ID: rows whose ID appears in only one table are dropped.
# NOTE(review): the physio IDs come from the 'Sample' sheet column while the
# spectral IDs are extracted group prefixes — confirm the two naming schemes match.
merged_df = pd.merge(
    tw_bw_clean_physio,
    compeak_sum,
    on="Sample_ID",
    how="inner"   # only keep matching IDs
)

# Column order: Sample_ID, then the physiochemical columns (including 'Type'),
# then the wavelength columns.
ordered_cols = ["Sample_ID"] + \
               [col for col in tw_bw_clean_physio.columns if col != "Sample_ID"] + \
               [col for col in compeak_sum.columns if col != "Sample_ID"]

# Combined table used by the PCA and regression steps.
waves_physio = merged_df[ordered_cols]

# Make sure both tables expose "Sample_ID" as their first column.
# (Both checks were already applied before waves_physio was built, so they
# do nothing at this point.)
if compeak_sum.columns[0] != "Sample_ID":
    compeak_sum = compeak_sum.rename(columns={compeak_sum.columns[0]: "Sample_ID"})
if tw_bw_clean_physio.columns[0] != "Sample_ID":
    tw_bw_clean_physio = tw_bw_clean_physio.rename(columns={tw_bw_clean_physio.columns[0]: "Sample_ID"})

# Merge absorbance + physiochemical data again, this time keeping only the ID
# and the three measurements from the physio table (so 'Type' is left out).
# This REBINDS merged_df.
merged_df = pd.merge(
    tw_bw_clean_physio[["Sample_ID","%TS (w/w)","TSS (% w/v)","Turbidity (NTU)"]],
    compeak_sum,
    on="Sample_ID",
    how="inner"
)

# Pairwise correlation between all remaining columns.
corr_matrix = merged_df.drop(columns=["Sample_ID"]).corr()

# Keep only the block "physio measurement (rows) vs wavelength (columns)";
# page3() draws this as a heatmap.
physio_corr = corr_matrix.loc[["%TS (w/w)","TSS (% w/v)","Turbidity (NTU)"], compeak_sum.columns[1:]]
# --- PCA on the combined table ---
# (An earlier variant ran PCA() without fixing the number of components and
# stored only PC1/PC2; it has been replaced by the code below.)

# Keep only numeric columns: this drops Sample_ID and 'Type' but KEEPS the three
# physiochemical measurements alongside the wavelength columns.
X = waves_physio.select_dtypes(include=[np.number])

# Standardize so every column contributes on the same scale.
X_scaled = StandardScaler().fit_transform(X)

# Run PCA with a fixed number of components.
# NOTE(review): 8 components need at least 8 numeric columns and 8 rows — confirm
# this holds for the selected wavelengths.
pca = PCA(n_components=8)   # explicitly set number of components
X_pca = pca.fit_transform(X_scaled)

# Store all component scores (PC1..PCn) plus the sample type of each row.
# All components are kept for the scree plot; only PC1/PC2 are used for the biplot.
pca_df = pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(pca.n_components_)])
pca_df["Type"] = waves_physio["Type"].values

# --- Multi-wavelength linear regression for TSS and turbidity ---
# Predictors: only the wavelength columns (IDs, type and all three physio
# measurements are dropped). This REBINDS the module-level X used for the PCA above.
X = waves_physio.drop(columns=["Sample_ID","Type","%TS (w/w)","TSS (% w/v)","Turbidity (NTU)"])
y_tss = waves_physio["TSS (% w/v)"]
y_turb = waves_physio["Turbidity (NTU)"]

# Standardize predictors (rebinds the module-level `scaler` and X_scaled).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit one ordinary least-squares model per target.
model_tss = LinearRegression().fit(X_scaled, y_tss)
model_turb = LinearRegression().fit(X_scaled, y_turb)

# Print intercept and per-wavelength coefficients.
# Coefficients refer to the standardized predictors.
print("TSS model intercept:", model_tss.intercept_)
print("TSS model coefficients:", dict(zip(X.columns, model_tss.coef_)))

print("\nTurbidity model intercept:", model_turb.intercept_)
print("Turbidity model coefficients:", dict(zip(X.columns, model_turb.coef_)))

# Predict on the same rows the models were fitted on (no hold-out set), so the
# metrics below describe the fit, not out-of-sample performance.
y_tss_pred = model_tss.predict(X_scaled)
y_turb_pred = model_turb.predict(X_scaled)

# Same predictions computed a second time; the result is identical.
y_tss_pred = model_tss.predict(X_scaled)
y_turb_pred = model_turb.predict(X_scaled)

# In-sample R^2, MSE and RMSE for TSS.
tss_r2 = r2_score(y_tss, y_tss_pred)
tss_mse = mean_squared_error(y_tss, y_tss_pred)
tss_rmse = np.sqrt(tss_mse)

# In-sample R^2, MSE and RMSE for turbidity.
turb_r2 = r2_score(y_turb, y_turb_pred)
turb_mse = mean_squared_error(y_turb, y_turb_pred)
turb_rmse = np.sqrt(turb_mse)

# Prepare predictors and targets again (same selection as the regression above:
# wavelength columns as predictors, TSS and turbidity as targets).
X = waves_physio.drop(columns=["Sample_ID","Type","%TS (w/w)","TSS (% w/v)","Turbidity (NTU)"])
y_tss = waves_physio["TSS (% w/v)"]
y_turb = waves_physio["Turbidity (NTU)"]

# Standardize predictors with a fresh scaler (rebinds `scaler` and X_scaled).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Refit both models on the same inputs; these are the models the coefficient
# table below reads.
model_tss = LinearRegression().fit(X_scaled, y_tss)
model_turb = LinearRegression().fit(X_scaled, y_turb)

# --- Step 4: Function to print equation ---
def print_equation(model, X, target_name):
    """Build the fitted linear model's equation as a string.

    Despite its name the function does not print; it returns text of the form
    ``"<target> = <intercept> + <coef>*<feature> + ..."`` with the intercept
    and every coefficient formatted to three decimals. Coefficients from
    ``model.coef_`` are paired with ``X.columns`` by position.
    """
    pieces = []
    for weight, column in zip(model.coef_, X.columns):
        pieces.append(f"{weight:.3f}*{column}")
    left_side = f"{target_name} = {model.intercept_:.3f} + "
    return left_side + " + ".join(pieces)


# Equation output is currently disabled. Note that print_equation() returns the
# string rather than printing it, so these calls would need print(...) around them.
#print_equation(model_tss, X, "TSS (% w/v)")
#print_equation(model_turb, X, "Turbidity (NTU)")



# Build a comparison table: one row per wavelength column with its coefficient
# in the TSS model and in the turbidity model.
coef_df = pd.DataFrame({
    "Wavelength": X.columns,
    "TSS_Coefficient": model_tss.coef_,
    "Turbidity_Coefficient": model_turb.coef_
})

# Add absolute values for easier ranking
coef_df["|TSS|"] = coef_df["TSS_Coefficient"].abs()
coef_df["|Turbidity|"] = coef_df["Turbidity_Coefficient"].abs()

# Sort by the strongest TSS contributors (largest absolute coefficient first).
coef_df_sorted = coef_df.sort_values(by="|TSS|", ascending=False)

print("\nCoefficient Comparison Table:")
print(coef_df_sorted)

# Rebuild coef_df from the same models. This REBINDS coef_df without the two
# absolute-value columns; coef_df_sorted still has them.
coef_df = pd.DataFrame({
    "Wavelength": X.columns,
    "TSS_Coefficient": model_tss.coef_,
    "Turbidity_Coefficient": model_turb.coef_
})

# --- One-wavelength-at-a-time regressions ---
# Predictors: the wavelength columns only; targets: TSS and turbidity.
X = waves_physio.drop(columns=["Sample_ID","Type","%TS (w/w)","TSS (% w/v)","Turbidity (NTU)"])
y_tss = waves_physio["TSS (% w/v)"]
y_turb = waves_physio["Turbidity (NTU)"]

results = []

# Fit a separate simple linear regression per wavelength column and record its
# in-sample R^2 and RMSE for each target (predictors are NOT standardized here).
# NOTE(review): the loop rebinds the module-level names model_tss, model_turb,
# y_tss_pred, y_turb_pred, tss_r2, tss_rmse, turb_r2 and turb_rmse; after the
# loop they describe the LAST wavelength only, not the multi-wavelength models.
for feature in X.columns:
    X_single = X[[feature]]  # one variable at a time
    
    # Fit regression for TSS
    model_tss = LinearRegression().fit(X_single, y_tss)
    y_tss_pred = model_tss.predict(X_single)
    tss_r2 = r2_score(y_tss, y_tss_pred)
    tss_rmse = np.sqrt(mean_squared_error(y_tss, y_tss_pred))
    
    # Fit regression for Turbidity
    model_turb = LinearRegression().fit(X_single, y_turb)
    y_turb_pred = model_turb.predict(X_single)
    turb_r2 = r2_score(y_turb, y_turb_pred)
    turb_rmse = np.sqrt(mean_squared_error(y_turb, y_turb_pred))
    
    # NOTE(review): the "R¬≤" in the keys below looks like a mis-encoded
    # "R squared" sign. It is left untouched because the same keys are used as
    # column names by the px.bar calls below and possibly elsewhere.
    results.append({
        "Feature": feature,
        "TSS R¬≤": round(tss_r2, 3),
        "TSS RMSE": round(tss_rmse, 3),
        "Turbidity R¬≤": round(turb_r2, 3),
        "Turbidity RMSE": round(turb_rmse, 3)
    })

# One row per wavelength with the rounded metrics.
decomp_results = pd.DataFrame(results)

# Plotly bar charts of the per-wavelength R^2 values (bar color encodes the same value).
fig_tss = px.bar(decomp_results, x="Feature", y="TSS R¬≤", title="TSS R¬≤ by Wavelength", color="TSS R¬≤")
fig_turb = px.bar(decomp_results, x="Feature", y="Turbidity R¬≤", title="Turbidity R¬≤ by Wavelength", color="Turbidity R¬≤")

# Hand-picked subset of wavelength columns to compare against the full set.
# NOTE(review): these are exact generated column names ("Wavelength_<value>");
# if the selected wavelengths change, this lookup raises KeyError.
top_wavelengths = [
    "Wavelength_951.46",
    "Wavelength_957.655",
    "Wavelength_970.044",
    "Wavelength_976.238",
    "Wavelength_1186.846"
]

X_top = waves_physio[top_wavelengths]

# Full predictor set: every wavelength column (identifiers, type and the three
# physiochemical measurements are dropped).
X_full = waves_physio.drop(columns=[
    "Sample_ID","Type","%TS (w/w)","TSS (% w/v)","Turbidity (NTU)"
])

# Targets
y_tss = waves_physio["TSS (% w/v)"]
y_turb = waves_physio["Turbidity (NTU)"]

# Scaling. The same StandardScaler object is fitted twice, so after these two
# lines `scaler` holds the X_full statistics only; the returned arrays are
# each scaled with their own fit.
scaler = StandardScaler()

X_top_scaled = scaler.fit_transform(X_top)
X_full_scaled = scaler.fit_transform(X_full)

# Fit linear regression models on the top-wavelength subset ...
model_tss_top = LinearRegression().fit(X_top_scaled, y_tss)
model_turb_top = LinearRegression().fit(X_top_scaled, y_turb)

# ... and on the full wavelength set.
model_tss_full = LinearRegression().fit(X_full_scaled, y_tss)
model_turb_full = LinearRegression().fit(X_full_scaled, y_turb)
#evaluating fit
def evaluate(model, X, y):
    """Score a fitted model on (X, y).

    Returns a tuple ``(r2, rmse)``: the coefficient of determination and the
    root of the mean squared error between ``y`` and ``model.predict(X)``.
    """
    predicted = model.predict(X)
    return r2_score(y, predicted), np.sqrt(mean_squared_error(y, predicted))

# NOTE: this section repeats the top-wavelength / full-set setup and fitting
# that appears immediately before the first evaluate() definition; it rebinds
# the same names with the same inputs.

# Hand-picked subset of wavelength columns.
# NOTE(review): exact generated column names; a change in the selected
# wavelengths makes this lookup raise KeyError.
top_wavelengths = [
    "Wavelength_951.46",
    "Wavelength_957.655",
    "Wavelength_970.044",
    "Wavelength_976.238",
    "Wavelength_1186.846"
]

X_top = waves_physio[top_wavelengths]

# Full predictor set: every wavelength column (identifiers, type and the three
# physiochemical measurements are dropped).
X_full = waves_physio.drop(columns=[
    "Sample_ID","Type","%TS (w/w)","TSS (% w/v)","Turbidity (NTU)"
])

# Targets
y_tss = waves_physio["TSS (% w/v)"]
y_turb = waves_physio["Turbidity (NTU)"]

# Scaling. One scaler object is fitted twice; afterwards it holds the X_full fit.
scaler = StandardScaler()

X_top_scaled = scaler.fit_transform(X_top)
X_full_scaled = scaler.fit_transform(X_full)

# Fit linear regression models on the top-wavelength subset ...
model_tss_top = LinearRegression().fit(X_top_scaled, y_tss)
model_turb_top = LinearRegression().fit(X_top_scaled, y_turb)

# ... and on the full wavelength set.
model_tss_full = LinearRegression().fit(X_full_scaled, y_tss)
model_turb_full = LinearRegression().fit(X_full_scaled, y_turb)
#evaluating fit
def evaluate(model, X, y):
    """Return ``(r2, rmse)`` of ``model`` evaluated on the given data.

    ``r2`` is ``r2_score(y, predictions)`` and ``rmse`` is the square root of
    ``mean_squared_error(y, predictions)``, where the predictions come from
    ``model.predict(X)``.
    """
    estimates = model.predict(X)
    fit_quality = r2_score(y, estimates)
    error_size = np.sqrt(mean_squared_error(y, estimates))
    return fit_quality, error_size

# In-sample R^2 / RMSE for TSS: top-wavelength model vs full-set model.
# Each model is scored on the same scaled matrix it was fitted on.
tss_r2_top, tss_rmse_top = evaluate(model_tss_top, X_top_scaled, y_tss)
tss_r2_full, tss_rmse_full = evaluate(model_tss_full, X_full_scaled, y_tss)

# Same comparison for turbidity.
turb_r2_top, turb_rmse_top = evaluate(model_turb_top, X_top_scaled, y_turb)
turb_r2_full, turb_rmse_full = evaluate(model_turb_full, X_full_scaled, y_turb)

# Coefficients of the top-wavelength models, keyed by wavelength column name
# (coefficients are paired with top_wavelengths by position).
# TSS coefficients
coef_tss = dict(zip(top_wavelengths, model_tss_top.coef_))
# Turbidity coefficients
coef_turb = dict(zip(top_wavelengths, model_turb_top.coef_))



# -------------------------
# Page 1: Project Overview
# -------------------------
def page1():
    """Render the project-overview page: a title and a static markdown text.

    The emoji and the arrow in the displayed strings were mis-encoded
    (UTF-8 bytes shown as single-byte characters) and are restored here.
    """
    st.title("🌊💧🧪 Absorbance → TSS & Turbidity Project 🚰⚗️")

    st.markdown("""
    ### 🌍 Why This Project?
    Water quality is a critical issue worldwide 🌊. Suspended solids and turbidity directly affect ecosystems 🐟,
    drinking water safety 🚰, and wastewater treatment efficiency 🏭. By using absorbance data at specific wavelengths,
    we can build predictive models that help monitor and manage water quality more effectively 💧.

    ### 🔬 Scientific Motivation
    Spectroscopy provides a rapid, non-destructive way to analyze water samples 🧪. Instead of relying solely on
    traditional lab methods ⚗️, absorbance readings can be transformed into meaningful predictions of TSS and Turbidity.
    This bridges environmental science 🌱 with data science 📊, creating scalable solutions for water monitoring.

    ### 🚀 Personal Goal
    My aim is to demonstrate how reproducible modeling workflows can connect raw spectral data 📈 with real-world
    water quality outcomes 🌊. This project is both a scientific exploration 🔬 and a practical tool for
    wastewater management 🏭 and clean water initiatives 💧.
    """)

# -------------------------
# Page 2: IDA
# -------------------------
def page2():
    """Render the initial-data-analysis (IDA) page.

    A select box chooses one of four views: a preview of the averaged spectra,
    a preview of the physiochemical table, the per-type descriptive statistics
    (``stats``), or a heatmap of missing values in the raw turbidity / TSS / TS
    columns.

    Changes compared with the previous version: the mis-encoded emoji in the
    sub-headers are restored, and the matplotlib figure is closed after it has
    been rendered so that figures do not pile up across Streamlit reruns.
    """
    st.title("IDA")

    option = st.selectbox(
        "Select what to view:",
        [
            "Option 1: Data collection and importation",
            "Option 2: Data cleaning and preprocessing",
            "Option 3: Basic descriptive statistics",
            "Option 4: Missing data analysis"
        ]
    )

    if option == "Option 1: Data collection and importation":
        st.subheader("📥 Data Collection and Importation")
        st.write("Placeholder text: Describe how absorbance data was collected.")
        st.dataframe(tw_bw_clean_waves.head())

    elif option == "Option 2: Data cleaning and preprocessing":
        st.subheader("🧹 Data Cleaning and Preprocessing")
        st.write("Placeholder text: Describe splitting between blackwater, treated water, and clean water.")
        st.dataframe(tw_bw_clean_physio.head())

    elif option == "Option 3: Basic descriptive statistics":
        st.subheader("📊 Basic Descriptive Statistics")
        st.dataframe(stats)

    elif option == "Option 4: Missing data analysis":
        st.subheader("❓ Missing Data Analysis")
        # Raw (pre-imputation) columns side by side; True cells mark missing values.
        merged_turbtssts = pd.concat([turb, tss, ts], axis=1)
        fig, ax = plt.subplots(figsize=(12,18))  # larger heatmap
        sns.heatmap(merged_turbtssts.isna(), cmap="rocket", ax=ax)
        st.pyplot(fig)
        # pyplot keeps every figure it created alive until it is closed.
        plt.close(fig)

# -------------------------
# Page 3: EDA
# -------------------------
def page3():
    """Render the exploratory-data-analysis (EDA) page.

    A select box chooses one of six views:

    1. bar chart of the mean physiochemical values per sample type
       (``means`` with ``stds`` as error bars, log y axis);
    2. all spectra overlaid, colored by sample type, with mean and +/- 1 std band;
    3. interactive Plotly figure showing one sample type at a time;
    4. mean spectra with the peaks / valleys shared by all sample types marked;
    5. the ``compeak_summary`` table;
    6. heatmap of ``physio_corr``.

    Changes compared with the previous version: mis-encoded emoji in the
    displayed strings are restored; matplotlib figures are closed after
    rendering so they do not accumulate across Streamlit reruns; the local
    copy of ``find_common_peaks`` is replaced by the module-level function;
    the color map is defined once; an unused standard deviation in view 3 is
    no longer computed.
    """
    st.title("EDA: Exploratory Data Analysis")
    st.markdown(
        "<h3 style='color:#C71585; font-weight:bold;'>Click the expand button in the top right corner of the figure for a zoomed in view! - 🔍</h3>",
        unsafe_allow_html=True
    )

    option = st.selectbox(
        "Select what to view:",
        [
            "Option 1: Visualizing means of physiochemical characteristics vs sample type",
            "Option 2: Visualizing spectral fingerprints according to sample types",
            "Option 3: Separate sample types and their absorbance plots-Interactive!",
            "Option 4: Common peaks and valleys",
            "Option 5: Summary statistics for common peaks and valleys",
            "Option 6: Correlation? You decide"
        ]
    )

    # One line color per sample-type label produced by classify_sample().
    type_colors = {"Clean Water":"blue","Contaminated Water":"red","Treated Water":"green"}

    if option == "Option 1: Visualizing means of physiochemical characteristics vs sample type":
        bar_colors = ['#1f77b4', '#ff7f0e', '#2ca02c']  # one per measurement column
        fig, ax = plt.subplots(figsize=(14,10))
        means.plot(kind='bar', yerr=stds, capsize=4, color=bar_colors, ax=ax)
        ax.set_title("Mean values by Sample Type")
        ax.set_yscale('log')
        ax.set_ylabel("Mean Value (log)")
        ax.legend(title="Parameter")
        st.pyplot(fig)
        plt.close(fig)

    elif option == "Option 2: Visualizing spectral fingerprints according to sample types":
        fig, ax = plt.subplots(figsize=(16,12))
        sns.set_style("whitegrid")
        for sample_type, group in tw_bw_clean_waves.groupby('Type'):
            data = group.drop(columns=['Group','Type']).values
            # Faint line per individual spectrum
            for row in data:
                ax.plot(wavelengths_values, row, color=type_colors[sample_type], alpha=0.2)
            mean = data.mean(axis=0)
            std = data.std(axis=0)
            # Bold mean line plus a +/- 1 std band
            ax.plot(wavelengths_values, mean, color=type_colors[sample_type], linewidth=2, label=f"{sample_type} mean")
            ax.fill_between(wavelengths_values, mean-std, mean+std, color=type_colors[sample_type], alpha=0.2)
        ax.set_title("Spectra by Sample Type")
        ax.set_xlabel("Wavelength (nm)")
        ax.set_ylabel("Absorbance (AU)")
        ax.legend()
        st.pyplot(fig)
        plt.close(fig)

    elif option == "Option 3: Separate sample types and their absorbance plots-Interactive!":
        # traces: every Plotly line; trace_groups: for each sample type, the
        # positions of its lines inside `traces` (used to toggle visibility).
        traces, trace_groups = [], {}
        for sample_type, group in tw_bw_clean_waves.groupby('Type'):
            data = group.drop(columns=['Group','Type']).values
            mean = data.mean(axis=0)
            trace_groups[sample_type] = []
            # Only the "Clean Water" lines are visible initially.
            initially_visible = (sample_type=="Clean Water")
            for row in data:
                idx = len(traces)
                traces.append(go.Scatter(x=wavelengths_values,y=row,mode='lines',
                                         line=dict(color=type_colors[sample_type],width=1),
                                         opacity=0.2,visible=initially_visible,
                                         name=f"{sample_type} sample"))
                trace_groups[sample_type].append(idx)
            idx = len(traces)
            traces.append(go.Scatter(x=wavelengths_values,y=mean,mode='lines',
                                     line=dict(color=type_colors[sample_type],width=3),
                                     visible=initially_visible,
                                     name=f"{sample_type} mean"))
            trace_groups[sample_type].append(idx)

        # One dropdown entry per sample type: show that type's lines, hide all others.
        buttons = []
        for sample_type in trace_groups:
            visible_mask = [False]*len(traces)
            for idx in trace_groups[sample_type]:
                visible_mask[idx]=True
            buttons.append(dict(label=sample_type,method="update",
                                args=[{"visible":visible_mask},{"title":f"{sample_type} Spectra"}]))
        fig = go.Figure(data=traces)
        fig.update_layout(title="Spectra by Sample Type",xaxis_title="Wavelength (nm)",
                          yaxis_title="Absorbance (AU)",template="plotly_white",
                          height=900,width=1200,updatemenus=[dict(active=0,buttons=buttons,x=1,y=1.15,
                          xanchor="right",yanchor="top")])
        st.plotly_chart(fig)

    elif option == "Option 4: Common peaks and valleys":
        st.write("Placeholder text: Why identifying common peaks and valleys is important.")
        # Mean and std spectrum per sample type (local to this view).
        mean_spectra,std_spectra={},{}
        for sample_type, group in tw_bw_clean_waves.groupby('Type'):
            data = group.drop(columns=['Group','Type']).values
            mean_spectra[sample_type]=data.mean(axis=0)
            std_spectra[sample_type]=data.std(axis=0)

        # Local maxima / minima of every mean spectrum (minima = peaks of the
        # negated spectrum), then the positions shared by all types.
        maxima_sets,minima_sets=[],[]
        for mean in mean_spectra.values():
            maxima_idx,_=find_peaks(mean)
            minima_idx,_=find_peaks(-mean)
            maxima_sets.append(maxima_idx)
            minima_sets.append(minima_idx)
        common_maxima=find_common_peaks(maxima_sets)
        common_minima=find_common_peaks(minima_sets)

        fig,ax=plt.subplots(figsize=(16,12))
        for sample_type in mean_spectra:
            mean=mean_spectra[sample_type]
            std=std_spectra[sample_type]
            ax.plot(wavelengths_values,mean,color=type_colors[sample_type],linewidth=2,label=f"{sample_type} mean")
            ax.fill_between(wavelengths_values,mean-std,mean+std,color=type_colors[sample_type],alpha=0.2)

        # The markers are drawn on the mean curve of the FIRST sample type
        # (the reference type used by find_common_peaks).
        reference_mean = next(iter(mean_spectra.values()))
        ax.scatter(wavelengths_values[list(common_maxima)],
                   reference_mean[list(common_maxima)],
                   marker='x', s=120, color='black', linewidths=3,
                   label="Common maxima")

        ax.scatter(wavelengths_values[list(common_minima)],
                   reference_mean[list(common_minima)],
                   marker='x', s=120, color='purple', linewidths=3,
                   label="Common minima")

        ax.set_title("Spectra with Common Maxima and Minima", fontsize=16, fontweight='bold')
        ax.set_xlabel("Wavelength (nm)", fontsize=14)
        ax.set_ylabel("Absorbance (AU)", fontsize=14)
        ax.legend(fontsize=12, loc="upper right")
        st.pyplot(fig)
        plt.close(fig)

    # --- Option 5 ---
    elif option == "Option 5: Summary statistics for common peaks and valleys":
        st.subheader("📑 Summary Statistics for Common Peaks and Valleys")
        st.write("Below is the summary DataFrame of common peaks and valleys across sample types.")
        st.dataframe(compeak_summary)

    # --- Option 6 ---
    elif option == "Option 6: Correlation? You decide":
        st.subheader("🔗 Correlation Analysis")
        st.write("Heatmap showing correlation of absorbance at common wavelengths with TS, TSS, and Turbidity.")

        fig, ax = plt.subplots(figsize=(18,12))  # larger heatmap
        sns.heatmap(
            physio_corr,
            cmap="coolwarm",
            annot=True,
            fmt=".2f",
            cbar_kws={'label': 'Correlation coefficient'},
            ax=ax
        )
        ax.set_title("Correlation of Absorbance at Common Wavelengths with TS, TSS, and Turbidity",
                     fontsize=18, fontweight="bold")
        ax.set_xlabel("Wavelengths", fontsize=14)
        ax.set_ylabel("Physiochemical Parameters", fontsize=14)
        st.pyplot(fig)
        plt.close(fig)

def page4():
    """Render the imputation page: original vs imputed value distributions.

    For TSS and for turbidity, a histogram (with KDE) of the originally
    observed values is overlaid with a histogram of the full column after
    imputation.

    Changes compared with the previous version:
    - the observed values are passed to seaborn as a Series (like the imputed
      values) instead of a one-column DataFrame, so both histograms are drawn
      the same way and the requested color/label apply to each;
    - each plot is drawn on its own explicit Axes instead of the implicit
      current pyplot figure;
    - figures are closed after rendering so they do not accumulate across
      Streamlit reruns;
    - the mis-encoded emoji in the title is restored.
    """
    st.title("Imputing Missing Values 🧩")

    # Placeholder explanatory text
    st.markdown(
        "<p style='color:black; font-size:16px;'>"
        "Placeholder: Describe the imputation technique used (e.g., KNN, mean, median) and why it was chosen."
        "</p>",
        unsafe_allow_html=True
    )

    # (observed values, values after imputation, plot title)
    comparisons = [
        (
            tss_not_missing['TSS (% w/v)'],
            tss_imputed['TSS (% w/v)'],
            'Distribution of Original vs Imputed TSS',
        ),
        (
            turb_not_missing['Turbidity (NTU)'],
            turb_imputed['Turbidity (NTU)'],
            'Distribution of Original vs Imputed Turbidity (NTU)',
        ),
    ]

    for observed, after_imputation, title in comparisons:
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(observed, kde=True, color='blue', alpha=0.5,
                     label='Original (non-missing)', ax=ax)
        sns.histplot(after_imputation, kde=True, color='red', alpha=0.5,
                     label='Imputed', ax=ax)
        ax.set_title(title)
        ax.legend()
        st.pyplot(fig)
        # pyplot keeps every figure it created alive until it is closed.
        plt.close(fig)

def page5():
    """Render the PCA page with a toggle between a biplot and a scree plot.

    The biplot draws PC1 vs PC2 twice (once coloured by sample type, once all
    grey) plus one loading arrow per column of ``X``; a dropdown switches
    between the two scatter variants. The scree plot shows the percentage of
    variance explained by each principal component.

    Reads ``pca_df``, ``pca`` and ``X`` from the enclosing module scope
    (``pca`` is presumably a fitted scikit-learn PCA -- it is only accessed
    through ``components_``, ``explained_variance_ratio_`` and
    ``n_components_``).
    """
    st.title("Principal Component Analysis ‚Äì Are my samples discernibly different? üîç")

    # Toggle between PCA biplot and Scree plot
    view_option = st.radio(
        "Select plot to display:",
        ["PCA Biplot", "Scree Plot"],
        horizontal=True
    )

    # --- PCA Biplot ---
    if view_option == "PCA Biplot":
        st.markdown(
            "<p style='color:black; font-size:18px; font-weight:bold;'>"
            "Placeholder: Explain PCA separation of sample types and how loadings arrows show feature contributions."
            "</p>",
            unsafe_allow_html=True
        )

        fig = go.Figure()

        # Color-coded PCA (red, green, blue)
        fig.add_trace(go.Scatter(
            x=pca_df["PC1"], y=pca_df["PC2"],
            mode="markers",
            marker=dict(size=12),
            text=pca_df["Type"],
            name="Color-coded PCA",
            marker_color=pca_df["Type"].map({
                "Clean Water": "blue",
                "Dirty Water": "red",
                "Treated Water": "green"
            })
        ))

        # Uncoded PCA (all gray).
        # Starts hidden: the dropdown's first button ("Color-coded PCA") is the
        # one shown as active on load, but buttons only change visibility when
        # clicked. Without this, the gray markers would be drawn on top of the
        # coloured ones and hide them until the user used the dropdown.
        fig.add_trace(go.Scatter(
            x=pca_df["PC1"], y=pca_df["PC2"],
            mode="markers",
            marker=dict(size=12, color="gray"),
            text=pca_df["Type"],
            name="Uncoded PCA",
            visible=False
        ))

        # Variance directional arrows (loadings for PC1 and PC2 only).
        # Each arrow runs from the origin to 3x the feature's loading.
        loadings = pca.components_.T[:, :2]
        for i, feature in enumerate(X.columns):
            fig.add_trace(go.Scatter(
                x=[0, loadings[i, 0] * 3],
                y=[0, loadings[i, 1] * 3],
                mode="lines+text",
                line=dict(color="black"),
                text=[None, feature],
                textposition="top center",
                showlegend=False
            ))

        # Dropdown menu at top middle. The visibility masks follow trace order:
        # [color-coded scatter, gray scatter, one arrow per feature...].
        fig.update_layout(
            updatemenus=[
                dict(
                    buttons=list([
                        dict(label="Color-coded PCA",
                             method="update",
                             args=[{"visible": [True, False] + [True] * len(X.columns)},
                                   {"title": "PCA Biplot (Color-coded by Sample Type)"}]),
                        dict(label="Uncoded PCA",
                             method="update",
                             args=[{"visible": [False, True] + [True] * len(X.columns)},
                                   {"title": "PCA Biplot (Uncoded)"}])
                    ]),
                    direction="down",
                    showactive=True,
                    x=0.5, y=1.15,   # üîë top middle
                    xanchor="center", yanchor="top"
                )
            ],
            title="PCA Biplot",
            xaxis_title=f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)",
            yaxis_title=f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)",
            height=1000, width=1200,
            plot_bgcolor="white", paper_bgcolor="white"
        )

        st.plotly_chart(fig, use_container_width=True)

    # --- Scree Plot ---
    elif view_option == "Scree Plot":
        st.markdown(
            "<p style='color:black; font-size:18px; font-weight:bold;'>"
            "Placeholder: Explain how much variance each principal component explains (scree plot)."
            "</p>",
            unsafe_allow_html=True
        )

        # Scree plot of explained variance (scatter + line)
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=[f"PC{i+1}" for i in range(pca.n_components_)],
            y=pca.explained_variance_ratio_ * 100,
            mode="lines+markers",   # üîë scatter with lines
            marker=dict(size=10, color="purple"),
            line=dict(color="purple", width=2),
            name="Variance Explained"
        ))

        fig.update_layout(
            title="Scree Plot ‚Äì Variance Explained by Principal Components",
            xaxis_title="Principal Components",
            yaxis_title="Variance Explained (%)",
            height=900, width=1200,
            plot_bgcolor="white", paper_bgcolor="white"
        )

        st.plotly_chart(fig, use_container_width=True)
def page6():
    """Render the full-spectrum linear regression page.

    Standardizes every column of ``waves_physio`` except the identifier and
    target columns, fits one ``LinearRegression`` for TSS and one for
    turbidity, and displays: R^2 / RMSE, predicted-vs-actual scatter plots,
    the regression equations (via the module's ``print_equation`` helper) and
    bar charts of the fitted coefficients.

    NOTE: the metrics are computed on the same rows the models were fitted on,
    so they describe in-sample fit, not held-out performance.
    """
    st.title("Linear Regression Using All Common Wavelengths üìà")

    # --- Define predictors for full spectrum ---
    X_full_lr = waves_physio.drop(columns=[
        "Sample_ID","Type","%TS (w/w)","TSS (% w/v)","Turbidity (NTU)"
    ])
    y_tss_lr = waves_physio["TSS (% w/v)"]
    y_turb_lr = waves_physio["Turbidity (NTU)"]

    # --- Standardize predictors ---
    scaler_lr = StandardScaler()
    X_scaled_lr = scaler_lr.fit_transform(X_full_lr)

    # --- Fit regression models ---
    model_tss_lr = LinearRegression().fit(X_scaled_lr, y_tss_lr)
    model_turb_lr = LinearRegression().fit(X_scaled_lr, y_turb_lr)

    # --- Predictions ---
    y_tss_pred_lr = model_tss_lr.predict(X_scaled_lr)
    y_turb_pred_lr = model_turb_lr.predict(X_scaled_lr)

    # --- Metrics ---
    tss_r2_lr = r2_score(y_tss_lr, y_tss_pred_lr)
    tss_rmse_lr = np.sqrt(mean_squared_error(y_tss_lr, y_tss_pred_lr))
    turb_r2_lr = r2_score(y_turb_lr, y_turb_pred_lr)
    turb_rmse_lr = np.sqrt(mean_squared_error(y_turb_lr, y_turb_pred_lr))

    st.subheader("Model Performance Metrics")
    st.write(f"TSS R¬≤:   {tss_r2_lr:.3f}")
    st.write(f"TSS RMSE: {tss_rmse_lr:.3f}")
    st.write(f"Turbidity R¬≤:   {turb_r2_lr:.3f}")
    st.write(f"Turbidity RMSE: {turb_rmse_lr:.3f}")

    # --- Predicted vs Actual Plots ---
    # The dashed diagonal marks perfect agreement (predicted == actual).
    fig, axes = plt.subplots(1, 2, figsize=(10,4))
    axes[0].scatter(y_tss_lr, y_tss_pred_lr, color="steelblue", alpha=0.7)
    axes[0].plot([y_tss_lr.min(), y_tss_lr.max()], [y_tss_lr.min(), y_tss_lr.max()], 'k--', lw=1)
    axes[0].set_title("TSS: Predicted vs Actual")
    axes[0].set_xlabel("Actual TSS (% w/v)")
    axes[0].set_ylabel("Predicted TSS")

    axes[1].scatter(y_turb_lr, y_turb_pred_lr, color="darkorange", alpha=0.7)
    axes[1].plot([y_turb_lr.min(), y_turb_lr.max()], [y_turb_lr.min(), y_turb_lr.max()], 'k--', lw=1)
    axes[1].set_title("Turbidity: Predicted vs Actual")
    axes[1].set_xlabel("Actual Turbidity (NTU)")
    axes[1].set_ylabel("Predicted Turbidity")

    st.pyplot(fig)
    # Release the figure; pyplot otherwise keeps it alive across reruns.
    plt.close(fig)

    # --- Regression Equations ---
    eq_tss_lr = print_equation(model_tss_lr, X_full_lr, "TSS (% w/v)")
    eq_turb_lr = print_equation(model_turb_lr, X_full_lr, "Turbidity (NTU)")
    st.subheader("Regression Equations")
    st.markdown(f"**TSS Equation:** {eq_tss_lr}")
    st.markdown(f"**Turbidity Equation:** {eq_turb_lr}")

    # --- Coefficient Plots ---
    coef_df_lr = pd.DataFrame({
        "Wavelength": X_full_lr.columns,
        "TSS_Coefficient": model_tss_lr.coef_,
        "Turbidity_Coefficient": model_turb_lr.coef_
    })

    coefficient_charts = (
        ("TSS_Coefficient", "TSS Regression Coefficients", "steelblue"),
        ("Turbidity_Coefficient", "Turbidity Regression Coefficients", "darkorange"),
    )
    for coef_column, chart_title, bar_color in coefficient_charts:
        coef_fig, coef_ax = plt.subplots(figsize=(10,6))
        coef_ax.bar(coef_df_lr["Wavelength"], coef_df_lr[coef_column], color=bar_color)
        coef_ax.set_title(chart_title)
        coef_ax.set_ylabel("Coefficient Value")
        # Rotate the existing tick labels rather than replacing them with
        # set_xticklabels(): the latter warns when ticks have not been fixed
        # first, and the bar chart already labels each bar with its wavelength.
        coef_ax.tick_params(axis='x', rotation=90)
        st.pyplot(coef_fig)
        plt.close(coef_fig)

def page7():
    """Render the page comparing a five-wavelength model with the full spectrum.

    Fits ``LinearRegression`` models for TSS and turbidity on (a) five
    hard-coded wavelength columns of ``waves_physio`` and (b) every predictor
    column, then shows the module-level ``fig_tss`` / ``fig_turb`` Plotly
    figures, the R^2 / RMSE of all four models, a fixed observations text and
    bar charts of the five-wavelength coefficients.

    NOTE: metrics are computed on the data the models were fitted on
    (in-sample). The numbers quoted in the observations text are hard-coded,
    not derived from the models fitted here.
    """
    st.title("Multivariate Regression Comparison üìä")

    # --- Introductory blurb ---
    st.markdown(
        """
        ### Why Multivariate Regression?
        In this analysis we focused on absorbance values at five wavelengths: **951.46 nm, 957.655 nm, 970.044 nm,
        976.238 nm, and 1186.846 nm**. These were selected because single‚Äëvariable decomposition showed they consistently
        explained the most variance in both TSS and Turbidity, particularly the cluster in the 950‚Äì976 nm region.

        While individual wavelengths can provide useful signals, water quality is influenced by overlapping spectral
        features. **Multivariate regression allows us to combine multiple predictors simultaneously**, capturing
        complementary information and reducing redundancy. By comparing performance against the full spectrum model,
        we can evaluate whether a reduced set of wavelengths is sufficient for accurate prediction, balancing
        interpretability with predictive power.
        """,
        unsafe_allow_html=True
    )

    # --- Define predictors ---
    top_wavelengths = [
        "Wavelength_951.46","Wavelength_957.655",
        "Wavelength_970.044","Wavelength_976.238",
        "Wavelength_1186.846"
    ]
    X_top_comp = waves_physio[top_wavelengths]
    X_full_comp = waves_physio.drop(columns=[
        "Sample_ID","Type","%TS (w/w)","TSS (% w/v)","Turbidity (NTU)"
    ])
    y_tss_comp = waves_physio["TSS (% w/v)"]
    y_turb_comp = waves_physio["Turbidity (NTU)"]

    # --- Standardize predictors ---
    # One scaler per design matrix: refitting a single scaler on the second
    # matrix would silently discard the statistics learned from the first.
    scaler_top = StandardScaler()
    scaler_full = StandardScaler()
    X_top_scaled = scaler_top.fit_transform(X_top_comp)
    X_full_scaled = scaler_full.fit_transform(X_full_comp)

    # --- Fit models ---
    model_tss_top = LinearRegression().fit(X_top_scaled, y_tss_comp)
    model_turb_top = LinearRegression().fit(X_top_scaled, y_turb_comp)
    model_tss_full = LinearRegression().fit(X_full_scaled, y_tss_comp)
    model_turb_full = LinearRegression().fit(X_full_scaled, y_turb_comp)

    # --- Evaluate performance ---
    def evaluate(model, X, y):
        """Return (R^2, RMSE) of ``model`` predictions on ``X`` against ``y``."""
        y_pred = model.predict(X)
        r2 = r2_score(y, y_pred)
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        return r2, rmse

    tss_r2_top, tss_rmse_top = evaluate(model_tss_top, X_top_scaled, y_tss_comp)
    tss_r2_full, tss_rmse_full = evaluate(model_tss_full, X_full_scaled, y_tss_comp)
    turb_r2_top, turb_rmse_top = evaluate(model_turb_top, X_top_scaled, y_turb_comp)
    turb_r2_full, turb_rmse_full = evaluate(model_turb_full, X_full_scaled, y_turb_comp)

    # --- Display single-variable decomposition plots ---
    # fig_tss / fig_turb are figures built elsewhere in the module.
    st.subheader("Single Value Decomposition Results")
    st.plotly_chart(fig_tss, use_container_width=True)
    st.plotly_chart(fig_turb, use_container_width=True)

    # --- Display metrics ---
    st.subheader("Model Performance")
    st.markdown(f"**TSS (Top wavelengths):** R¬≤ = {tss_r2_top:.3f}, RMSE = {tss_rmse_top:.3f}")
    st.markdown(f"**TSS (Full spectrum):** R¬≤ = {tss_r2_full:.3f}, RMSE = {tss_rmse_full:.3f}")
    st.markdown(f"**Turbidity (Top wavelengths):** R¬≤ = {turb_r2_top:.3f}, RMSE = {turb_rmse_top:.3f}")
    st.markdown(f"**Turbidity (Full spectrum):** R¬≤ = {turb_r2_full:.3f}, RMSE = {turb_rmse_full:.3f}")

    # --- Observations blurb with emojis ---
    st.markdown(
        """
        ### ‚úÖ Observations

        üåä **Strongest performers for Turbidity (highest R¬≤):**
        - 951.46 nm ‚Üí Turbidity R¬≤ = 0.492  
        - 957.655 nm ‚Üí Turbidity R¬≤ = 0.491  
        - 970.044 nm ‚Üí Turbidity R¬≤ = 0.488  
        - These three are clustered in the **950‚Äì970 nm region**, suggesting a spectral band that is particularly informative for turbidity.

        üìä **Strongest performers for TSS (highest R¬≤):**
        - 957.655 nm ‚Üí TSS R¬≤ = 0.385  
        - 951.46 nm ‚Üí TSS R¬≤ = 0.384  
        - 970.044 nm ‚Üí TSS R¬≤ = 0.383  
        - Again, the same **950‚Äì970 nm region** dominates.

        ‚ö†Ô∏è **Weaker predictors:**
        - 1267.373 nm and 1453.203 nm have noticeably lower R¬≤ values (especially for turbidity, 0.440 and 0.341).  
        - These wavelengths add less explanatory power individually.
        """,
        unsafe_allow_html=True
    )

    # --- Visualize coefficients for Top wavelengths ---
    st.subheader("Feature Importance (Top Wavelengths)")
    coef_tss = dict(zip(top_wavelengths, model_tss_top.coef_))
    coef_turb = dict(zip(top_wavelengths, model_turb_top.coef_))

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    sns.barplot(x=list(coef_tss.keys()), y=list(coef_tss.values()), ax=axes[0])
    axes[0].set_title("TSS Coefficients")
    axes[0].tick_params(axis='x', rotation=45)

    sns.barplot(x=list(coef_turb.keys()), y=list(coef_turb.values()), ax=axes[1])
    axes[1].set_title("Turbidity Coefficients")
    axes[1].tick_params(axis='x', rotation=45)

    st.pyplot(fig)
    # Release the figure; pyplot otherwise keeps it alive across reruns.
    plt.close(fig)

# Page 8: Prediction Tool
def page8():
    """Render the prediction tool for TSS and turbidity.

    Offers two input routes: one numeric field per predictor column of ``X``
    with a button that predicts a single sample, and an Excel upload that
    predicts every row of the file. Both routes pass the inputs through the
    module-level ``scaler`` and then ``model_tss`` / ``model_turb``.

    An uploaded file must contain every column in ``X.columns``; extra columns
    are ignored for prediction but kept in the displayed results table.
    """
    st.title("Predict TSS and Turbidity from Absorbance üåä")

    # Explanatory text
    st.markdown(
        """
        <p style='color:black; font-size:18px; font-weight:bold;'>
            Because we found that the <span style='color:darkorange; font-weight:bold;'>full spectrum model</span> 
            yielded a higher R¬≤ value, we will use all the common wavelength absorbances to predict TSS and Turbidity.
        </p>
        """,
        unsafe_allow_html=True
    )


    # --- Option 1: Manual Entry ---
    st.subheader("Manual Entry of Absorbance Values")
    user_inputs = {}
    for feature in X.columns:
        user_inputs[feature] = st.number_input(
            f"Absorbance at {feature}",
            value=0.0,
            format="%.3f"
        )

    if st.button("Predict from Manual Entry"):
        # The dict was filled in X.columns order, so the one-row frame has the
        # same column order as the predictors.
        input_df = pd.DataFrame([user_inputs])
        input_scaled = scaler.transform(input_df)
        tss_pred = model_tss.predict(input_scaled)[0]
        turb_pred = model_turb.predict(input_scaled)[0]

        st.write(f"**Predicted TSS (% w/v):** {tss_pred:.3f}")
        st.write(f"**Predicted Turbidity (NTU):** {turb_pred:.3f}")

    # --- Option 2: Upload Excel File ---
    st.subheader("Upload Excel File with Absorbance Values")
    uploaded_file = st.file_uploader("Upload Excel file", type=["xlsx", "xls"])

    if uploaded_file is not None:
        try:
            df_uploaded = pd.read_excel(uploaded_file)
            st.write("Preview of uploaded data:")
            st.dataframe(df_uploaded.head())

            # Check the schema up front so the user is told exactly which
            # columns are missing instead of seeing a raw KeyError message.
            missing_columns = [
                col for col in X.columns if col not in df_uploaded.columns
            ]
            if missing_columns:
                st.error(
                    "Uploaded file is missing required absorbance columns: "
                    + ", ".join(str(col) for col in missing_columns)
                )
            else:
                # Ensure only predictor columns are used, in training order
                df_predictors = df_uploaded[X.columns]

                # Scale inputs
                input_scaled = scaler.transform(df_predictors)

                # Predict TSS and Turbidity
                tss_preds = model_tss.predict(input_scaled)
                turb_preds = model_turb.predict(input_scaled)

                # Combine results
                results_df = df_uploaded.copy()
                results_df["Predicted TSS (% w/v)"] = tss_preds
                results_df["Predicted Turbidity (NTU)"] = turb_preds

                st.subheader("Predicted Results")
                st.dataframe(results_df)

        # Deliberately broad: this is the UI boundary for user-supplied files,
        # and any parsing or prediction failure should be shown to the user
        # rather than crash the page.
        except Exception as e:
            st.error(f"Error processing file: {e}")

# -------------------------
# Main App Navigation
# -------------------------
# The sidebar first picks a section; the overview section then picks one of
# the report pages, while the tool section goes straight to the predictor.
st.sidebar.title("Navigation")

overview_section = "üìö Project Overview"
tool_section = "‚öôÔ∏è App Tool"

section = st.sidebar.radio("Choose section:", [overview_section, tool_section])

if section == overview_section:
    # Page label -> render function, in the order shown in the selectbox.
    page_renderers = {
        "Project Overview": page1,
        "IDA: Initial Data Analysis": page2,
        "EDA: Exploratory Data Analysis": page3,
        "Imputing Missing Values": page4,
        "PCA ‚Äì Are my samples discernibly different?": page5,
        "Linear Regression Using All Common Wavelengths": page6,
        "Multivariate Regression Comparison": page7,
    }

    page = st.sidebar.selectbox("Select a page:", list(page_renderers))

    render_page = page_renderers.get(page)
    if render_page is not None:
        render_page()

elif section == tool_section:
    page8()

Overwriting FinalProj.py


In [54]:
# Print the installed version of each dependency in "name==version" form
# (the format used by requirements files).
# Uses importlib.metadata from the standard library; pkg_resources is
# deprecated by setuptools.
from importlib import metadata

packages = ["streamlit","pandas","numpy","matplotlib","seaborn",
            "scikit-learn","openpyxl","scipy","plotly"]


def installed_version(name):
    """Return the installed version string of distribution *name*.

    Returns None when no such distribution is installed, so one missing
    package does not abort the whole listing.
    """
    try:
        return metadata.version(name)
    except metadata.PackageNotFoundError:
        return None


for pkg in packages:
    pkg_version = installed_version(pkg)
    if pkg_version is None:
        # Emitted as a comment line so the output stays paste-able into a
        # requirements file.
        print(f"# {pkg} is not installed")
    else:
        print(f"{pkg}=={pkg_version}")



streamlit==1.45.1
pandas==2.2.3
numpy==2.1.3
matplotlib==3.10.0
seaborn==0.13.2
scikit-learn==1.6.1
openpyxl==3.1.5
scipy==1.15.3
plotly==5.24.1
