In [28]:
pip install lime streamlit pyngrok prophet



In [51]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: an ngrok authtoken used to be hardcoded in this cell. Treat that
# token as compromised and revoke it in the ngrok dashboard.
# The token is read from the NGROK_AUTHTOKEN environment variable, falling back
# to an interactive prompt, so it is never stored in the notebook source.
ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [52]:
%%writefile app.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
import shap
import pickle
from lime.lime_tabular import LimeTabularExplainer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    confusion_matrix, roc_curve, auc, precision_recall_curve
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet

# --- App setup: title, dataset upload and loading ---
st.title("Interactive Machine Learning Workflow Dashboard")

# Sidebar for File Upload
st.sidebar.title("Upload Your Dataset")
uploaded_file = st.sidebar.file_uploader("Upload a CSV file", type=["csv"])
st.sidebar.markdown("---")

# Load the uploaded CSV, or fall back to the default file next to app.py.
if uploaded_file is not None:
    df = pd.read_csv(uploaded_file, encoding="latin1")
    st.sidebar.success("File uploaded successfully!")
else:
    st.sidebar.info("Using default dataset: House_Listings.csv")
    try:
        df = pd.read_csv("House_Listings.csv", encoding="latin1")
    except FileNotFoundError:
        # Without a dataset nothing below can run; tell the user instead of
        # surfacing a raw traceback, then halt this script run.
        st.error("Default dataset 'House_Listings.csv' was not found. Please upload a CSV file.")
        st.stop()

# A header-only CSV yields an empty frame that would break every plot below.
if df.empty:
    st.error("The dataset contains no rows. Please upload a CSV file with data.")
    st.stop()

# Simulate a Date Column if it doesn't exist (one row per day) so the
# forecasting section always has a time axis to work with.
if "Date" not in df.columns:
    df["Date"] = pd.date_range(start="2023-01-01", periods=len(df), freq="D")
    st.warning("No time series column found. Simulating a 'Date' column starting from 2023-01-01.")

# Cap large datasets at a reproducible 10,000-row random sample.
if df.shape[0] > 10000:
    st.warning("Dataset is large! Using a random sample of 10,000 rows for visualization.")
    df = df.sample(10000, random_state=42)

# Display Dataset
st.subheader("Dataset Overview")
st.write(df.head())

# --- Step 1: EDA ---
st.subheader("Exploratory Data Analysis (EDA)")

# Missing Data Visualization: one cell per (row, column), highlighted when null.
st.write("**Missing Data Heatmap**")
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.isnull(), cbar=False, cmap="viridis", ax=ax)
st.pyplot(fig)
# Streamlit re-executes this script on every interaction; close each figure
# after rendering so pyplot does not keep accumulating them.
plt.close(fig)

# Correlation Matrix
st.write("**Correlation Matrix**")
# Numeric columns only; a numeric 'Date' column (if any) is excluded.
numerical_cols = df.select_dtypes(include=np.number).columns.drop("Date", errors="ignore")
if len(numerical_cols) > 0:
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(df[numerical_cols].corr(), annot=True, cmap="coolwarm", ax=ax)
    ax.set_title("Correlation Matrix")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    st.pyplot(fig)
    plt.close(fig)
else:
    # An empty correlation matrix cannot be drawn as a heatmap.
    st.info("No numerical columns available for a correlation matrix.")

# --- Step 2: Preprocessing ---
st.subheader("Data Preprocessing")
# Text columns are candidates for label encoding. 'Date' is left out: encoding a
# date string into an integer code would break date parsing in the forecasting step.
categorical_cols = df.select_dtypes(include=['object']).columns.drop("Date", errors="ignore")

# Encoding Effects Visualization: plots each categorical column before and after
# label encoding. Note that the encoding itself is only applied when this box is ticked.
if st.checkbox("Show Encoding Effects"):
    for col in categorical_cols:
        st.write(f"**Encoding Effects for {col}**")
        if df[col].nunique() > 20:
            # Keep the count plot readable for high-cardinality columns.
            st.write(f"Too many unique values in **{col}**. Showing only the top 20 categories.")
            subset = df[col].value_counts().nlargest(20).index
            subset_df = df[df[col].isin(subset)]
        else:
            subset_df = df

        fig, ax = plt.subplots(figsize=(12, 8))
        sns.countplot(x=col, data=subset_df, ax=ax, order=subset_df[col].value_counts().index)
        ax.set_title(f"Original Distribution of {col}")
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
        st.pyplot(fig)
        plt.close(fig)  # release the figure; the script reruns on every interaction

        # Encoding: replace the text values with integer codes in df itself.
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.histplot(df[col], bins=20, kde=False, ax=ax)
        ax.set_title(f"Encoded Distribution of {col}")
        st.pyplot(fig)
        plt.close(fig)

# Log Transformation and Scaling
scaler = StandardScaler()
if st.checkbox("Apply Log Transformation and Scaling"):
    if len(numerical_cols) == 0:
        st.info("No numerical columns available to transform.")
    else:
        # log1p yields NaN below -1 and -inf at -1, so only transform columns
        # whose values are all non-negative; the rest are scaled as they are.
        loggable_cols = [col for col in numerical_cols if df[col].min() >= 0]
        skipped_cols = [col for col in numerical_cols if col not in loggable_cols]
        if loggable_cols:
            df[loggable_cols] = np.log1p(df[loggable_cols])
        if skipped_cols:
            st.warning(
                "Log transformation skipped for columns with negative or no valid values: "
                + ", ".join(str(col) for col in skipped_cols)
            )

        # Keep df's index so the scaled values line up row-for-row (df may be a
        # shuffled sample), then write them back so the scaling actually takes effect.
        df_scaled = pd.DataFrame(
            scaler.fit_transform(df[numerical_cols]), columns=numerical_cols, index=df.index
        )
        df[numerical_cols] = df_scaled
        st.write("Log transformation and scaling applied.")

# --- Step 3: Forecasting ---
st.subheader("Time Series Forecasting")

# Pre-select 'Date' when it exists instead of whatever column happens to be first.
time_column_options = list(df.columns)
default_time_index = time_column_options.index("Date") if "Date" in time_column_options else 0
time_series_col = st.sidebar.selectbox("Select Time Series Column", df.columns, index=default_time_index)
target_column = st.sidebar.selectbox("Select Target Column for Forecasting", numerical_cols)

if time_series_col and target_column:
    try:
        # Parse into a local first: writing the coerced result straight into df
        # would turn a wrongly selected column into NaT and corrupt it for the
        # sections below. Numeric columns are rejected because to_datetime would
        # silently read them as epoch offsets; the target cannot be its own time axis.
        if time_series_col == target_column or pd.api.types.is_numeric_dtype(df[time_series_col]):
            parsed_dates = None
        else:
            parsed_dates = pd.to_datetime(df[time_series_col], errors='coerce')

        if parsed_dates is None or parsed_dates.isna().any():
            st.error("Invalid time series column. Please select a valid datetime column.")
        else:
            # Only now commit the parsed dates and move them to the index.
            df[time_series_col] = parsed_dates
            df = df.set_index(time_series_col)

            # Prophet Forecasting: Prophet expects columns named 'ds' (time) and 'y' (value).
            st.write("**Prophet Forecasting**")
            prophet_df = df[[target_column]].reset_index().rename(columns={time_series_col: 'ds', target_column: 'y'})
            prophet_model = Prophet()
            prophet_model.fit(prophet_df)
            future = prophet_model.make_future_dataframe(periods=10)  # 10 periods past the data
            forecast = prophet_model.predict(future)
            st.write("Forecasted Values (Prophet):")
            st.write(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())

            fig = prophet_model.plot(forecast)
            st.pyplot(fig)
            plt.close(fig)  # release the figure; the script reruns on every interaction
    except Exception as e:
        # Report the failure in the UI so the rest of the dashboard still renders.
        st.error(f"Error in forecasting: {e}")

# --- Step 4: Model Evaluation (Classification) ---
st.subheader("Classification Metrics")

# Defined up front so the regression section below can test it even when the
# classification checkbox is not ticked (otherwise it raises NameError).
feature_columns = []

classification_target = st.sidebar.selectbox("Select Target Column for Classification", df.columns)
if classification_target and st.checkbox("Train Classification Model"):
    feature_columns = st.sidebar.multiselect(
        "Select Features for Classification", [col for col in df.columns if col != classification_target]
    )
    if feature_columns:
        X = df[feature_columns]
        y = df[classification_target]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        clf = RandomForestClassifier(random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        st.write("**Confusion Matrix**")
        st.write(cm)

        # ROC and precision-recall curves are only defined here for two classes.
        if len(clf.classes_) == 2:
            # Column 1 of predict_proba corresponds to clf.classes_[1]; name it
            # explicitly so targets that are not coded 0/1 are handled correctly.
            positive_label = clf.classes_[1]
            y_prob = clf.predict_proba(X_test)[:, 1]

            # ROC Curve
            fpr, tpr, _ = roc_curve(y_test, y_prob, pos_label=positive_label)
            roc_auc = auc(fpr, tpr)
            st.write(f"**AUC-ROC Curve (AUC = {roc_auc:.2f})**")
            fig, ax = plt.subplots()
            ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
            ax.plot([0, 1], [0, 1], linestyle="--", color="gray")  # chance-level reference
            ax.set_title("ROC Curve")
            ax.set_xlabel("False Positive Rate")
            ax.set_ylabel("True Positive Rate")
            ax.legend()
            st.pyplot(fig)
            plt.close(fig)  # release the figure; the script reruns on every interaction

            # Precision-Recall Curve
            precision, recall, _ = precision_recall_curve(y_test, y_prob, pos_label=positive_label)
            st.write("**Precision-Recall Curve**")
            fig, ax = plt.subplots()
            ax.plot(recall, precision)
            ax.set_title("Precision-Recall Curve")
            ax.set_xlabel("Recall")
            ax.set_ylabel("Precision")
            st.pyplot(fig)
            plt.close(fig)
        else:
            st.info("ROC and precision-recall curves are only shown for binary targets.")

# --- Step 5: Regression Diagnostics ---
st.subheader("Regression Diagnostics")

# Reuses the features picked for classification, minus the regression target
# itself: predicting a column from itself would make the diagnostics meaningless.
regression_features = [col for col in feature_columns if col != target_column]

if target_column and regression_features:
    X = df[regression_features]
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Residual Plot
    st.write("**Residuals vs Predicted Plot**")
    residuals = y_test - y_pred
    fig, ax = plt.subplots()
    sns.scatterplot(x=y_pred, y=residuals, ax=ax)
    ax.axhline(0, linestyle="--", color="red")  # zero-error reference line
    ax.set_xlabel("Predicted Values")
    ax.set_ylabel("Residuals")
    ax.set_title("Residuals vs Predicted Values")
    st.pyplot(fig)
    plt.close(fig)

    # Q-Q Plot of the residuals with a standardized reference line.
    st.write("**Q-Q Plot**")
    fig = qqplot(residuals, line='s')
    st.pyplot(fig)
    plt.close(fig)


# Footer rendered after all dashboard sections (streamlit is already imported
# at the top of app.py, so it is not imported again here).
# NOTE(review): this looks like a leftover Colab smoke test — confirm whether it should stay.
st.title("Streamlit on Colab!")
st.write("This is a test Streamlit app running on Google Colab.")

Overwriting app.py


In [53]:
import os
import subprocess

from pyngrok import ngrok

# Port shared by the Streamlit server and the ngrok tunnel.
STREAMLIT_PORT = 8501

# Start Streamlit in the background on a fixed port. Pinning the port means the
# server fails loudly if 8501 is taken, instead of moving to another port that
# the tunnel below would not point at.
streamlit_process = subprocess.Popen(
    [
        "streamlit", "run", "app.py",
        "--server.port", str(STREAMLIT_PORT),
        "--server.headless", "true",
    ]
)

# Expose the Streamlit app using pyngrok
public_url = ngrok.connect(STREAMLIT_PORT)
print(f"Streamlit is running on: {public_url}")

Streamlit is running on: NgrokTunnel: "https://238a-35-239-111-165.ngrok-free.app" -> "http://localhost:8501"


In [None]:
# Starts a Streamlit server for app.py as a background shell job.
# NOTE(review): the previous cell already launches app.py and tunnels port 8501;
# the recorded output below shows this instance on port 8502, which the tunnel
# does not reach — this cell looks redundant, consider removing it.
!streamlit run app.py &


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8502[0m
[34m  External URL: [0m[1mhttp://35.239.111.165:8502[0m
[0m
