In [3]:
! pip install streamlit lime pyngrok xgboost shap



In [4]:
%%writefile Home.py
import streamlit as st

# Landing page of the multipage Streamlit app. It configures the browser tab
# and renders a static walkthrough of the workflow implemented by the scripts
# under pages/ (upload -> preprocessing -> train -> predict -> explain).
st.set_page_config(
    page_title="AutoML Application",
    page_icon="ü§ñ",
    layout="centered"
)

st.title("ü§ñ AutoML Application")
st.subheader("Build, Train, Predict & Explain ML Models Automatically")

# The page names below must match the files in pages/; the first page is
# pages/1_Upload.py, so the text refers to it as "Upload".
# NOTE(review): Step 3 advertises hyperparameter tuning, but pages/3_train.py
# only trains and ranks baseline models - confirm whether the text or the
# training page should change.
st.write("""
---

### üì§ **Step 1: Upload Dataset**
Go to the **Upload** page and upload your dataset (CSV / Excel).

The application will:
- Preview your dataset
- Let you select the target (prediction) column
- Allow dropping unnecessary columns (ID, serial number, etc.)

---

### üßπ **Step 2: Automatic Preprocessing**
Once you confirm your selections, the app will automatically:
- Detect **classification or regression**
- Handle missing values
- Encode categorical features
- Remove duplicates and extreme outliers
- Detect **class imbalance**
- Apply **SMOTE automatically (only for training data)** when required

> ‚ö†Ô∏è SMOTE is applied safely to avoid data leakage.

---

### üèãÔ∏è **Step 3: Intelligent Model Training**
The system trains **multiple ML models automatically**, including:
- Logistic / Linear Regression
- Support Vector Machines (SVM)
- KNN
- Decision Tree
- Random Forest
- XGBoost

Each model is evaluated using:
- **ROC-AUC** (Classification)
- **R¬≤ Score** (Regression)

‚úÖ The best baseline model is selected
‚úÖ Hyperparameter tuning is applied **only if it improves performance**
‚úÖ The final best model is stored automatically

---

### üéØ **Step 4: Prediction**
After training:
- Input fields are generated dynamically from your dataset
- Supports both numerical & categorical features
- One-click prediction
- Shows:
  - Predicted class / value
  - Prediction probability (for classification)

You can also:
- Save predictions
- Download prediction history as CSV

---

### üîç **Step 5: Model Explainability (SHAP / LIME)**
Understand **why** the model made a prediction:
- SHAP explanations for supported models
- Automatic fallback when SHAP/LIME is not supported
- Feature impact visualization
- Safe handling for models without probability output

> If a model cannot be explained, the app clearly informs the user.

---

### üéØ **Purpose of This Application**
This AutoML platform is designed to:
- Enable **non-technical users** to build ML models
- Automate the **entire ML pipeline**
- Prevent common ML mistakes (data leakage, imbalance bias)
- Provide **transparent & explainable predictions**
- Work with **any structured dataset**

---

### üöÄ How to Start?
‚û°Ô∏è Upload a dataset from the **Upload** page to begin.
""")

st.success("‚úî Ready to build intelligent ML models without writing code!")


Writing Home.py


In [5]:
!mkdir -p pages

In [6]:
%%writefile pages/1_Upload.py
import streamlit as st
import numpy as np
import zipfile
import os
import kagglehub
from functools import reduce
import pandas as pd
st.set_page_config(page_title="AutoML",page_icon='üëæ',layout='centered')

# A dataset from an earlier run is still held in the session: show it again
# instead of offering the upload widgets.
if "df" in st.session_state:
  st.title("üì• Dataset upload")
  st.success("‚úîÔ∏è File already uploaded")
  cached_name = st.session_state['dataset']
  st.write(f"**Preview: {cached_name}**")
  cached_frame = st.session_state['df']
  st.dataframe(cached_frame.head())
else:
  def dataframe(Ufiles):
    """Load one tabular file into a DataFrame.

    Args:
      Ufiles: a path/URL string or a file-like object (e.g. the Streamlit
        upload) that pandas can read.

    Returns:
      The parsed DataFrame.

    The parser matching the file name is tried first (Excel for Excel
    extensions, CSV otherwise) and the other one serves as fallback. If both
    fail, an error is shown and the script run is halted with st.stop().
    """
    source_name = str(getattr(Ufiles, "name", Ufiles)).lower()
    excel_first = source_name.endswith((".xls", ".xlsx", ".xlsb", ".xlsm", ".ods"))
    readers = (pd.read_excel, pd.read_csv) if excel_first else (pd.read_csv, pd.read_excel)

    for reader in readers:
      try:
        # A previous (failed) read may have consumed part of a file-like
        # object; always start from the beginning.
        if hasattr(Ufiles, "seek"):
          Ufiles.seek(0)
        return reader(Ufiles)
      except Exception:
        continue

    st.error("‚ùå Could not read file. Unsupported format")
    st.stop()

  def Multi_dataframe(path,Ufiles):
    """Read every supported table among `Ufiles` and outer-merge them.

    Args:
      path: directory that contains the files.
      Ufiles: iterable of file names (relative to `path`).

    Returns:
      A single DataFrame: the outer merge of all tables on the columns they
      all share. With exactly one table, that table is returned unchanged.

    Shows an error and halts the script run (st.stop()) when no supported file
    is found or when the tables share no column to merge on.
    """
    supported = (".csv", ".xls", ".xlsx", ".xlsb", ".xlsm", ".ods")
    all_df = []
    for file_name in Ufiles:
      st.write(file_name)
      if file_name.lower().endswith(supported):
        all_df.append(dataframe(os.path.join(path, file_name)))

    if not all_df:
      st.error("No CSV/Excel files were found to load.")
      st.stop()

    # Keep the first table's column order so the merge keys are deterministic.
    df_common = [
      col for col in all_df[0].columns
      if all(col in other.columns for other in all_df[1:])
    ]
    if not df_common:
      st.error("The datasets share no common column, so they cannot be merged.")
      st.stop()

    st.write(f"**üé∞Common features: {', '.join(map(str, df_common))}**")
    df_merged = reduce(lambda df1, df2: pd.merge(df1, df2, on=df_common, how='outer'), all_df)
    return df_merged

  import tempfile  # local import: only this upload flow needs it

  TABLE_EXTENSIONS = ("csv", "xls", "xlsx", "xlsb", "xlsm", "ods")

  def _relative_files(root):
    """List every regular file under `root`, recursively, relative to `root`.

    Hidden files and archive metadata folders (e.g. __MACOSX) are skipped so
    that resource-fork stubs are not mistaken for datasets.
    """
    found = []
    for folder, subfolders, names in os.walk(root):
      subfolders[:] = [d for d in subfolders if not d.startswith((".", "__MACOSX"))]
      for name in names:
        if not name.startswith("."):
          found.append(os.path.relpath(os.path.join(folder, name), root))
    return sorted(found)

  st.title("üì• Upload Dataset or Enter URL")
  uploaded_file = st.file_uploader("Upload CSV/Excel file or ZIP  file", type=["csv", "xls", "xlsx", "xlsb", "xlsm", "ods","zip"])
  st.write("***")
  data_url = st.text_input("Enter URL :",placeholder="Paste a direct CSV/Excel or Kaggle Hub link here")
  df=None

  if uploaded_file is not None:
    # splitext keeps dots inside the name ("sales.2024.csv" -> "sales.2024").
    dataset_name = os.path.splitext(uploaded_file.name)[0]
    if uploaded_file.name.lower().endswith(".zip"):
      st.write("Multiple datasets detected...")
      try:
        # Extract into a fresh temporary folder: a fixed folder would still
        # hold the files of earlier uploads and merge them in as well.
        with tempfile.TemporaryDirectory() as path:
          with zipfile.ZipFile(uploaded_file, "r") as zip_ref:
            zip_ref.extractall(path)
          with st.spinner("Merging all Dataset"):
            df=Multi_dataframe(path,_relative_files(path))
      except zipfile.BadZipFile:
        st.error("The uploaded ZIP archive is corrupt or not a ZIP file.")
        st.stop()
    else:
      df=dataframe(uploaded_file)
      st.success("File uploaded successfully!")
    st.session_state['df']=df
    st.session_state["dataset"]=dataset_name
  elif data_url:
    # Anything that does not end in a table extension is treated as a Kaggle
    # Hub dataset reference.
    if data_url.rsplit('.', 1)[-1].lower() not in TABLE_EXTENSIONS:
      st.write("**Kaggle hub dataset**")
      try:
        path = kagglehub.dataset_download(data_url)
      except Exception as exc:
        st.error(f"Could not download the Kaggle dataset: {exc}")
        st.stop()
      files = _relative_files(path)
      if not files:
        st.error("The downloaded Kaggle dataset contains no files.")
        st.stop()
      if len(files)>1:
        st.write("**üìöMultiple datasets detected**")
        with st.spinner("**üìíMerging all Dataset**"):
          df=Multi_dataframe(path,files)
      else:
        df = dataframe(os.path.join(path, files[0]))
    else:
      df=dataframe(data_url)
    st.session_state["dataset"]=data_url.split('/')[-1]
    st.session_state['df']=df

  if df is not None:
    st.write(f"**Preview: {st.session_state['dataset']}**")
    st.dataframe(st.session_state['df'].head())


Writing pages/1_Upload.py


In [8]:
%%writefile pages/2_preprocessing.py
import streamlit as st
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import LabelEncoder
from collections import Counter

st.set_page_config(page_title="Drop_Test_Predict", page_icon='üëæ', layout='centered')
st.title("ü§ñ Train Model")

# ---------------- CHECK DATA ----------------
# Nothing to preprocess until the upload page has stored a DataFrame.
if "df" not in st.session_state:
    # The upload page is pages/1_Upload.py, so point the user to "Upload".
    st.error("‚ùå No dataset uploaded. Go to Upload page first.")
    st.stop()

# If already preprocessed, only show the stored features/target and halt so
# the selection widgets below are not rendered again.
if "X" in st.session_state and "Y" in st.session_state:
    st.success("‚ú® Preprocessing Complete! Data is ready for training üöÄ")
    st.write("üßÆ Features Preview")
    st.dataframe(st.session_state["X"].head())
    st.write("üéØ Target Preview")
    st.dataframe(st.session_state["Y"].head())
    st.stop()

df = st.session_state["df"]
st.dataframe(df.head())

# ---------------- USER INPUT ----------------
target_col = st.selectbox(
    "üéØ Select Target Column (Prediction Output Column)",
    options=df.columns
)

# The target itself is excluded from the droppable columns.
drop_cols = st.multiselect(
    "üóë Select Columns to Drop (ID / Serial / Irrelevant)",
    options=[c for c in df.columns if c != target_col]
)

# ---------------- HELPER ----------------
def check_imbalance(y, threshold=0.4):
    """Report whether the class labels in `y` are imbalanced.

    Args:
        y: class labels as a Series, a single-column DataFrame or any
            array-like; it is flattened to one dimension.
        threshold: the data counts as imbalanced when the ratio
            (rarest class count / most frequent class count) is below it.

    Returns:
        A tuple ``(is_imbalanced, ratio, distribution)`` where `distribution`
        maps each label to its count. An empty `y` yields
        ``(False, 1.0, {})``.
    """
    # np.ravel copes with DataFrames, Series and scalars alike; squeeze() on a
    # one-row frame would return a bare scalar that Counter cannot iterate.
    counts = Counter(np.ravel(y).tolist())
    if not counts:
        return False, 1.0, {}
    ratio = min(counts.values()) / max(counts.values())
    return ratio < threshold, ratio, dict(counts)

# ---------------- APPLY ----------------
def _is_numeric_feature(series):
    """True for integer/float columns of any width; booleans count as categories."""
    return pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series)


# Runs the whole preprocessing once the user confirms target/drop selections
# and stores the results (X, Y, encoders, problem type, ...) in session_state
# for the training, prediction and explainability pages.
if st.button("Apply Selection"):

    df = df.drop(columns=drop_cols)

    target = df[target_col]

    # Detect problem type: a numeric target with at most 20 distinct values,
    # or any non-numeric target, is treated as classification.
    if _is_numeric_feature(target):
        p_type = "classification" if target.nunique() <= 20 else "regression"
    else:
        p_type = "classification"

    st.session_state["p_type"] = p_type
    st.write(f"Detected problem type: **{p_type.upper()}**")

    with st.spinner("üöÄ Preprocessing dataset..."):

        # Remove duplicates & rows without a target, then renumber the rows so
        # that X and the (re-built) target keep identical indexes.
        df = df.drop_duplicates()
        df = df.dropna(subset=[target_col]).reset_index(drop=True)

        if df.empty:
            st.error("No rows are left after removing duplicates and rows without a target value.")
            st.stop()

        X = df.drop(columns=target_col)
        Y = df[target_col]

        # Drop features that are missing in more than half of the rows.
        sparse_cols = [col for col in X.columns if X[col].isna().mean() > 0.5]
        X = X.drop(columns=sparse_cols)

        # Handle missing values & encoding
        cat_options = {}
        encoders = {}
        for col in X.columns:
            if _is_numeric_feature(X[col]):
                # Assign the result back: chained `X[col].fillna(inplace=True)`
                # is deprecated and has no effect under pandas Copy-on-Write.
                X[col] = X[col].fillna(X[col].median())
                Q1 = X[col].quantile(0.25)
                Q3 = X[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                no_of_outliers = ((X[col] < lower_bound) | (X[col] > upper_bound)).sum()
                # Only clip when outliers are frequent (> 10% of the rows).
                if (no_of_outliers / len(X[col])) > 0.1:
                    X[col] = X[col].clip(lower=lower_bound, upper=upper_bound)
            else:
                X[col] = X[col].fillna(X[col].mode()[0])
                as_text = X[col].astype(str)
                # Options offered on the prediction page: exactly the values
                # the encoder knows (no NaN, no dropped columns).
                cat_options[col] = as_text.unique().tolist()
                le = LabelEncoder()
                X[col] = le.fit_transform(as_text)
                encoders[col] = le

        st.session_state["cat_options"] = cat_options
        st.session_state["encoders"] = encoders

        # Target handling
        if p_type == "classification":
            target_encoder = LabelEncoder()
            Y = pd.DataFrame(
                target_encoder.fit_transform(Y.astype(str)),
                columns=[target_col],
                index=X.index
            )
            st.session_state["target_encoder"] = target_encoder

            # ---- IMBALANCE CHECK ----
            is_imbalanced, ratio, dist = check_imbalance(Y)
            st.session_state["is_imbalanced"] = is_imbalanced
            st.session_state["imbalance_ratio"] = ratio
            st.session_state["class_distribution"] = dist

            if is_imbalanced:
                st.warning(
                    f"‚ö† Dataset is imbalanced (ratio={ratio:.2f}). "
                    "SMOTE will be applied during training."
                )
            else:
                st.success("‚úî Dataset is balanced")

        else:
            # Regression outlier handling: clip the target to the 1.5*IQR fences.
            Q1, Q3 = Y.quantile(0.25), Y.quantile(0.75)
            IQR = Q3 - Q1
            Y = Y.clip(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)

        st.session_state["X"] = X
        st.session_state["Y"] = Y

        st.success("‚ú® Preprocessing Complete!")
        st.dataframe(X.head())
        st.dataframe(Y.head())


Overwriting pages/2_preprocessing.py


In [9]:
%%writefile pages/3_train.py
import streamlit as st
import pandas as pd
import numpy as np
import time

from sklearn.base import clone
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor

from imblearn.over_sampling import SMOTE

# ---------------- STREAMLIT ----------------
st.set_page_config(page_title="Train", page_icon="üèãÔ∏è", layout="centered")
st.title("üîÑ Model Training")

# Both preprocessed artefacts must be present before anything can be trained.
missing_inputs = [key for key in ("X", "Y") if key not in st.session_state]
if missing_inputs:
    st.error("‚ùå Please preprocess the dataset first.")
    st.stop()

# A model from an earlier run is kept in the session: report it and halt
# instead of training again.
already_trained = "best_model" in st.session_state
if already_trained:
    st.success("üèÜ Model already trained")
    trained_name = st.session_state['best_tuned_model_name']
    st.write(f"**{trained_name}**")
    trained_score = st.session_state['best_tuned_score']
    st.write(f"Score: {trained_score:.4f}")
    st.stop()

# ---------------- LOAD DATA ----------------
X = st.session_state["X"]
Y = st.session_state["Y"]
p_type = st.session_state["p_type"]

# Estimators expect a 1-D target; the classification target is stored as a
# single-column DataFrame, so flatten it once here.
y_flat = np.ravel(Y)

# Stratify if classification - but only when every class has at least two
# samples, otherwise train_test_split raises a ValueError.
stratify = None
if p_type == "classification":
    class_counts = np.unique(y_flat, return_counts=True)[1]
    if class_counts.size and class_counts.min() >= 2:
        stratify = y_flat
    else:
        st.warning("Some classes have fewer than two samples; splitting without stratification.")

X_train, X_test, y_train, y_test = train_test_split(
    X, y_flat,
    test_size=0.25,
    random_state=42,
    stratify=stratify
)

# ---------------- SMOTE ----------------
# Oversampling is applied to the training split only, so the test split keeps
# the original class distribution.
if p_type == "classification" and st.session_state.get("is_imbalanced", False):
    min_class = min(st.session_state["class_distribution"].values())
    if min_class >= 10:
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        st.info("‚öñ SMOTE applied on training data")
    else:
        st.warning("‚ö† Too few minority samples ‚Äî SMOTE skipped")

# ---------------- MODELS ----------------
def _with_scaling(estimator):
    """Wrap `estimator` in a pipeline that standardises the features first."""
    return Pipeline([
        ("scaler", StandardScaler()),
        ("model", estimator)
    ])


# Candidate estimators per problem type. Distance/margin based models are
# wrapped with a scaler; tree based models are used as they are.
if p_type != "classification":
    models = {
        "Linear Regression": _with_scaling(LinearRegression()),
        "SVR": _with_scaling(SVR()),
        "KNN Regressor": _with_scaling(KNeighborsRegressor()),
        "Random Forest Regressor": RandomForestRegressor(random_state=42),
        "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
        "XGBoost Regressor": XGBRegressor(random_state=42)
    }
else:
    models = {
        "Logistic Regression": _with_scaling(LogisticRegression(max_iter=1000)),
        "SVM": _with_scaling(SVC(probability=True)),
        "KNN": _with_scaling(KNeighborsClassifier()),
        "Random Forest": RandomForestClassifier(random_state=42, class_weight="balanced"),
        "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight="balanced"),
        "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
    }

# ---------------- BASELINE ----------------
# Fit every candidate on the training split and score it on the test split:
# ROC-AUC for classification, R^2 for regression. A model that fails is
# reported and skipped instead of aborting the whole page.
results = {}
failures = {}

with st.spinner("üîç Training baseline models..."):
    for name, model in models.items():
        try:
            model.fit(X_train, np.ravel(y_train))

            if p_type == "classification":
                y_true = np.ravel(y_test)
                proba = model.predict_proba(X_test)
                # Decide binary vs. multiclass from the model's own output
                # width rather than from the labels present in the test split.
                if proba.shape[1] == 2:
                    score = roc_auc_score(y_true, proba[:, 1])
                else:
                    score = roc_auc_score(y_true, proba, multi_class="ovr")
            else:
                score = r2_score(y_test, model.predict(X_test))
        except Exception as exc:
            failures[name] = str(exc)
            continue

        results[name] = score

for failed_name, reason in failures.items():
    st.warning(f"{failed_name} could not be trained/evaluated and was skipped: {reason}")

if not results:
    st.error("None of the candidate models could be trained on this dataset.")
    st.stop()

results_df = (
    pd.DataFrame(results.items(), columns=["Model", "Score"])
    .sort_values("Score", ascending=False)
    .reset_index(drop=True)
)

st.success("‚úÖ Baseline training complete")
st.dataframe(results_df.head(3))

# ---------------- SELECT BEST ----------------
best_model_name = results_df.iloc[0]["Model"]
best_score = results_df.iloc[0]["Score"]

# The winner was already fitted on the training split by the baseline loop.
# Reuse that instance so the stored model is exactly the one whose score is
# reported: refitting a clone repeats the most expensive step and, for
# estimators without a fixed seed (e.g. SVC with probability estimates), can
# produce a slightly different model.
best_model = models[best_model_name]

# ---------------- SAVE ----------------
st.success("üèÜ Final Best Model Selected")
st.write(f"**{best_model_name}**")
st.write(f"Score: {best_score:.4f}")

# Keys read by the prediction and explainability pages (and by the
# "already trained" guard at the top of this page).
st.session_state["best_model"] = best_model
st.session_state["best_tuned_model_name"] = best_model_name
st.session_state["best_tuned_score"] = best_score



Writing pages/3_train.py


In [10]:
%%writefile pages/4_predict.py
import streamlit as st
import pandas as pd
import numpy as np
import os
st.set_page_config(page_title="Predict", page_icon="üéØ", layout="centered")
st.title("üîÆ Make Prediction")

# A trained model and the processed feature frame are both required.
required_keys = ("best_model", "X")
if any(key not in st.session_state for key in required_keys):
    st.error("‚ùå Model not trained yet. Please go to Train page first.")
    st.stop()

# Pull everything the page needs out of the session in one place.
session = st.session_state
model = session["best_model"]
X_train_cols = session["X"].columns   # processed columns
p_type = session["p_type"]
encoders = session.get("encoders", {})
target_encoder = session.get("target_encoder")
cat_values = session.get("cat_options", {})

st.write("### üìù Enter values for prediction:")

# Create input controls dynamically: a select box for every label-encoded
# (categorical) feature, a number input pre-filled with the training median
# for every other feature.
input_values = {}
for col in X_train_cols:
    if col in encoders:  # categorical
        known = cat_values[col] if col in cat_values else list(encoders[col].classes_)
        # Missing values are not a selectable category.
        options = [opt for opt in known if not pd.isna(opt)]
        input_values[col] = st.selectbox(f"{col}", options=options)
    else:
        input_values[col] = st.number_input(f"{col}", value=float(st.session_state["X"][col].median()))

# Convert input to a one-row dataframe (same column order as the training data)
input_df = pd.DataFrame([input_values])

# Encode categorical values with the encoders fitted during preprocessing
for col in input_df.columns:
    if col in encoders:
        try:
            input_df[col] = encoders[col].transform(input_df[col].astype(str))
        except ValueError:
            # LabelEncoder raises ValueError for labels it has never seen.
            st.warning(f"‚ö† Unknown category in '{col}'. Using default value.")
            input_df[col] = encoders[col].transform([encoders[col].classes_[0]])

# Share the fully encoded row with the explainability page.
st.session_state['input_df']=input_df

# Prediction: run the stored model on the encoded row and remember the result
# (plus class probabilities for classification) in the session.
if st.button("Predict"):
    try:
        prediction = model.predict(input_df)[0]
        st.session_state['prediction'] = prediction  # store prediction
        st.session_state['last_input'] = input_df

        if p_type != "classification":  # Regression
            st.success(f"üìå Predicted Value: **{prediction:.4f}**")
        else:
            proba = model.predict_proba(input_df)[0]
            st.session_state['proba']=proba

            # Map the encoded class id back to its original label when possible.
            prediction_label = (
                target_encoder.inverse_transform([int(prediction)])[0]
                if target_encoder else prediction
            )

            st.success(f"üéØ Predicted Class: **{prediction_label}**")

    except Exception as e:
        st.error(f"‚ùå Prediction failed: {e}")
# ---- PRINT BUTTON ----
# Adds the last prediction to prediction_history.csv and offers the whole
# history for download.
if st.button("Print / Save Result"):
    if "prediction" not in st.session_state:
        st.error("‚ö† Please make a prediction first.")
    else:
        save_row = st.session_state['last_input'].copy()
        save_row["prediction"] = st.session_state['prediction']

        if p_type == "classification" and "proba" in st.session_state:
            save_row["probability"] = float(st.session_state["proba"].max())

        save_file = "prediction_history.csv"

        # Merge with the existing history by column name instead of appending
        # raw lines: a header-less append corrupts the file as soon as the
        # feature columns differ from those already stored (e.g. after
        # training on another dataset).
        if os.path.exists(save_file):
            try:
                history = pd.read_csv(save_file)
            except (pd.errors.EmptyDataError, pd.errors.ParserError):
                history = pd.DataFrame()
            df_saved = pd.concat([history, save_row], ignore_index=True)
        else:
            df_saved = save_row

        df_saved.to_csv(save_file, index=False)

        st.success("üìÅ Prediction saved successfully!")

        # Add download button
        st.download_button(
            label="üì• Download Prediction History",
            data=df_saved.to_csv(index=False),
            file_name="prediction_history.csv",
            mime="text/csv"
        )



Writing pages/4_predict.py


In [11]:
%%writefile pages/5_Shap.py
import streamlit as st
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
from lime.lime_tabular import LimeTabularExplainer

st.set_page_config(page_title="Explainability", page_icon="üîç", layout="centered")
st.title("‚öñÔ∏è Model Explainability (SHAP / LIME)")

# ------------------ VALIDATION ------------------
# The prediction page stores `input_df` as soon as it renders, but
# `prediction` only after the Predict button was pressed - require all keys
# that are read below, otherwise the page would fail with a KeyError.
required_keys = ("input_df", "best_model", "prediction")
if any(key not in st.session_state for key in required_keys):
    st.error("‚ùå Make a prediction first to view explainability.")
    st.stop()

model = st.session_state["best_model"]
input_df = st.session_state["input_df"]
X_train = st.session_state["X"]
Y_train = st.session_state["Y"]
p_type = st.session_state["p_type"]
prediction = st.session_state["prediction"]
target_encoder = st.session_state.get("target_encoder", None)

explained = False   # track whether explanation succeeded

# ======================================================
# TRY SHAP
# ======================================================
# TreeExplainer is attempted first; when it raises (for any reason) the page
# falls through to LIME below.
try:
    st.info("ü™µ Trying SHAP explanation...")

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(input_df)
    expected = np.ravel(explainer.expected_value)
    class_idx = int(prediction) if p_type == "classification" else 0

    # SHAP returns per-class attributions either as a list of 2-D arrays or as
    # one 3-D array (samples, features, classes), depending on version and
    # model; single-output models yield a plain 2-D array.
    if isinstance(shap_values, list):
        values = np.asarray(shap_values[class_idx])[0]
        base = expected[class_idx]
    else:
        shap_array = np.asarray(shap_values)
        if shap_array.ndim == 3:
            values = shap_array[0, :, class_idx]
            base = expected[class_idx] if expected.size > 1 else expected[0]
        else:
            values = shap_array[0]
            base = expected[0]

    explanation = shap.Explanation(
        values=values,
        base_values=float(base),
        data=input_df.iloc[0].values,
        feature_names=input_df.columns.tolist()
    )

    st.success("‚úî SHAP explanation generated")

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        shap.plots.waterfall(explanation, show=False)
        st.pyplot(fig)
    finally:
        # Streamlit reruns the script on every interaction; close the figure
        # so open figures do not pile up in the process.
        plt.close(fig)

    explained = True

except Exception as shap_error:
    st.warning("‚ö† SHAP not supported for this model.")
    st.caption(f"SHAP details: {shap_error}")

# ======================================================
# TRY LIME (ONLY IF SHAP FAILED)
# ======================================================
if not explained:
    try:
        st.info("üçã Trying LIME explanation...")

        is_classification = p_type == "classification"

        if is_classification and not hasattr(model, "predict_proba"):
            raise NotImplementedError("Model has no probability scores")

        feature_names = X_train.columns.tolist()

        def _as_frame(rows):
            # LIME passes bare numpy arrays; the model is fed the same column
            # names it sees on the prediction page.
            return pd.DataFrame(rows, columns=feature_names)

        if is_classification:
            class_names = (
                [str(c) for c in target_encoder.classes_]
                if target_encoder is not None
                else np.unique(Y_train).astype(str)
            )
        else:
            class_names = None

        lime_explainer = LimeTabularExplainer(
            training_data=X_train.values,
            feature_names=feature_names,
            class_names=class_names,
            # The explainer defaults to classification; regression models
            # must be declared explicitly or explain_instance fails.
            mode="classification" if is_classification else "regression",
            discretize_continuous=True,
            random_state=42
        )

        instance = input_df.iloc[0].to_numpy(dtype=float)

        if is_classification:
            pred_idx = int(prediction)

            label_name = (
                target_encoder.inverse_transform([pred_idx])[0]
                if target_encoder else pred_idx
            )

            # Explain the predicted class explicitly: the top-probability
            # label is not guaranteed to equal the stored prediction.
            lime_exp = lime_explainer.explain_instance(
                instance,
                lambda rows: model.predict_proba(_as_frame(rows)),
                num_features=10,
                labels=(pred_idx,)
            )

            st.success(f"üéØ LIME explanation for class: **{label_name}**")
            lime_fig = lime_exp.as_pyplot_figure(label=pred_idx)

        else:
            lime_exp = lime_explainer.explain_instance(
                instance,
                lambda rows: model.predict(_as_frame(rows)),
                num_features=10
            )

            st.success("üìà LIME explanation (Regression)")
            lime_fig = lime_exp.as_pyplot_figure()

        st.pyplot(lime_fig)
        plt.close(lime_fig)

        explained = True

    except Exception as lime_error:
        # Surface the reason instead of failing silently.
        st.warning(f"LIME explanation failed: {lime_error}")

# ======================================================
# FINAL FALLBACK
# ======================================================
# Neither explainer produced a figure: tell the user why that can happen.
if explained is False or not explained:
    failure_notice = (
        "‚ùå This model cannot be explained using SHAP or LIME.\n\n"
        "Reason:\n"
        "- The selected model does not support probability outputs\n"
        "- SHAP TreeExplainer is not compatible\n\n"
    )
    st.error(failure_notice)


Writing pages/5_Shap.py


In [12]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never commit the ngrok authtoken to the notebook: read it from the
# NGROK_AUTHTOKEN environment variable or ask for it interactively.
# NOTE(review): the token that was previously hard-coded in this cell should
# be treated as leaked and revoked in the ngrok dashboard.
NGROK_AUTHTOKEN = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(NGROK_AUTHTOKEN)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [13]:
from pyngrok import ngrok

# Single source of truth for the port shared by Streamlit and the tunnel.
STREAMLIT_PORT = 8502

# Kill any previous tunnels
ngrok.kill()

# Start Streamlit in the background on STREAMLIT_PORT
get_ipython().system_raw(f'streamlit run Home.py --server.port {STREAMLIT_PORT} &')

# Expose the local Streamlit server through a public ngrok URL.
public_url = ngrok.connect(STREAMLIT_PORT)
print("Click the public URL to open your app üëá")
print(public_url)

Click the public URL to open your app üëá
NgrokTunnel: "https://shella-unmilitary-laically.ngrok-free.dev" -> "http://localhost:8502"
