<a href="https://colab.research.google.com/github/MarMarhoun/freelance_work/blob/main/side_projects/NLP_projs/LLMs_with_Gradio/EDA_gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gradio EDA Dashboard

In [1]:
!pip install pandas matplotlib seaborn gradio

Collecting gradio
  Downloading gradio-5.23.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
import io
import base64

# Function to get data from the provided URL
def get_data(url: str) -> pd.DataFrame:
    """Load a CSV dataset into a DataFrame.

    Args:
        url: Location of the CSV file (URL or local path); handed to
            ``pd.read_csv``. Surrounding whitespace is stripped from strings.

    Returns:
        The parsed DataFrame.

    Raises:
        ValueError: If ``url`` is missing/blank or the CSV cannot be read.
            The underlying exception is chained as ``__cause__``.
    """
    if url is None or (isinstance(url, str) and not url.strip()):
        raise ValueError("Error loading data: no dataset URL was provided.")

    # Pasted URLs frequently carry stray leading/trailing whitespace.
    source = url.strip() if isinstance(url, str) else url
    try:
        return pd.read_csv(source)
    except Exception as e:
        # Chain the original exception so the real cause stays in the traceback.
        raise ValueError(f"Error loading data: {str(e)}") from e

# Function to preprocess the dataset
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop incomplete rows and one-hot encode object-typed columns.

    Args:
        df: Input DataFrame.

    Returns:
        A new DataFrame without rows containing missing values, in which every
        ``object`` column is replaced by dummy columns (first level dropped).
    """
    # Rows with any missing value are removed rather than imputed.
    complete_rows = df.dropna()

    # Only object-dtype columns are treated as categorical.
    object_columns = complete_rows.select_dtypes(include=['object']).columns

    return pd.get_dummies(complete_rows, columns=object_columns, drop_first=True)

# Function to get summary statistics
def summary_statistics(df):
    """Return the descriptive statistics table produced by ``df.describe()``."""
    stats_table = df.describe()
    return stats_table

# Function to plot distribution of a specific column
def plot_distribution(df, column):
    """Render a histogram with a KDE overlay for one column as a base64 PNG.

    Args:
        df: DataFrame holding the data.
        column: Name of the column in ``df`` to plot.

    Returns:
        The PNG image as a base64-encoded string (no ``data:`` URI prefix).
    """
    # Work on an explicit figure/axes instead of pyplot's implicit current
    # figure, and always close it so a failure does not leak the figure.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.histplot(df[column], kde=True, ax=ax)
        ax.set_title(f'Histogram of {column}')

        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        plt.close(fig)

    return base64.b64encode(buf.getvalue()).decode()

# Function to plot correlation matrix
def plot_correlation_matrix(df):
    """Render an annotated correlation heatmap as a base64 PNG.

    Only numeric and boolean columns take part in the correlation, so a frame
    that still contains text columns does not make ``corr()`` fail.

    Args:
        df: DataFrame to correlate.

    Returns:
        The PNG image as a base64-encoded string (no ``data:`` URI prefix).

    Raises:
        ValueError: If ``df`` has no numeric or boolean columns.
    """
    numeric_df = df.select_dtypes(include=["number", "bool"])
    if numeric_df.shape[1] == 0:
        raise ValueError("No numeric columns available to compute a correlation matrix.")

    # Explicit figure/axes; closed in `finally` so errors do not leak figures.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
        ax.set_title('Correlation Matrix')

        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        plt.close(fig)

    return base64.b64encode(buf.getvalue()).decode()

# Main EDA function
def eda_dashboard(url):
    """Load a dataset and produce its summary table and plots.

    Args:
        url: Location of the CSV file, forwarded to ``get_data``.

    Returns:
        A 3-tuple ``(basic_stats, loan_distribution_image, correlation_image)``:
        the ``describe()`` table of the preprocessed data, the encoded 'loan'
        distribution plot (``None`` when there is no 'loan' column) and the
        encoded correlation heatmap. On any failure the tuple is
        ``(error_message, None, None)`` with the message as a string.
    """
    try:
        raw_data = get_data(url)

        # Plot 'loan' from the rows *before* one-hot encoding:
        # preprocess_data() replaces object columns with dummy columns, so a
        # categorical 'loan' column no longer exists under that name afterwards
        # and the lookup would never match.
        complete_rows = raw_data.dropna()
        if 'loan' in complete_rows.columns:
            loan_distribution_image = plot_distribution(complete_rows, 'loan')
        else:
            loan_distribution_image = None

        # Preprocess the data (drops incomplete rows, encodes categoricals)
        data = preprocess_data(raw_data)

        # Summary statistics and correlation use the fully numeric frame
        basic_stats = summary_statistics(data)
        correlation_image = plot_correlation_matrix(data)

        return basic_stats, loan_distribution_image, correlation_image
    except Exception as e:
        # Failures are reported through the first tuple slot instead of raising.
        return str(e), None, None

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align: center;'>Enhanced EDA Dashboard with Gradio</h1>")

    url_input = gr.Textbox(label="Enter Dataset URL", placeholder="Paste your CSV URL here...")
    load_data_button = gr.Button("Load Data")

    # Outputs
    basic_stats_output = gr.Dataframe(label="Basic Statistics")
    plot_selection = gr.Radio(label="Select Plot", choices=["Loan Distribution", "Correlation Matrix"], value="Loan Distribution")
    plot_output = gr.Image(label="Selected Plot")

    # Per-session storage for both rendered plots. State components are only
    # updated by returning values into them from an event handler; assigning
    # to `.value` inside a handler does not change what later events receive.
    loan_distribution_image_state = gr.State(None)
    correlation_image_state = gr.State(None)

    def decode_plot(encoded):
        """Turn a base64 PNG (optionally a ``data:`` URI) into a uint8 pixel array.

        gr.Image treats a string as a file path, so the encoded plots have to be
        decoded before they can be displayed. ``None``/empty input yields ``None``.
        """
        if not encoded:
            return None
        if not isinstance(encoded, str):
            return encoded  # already an image object
        payload = encoded.split(",", 1)[1] if encoded.startswith("data:") else encoded
        pixels = plt.imread(io.BytesIO(base64.b64decode(payload)), format="png")
        if pixels.dtype.kind == "f":
            # Float pixel values are in the 0-1 range; scale to 8-bit.
            pixels = (pixels * 255).round().astype("uint8")
        return pixels

    # Pick the image matching the radio selection
    def update_plot(selected_plot, loan_image, correlation_image):
        if selected_plot == "Loan Distribution":
            return loan_image
        elif selected_plot == "Correlation Matrix":
            return correlation_image
        else:
            return None

    # Load the dataset, refresh the statistics table, the visible plot and the
    # stored plots. Registered exactly once on the button.
    def update_outputs(url, selected_plot):
        basic_stats, loan_distribution_image, correlation_image = eda_dashboard(url)
        if isinstance(basic_stats, str):
            # eda_dashboard reports failures as an error string in the first slot.
            raise gr.Error(basic_stats)

        loan_image = decode_plot(loan_distribution_image)
        corr_image = decode_plot(correlation_image)
        # Move the statistic names (count, mean, ...) out of the index so they
        # show up as a regular column in the table.
        stats_table = basic_stats.rename_axis("statistic").reset_index()
        return (
            stats_table,
            update_plot(selected_plot, loan_image, corr_image),
            loan_image,
            corr_image,
        )

    load_data_button.click(
        fn=update_outputs,
        inputs=[url_input, plot_selection],
        outputs=[basic_stats_output, plot_output, loan_distribution_image_state, correlation_image_state],
    )

    # Update plot based on selection
    plot_selection.change(fn=update_plot,
                          inputs=[plot_selection, loan_distribution_image_state, correlation_image_state],  # Use state variables
                          outputs=plot_output)

    # Example URLs
    gr.Markdown("### Example URLs:")
    gr.Markdown("* Bank Marketing Dataset: https://raw.githubusercontent.com/Lexie88rus/bank-marketing-analysis/master/bank.csv")
    gr.Markdown("* Titanic Dataset: https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
    gr.Markdown("* Wine Quality Dataset: https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/wine_quality.csv")
    gr.Markdown("* Airbnb Dataset: https://raw.githubusercontent.com/insideairbnb/insideairbnb/master/data/listings.csv")
    gr.Markdown("* IMDB Dataset: https://raw.githubusercontent.com/IMDb-Data-Scraper/IMDb-Data-Scraper/master/data/movies.csv")

# Launch the Gradio app
demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://59287a0c7cac34b069.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Dataset: https://raw.githubusercontent.com/Lexie88rus/bank-marketing-analysis/master/bank.csv


In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
import io
import base64
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Function to get data from the URL or uploaded file
def get_data(uploaded_file=None) -> pd.DataFrame:
    """Load the uploaded dataset, or the default bank-marketing CSV.

    Args:
        uploaded_file: ``None`` to load the default dataset; otherwise a path
            (string or path-like object) or an object exposing the path as
            ``.name``. ``.csv`` files are read comma-separated, ``.txt`` files
            tab-separated; the extension check ignores case.

    Returns:
        The parsed DataFrame.

    Raises:
        ValueError: If the extension is unsupported or reading fails. The
            underlying exception is chained as ``__cause__``.
    """
    try:
        if uploaded_file is not None:
            # Accept both a plain path and a file wrapper carrying `.name`.
            if isinstance(uploaded_file, str) or hasattr(uploaded_file, "__fspath__"):
                file_path = str(uploaded_file)
            else:
                file_path = str(uploaded_file.name)

            lowered = file_path.lower()
            if lowered.endswith('.csv'):
                return pd.read_csv(file_path)
            elif lowered.endswith('.txt'):
                return pd.read_csv(file_path, sep="\t")
            else:
                raise ValueError("Unsupported File Format. Please upload a csv or txt file.")
        else:
            return pd.read_csv("https://raw.githubusercontent.com/Lexie88rus/bank-marketing-analysis/master/bank.csv")
    except Exception as e:
        # Chain the original exception so the real cause stays in the traceback.
        raise ValueError(f"Error loading data: {str(e)}") from e

# Function to get summary statistics
def summary_statistics(df):
    """Compute the ``describe()`` summary table for ``df``."""
    described = df.describe()
    return described

# Function to check for missing values
def missing_values(df):
    """Return the number of missing entries in each column of ``df``."""
    null_mask = df.isnull()
    per_column_counts = null_mask.sum()
    return per_column_counts

# Function to plot distribution
def plot_distribution(df, column):
    """Render a histogram with a KDE overlay for one column as a PNG data URI.

    Args:
        df: DataFrame holding the data.
        column: Name of the column in ``df`` to plot.

    Returns:
        A ``data:image/png;base64,...`` string containing the rendered plot.
    """
    # Explicit figure/axes instead of pyplot's implicit current figure; the
    # figure is closed in `finally` so a failing plot does not leak it.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.histplot(df[column], kde=True, ax=ax)
        ax.set_title(f'Histogram of {column}')

        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        plt.close(fig)

    img_str = base64.b64encode(buf.getvalue()).decode()
    return f"data:image/png;base64,{img_str}"

# Function to plot correlation matrix
def plot_correlation_matrix(df):
    """Render an annotated correlation heatmap as a PNG data URI.

    Only numeric and boolean columns are correlated; text columns are left out
    so that ``corr()`` does not fail on them.

    Args:
        df: DataFrame to correlate.

    Returns:
        A ``data:image/png;base64,...`` string containing the rendered plot.

    Raises:
        ValueError: If ``df`` has no numeric or boolean columns.
    """
    numeric_df = df.select_dtypes(include=["number", "bool"])
    if numeric_df.shape[1] == 0:
        raise ValueError("No numeric columns available to compute a correlation matrix.")

    # Explicit figure/axes; closed in `finally` so errors do not leak figures.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
        ax.set_title('Correlation Matrix')

        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        plt.close(fig)

    img_str = base64.b64encode(buf.getvalue()).decode()
    return f"data:image/png;base64,{img_str}"

# Function to train models and return accuracies
def train_models(X_train, y_train, X_test, y_test, selected_models, random_state=42):
    """Fit the selected classifiers and collect their test-set metrics.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for evaluation.
        selected_models: Iterable of model display names to train.
        random_state: Seed handed to the tree-based estimators so repeated
            runs give the same result.

    Returns:
        Dict mapping each selected model name to a dict with the keys
        ``"accuracy"``, ``"confusion_matrix"`` (nested list) and
        ``"classification_report"`` (dict).

    Raises:
        KeyError: If ``selected_models`` contains a name that is not supported.
    """
    # Factories instead of instances: only the requested estimators are built.
    model_factories = {
        "Logistic Regression": lambda: LogisticRegression(),
        "Decision Tree": lambda: DecisionTreeClassifier(random_state=random_state),
        "Random Forest": lambda: RandomForestClassifier(random_state=random_state),
        "K-Nearest Neighbors": lambda: KNeighborsClassifier(n_neighbors=5),
        "Support Vector Machine": lambda: SVC(gamma='auto')
    }

    requested = list(selected_models or [])
    unknown = [name for name in requested if name not in model_factories]
    if unknown:
        # Fail before any training with a message that lists the valid names.
        raise KeyError(f"Unknown model name(s): {unknown}. Available: {sorted(model_factories)}")

    accuracies = {}
    for name in requested:
        model = model_factories[name]()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracies[name] = {
            "accuracy": accuracy_score(y_test, y_pred),
            "confusion_matrix": confusion_matrix(y_test, y_pred).tolist(),
            "classification_report": classification_report(y_test, y_pred, output_dict=True)
        }

    return accuracies

# Main EDA function
def eda_dashboard(uploaded_file, selected_models, target_column='target_column', histogram_column='column_name'):
    """Run the EDA and model-training pipeline for one dataset.

    Args:
        uploaded_file: Forwarded to ``get_data`` (``None`` loads the default).
        selected_models: Model names forwarded to ``train_models``.
        target_column: Name of the label column used for training. When it is
            not present, training is skipped and a message is returned instead.
        histogram_column: Column to draw the histogram for. When it is not
            present, the first numeric column is used; with no numeric column
            at all the histogram is ``None``.

    Returns:
        Dict with the keys "Dataset Preview", "Basic Statistics" (both lists of
        row records), "Dropped Rows", "Model Accuracies", "Histogram" and
        "Correlation Matrix".

    Side effects:
        Rebinds the module-level ``data`` to the cleaned DataFrame.
    """
    global data
    raw_data = get_data(uploaded_file)

    # Display dataset details
    dataset_preview = raw_data.head().to_dict(orient="records")
    # Keep the statistic names (count, mean, ...) as a column; row records
    # would otherwise lose them together with the index.
    basic_stats = (
        summary_statistics(raw_data)
        .rename_axis("statistic")
        .reset_index()
        .to_dict(orient="records")
    )

    # Data cleaning
    cleaned = raw_data.dropna()
    dropped_rows = raw_data.shape[0] - cleaned.shape[0]
    data = cleaned

    # Prepare data for machine learning
    if target_column in cleaned.columns:
        # One-hot encode text features so the estimators get numeric input.
        X = pd.get_dummies(cleaned.drop(target_column, axis=1), drop_first=True)
        y = cleaned[target_column]

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train the models and get accuracies
        accuracies = train_models(X_train, y_train, X_test, y_test, selected_models)
    else:
        accuracies = "Target column not found for model training."

    # Generate visualizations
    if histogram_column not in cleaned.columns:
        # Fall back to the first numeric column instead of failing on a
        # column name that does not exist in this dataset.
        numeric_columns = cleaned.select_dtypes(include="number").columns
        histogram_column = numeric_columns[0] if len(numeric_columns) else None
    histogram_image = (
        plot_distribution(cleaned, histogram_column) if histogram_column is not None else None
    )
    correlation_image = plot_correlation_matrix(cleaned)

    return {
        "Dataset Preview": dataset_preview,
        "Basic Statistics": basic_stats,
        "Dropped Rows": dropped_rows,
        "Model Accuracies": accuracies,
        "Histogram": histogram_image,
        "Correlation Matrix": correlation_image
    }

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Enhanced EDA Dashboard with Machine Learning")

    # File uploader for custom dataset (extensions need their leading dot)
    uploaded_file = gr.File(label="Upload a custom dataset (CSV or TXT)", file_types=[".csv", ".txt"])

    # Model selection
    model_selection = gr.CheckboxGroup(
        label="Select Machine Learning Models to Train",
        choices=["Logistic Regression", "Decision Tree", "Random Forest", "K-Nearest Neighbors", "Support Vector Machine"],
        value=["Logistic Regression"]  # Default selection
    )

    # Outputs
    dataset_preview_output = gr.Dataframe(label="Dataset Preview", interactive=False)
    basic_stats_output = gr.Dataframe(label="Basic Statistics")
    dropped_rows_output = gr.Textbox(label="Dropped Rows")
    model_accuracies_output = gr.JSON(label="Model Accuracies")
    histogram_output = gr.Image(label="Histogram")
    correlation_output = gr.Image(label="Correlation Matrix")

    def decode_plot(encoded):
        """Turn a base64 PNG (optionally a ``data:`` URI) into a uint8 pixel array.

        gr.Image treats a string as a file path, so the encoded plots have to be
        decoded before they can be displayed. ``None``/empty input yields ``None``.
        """
        if not encoded:
            return None
        if not isinstance(encoded, str):
            return encoded  # already an image object
        payload = encoded.split(",", 1)[1] if encoded.startswith("data:") else encoded
        pixels = plt.imread(io.BytesIO(base64.b64decode(payload)), format="png")
        if pixels.dtype.kind == "f":
            # Float pixel values are in the 0-1 range; scale to 8-bit.
            pixels = (pixels * 255).round().astype("uint8")
        return pixels

    # Update outputs when a file is uploaded
    def update_outputs(uploaded_file, selected_models):
        try:
            results = eda_dashboard(uploaded_file, selected_models)
        except Exception as exc:
            # Show the failure reason in the UI instead of a generic error.
            raise gr.Error(str(exc)) from exc

        accuracies = results["Model Accuracies"]
        if isinstance(accuracies, str):
            # gr.JSON parses strings as JSON text; wrap plain messages.
            accuracies = {"message": accuracies}

        return (
            pd.DataFrame(results["Dataset Preview"]),   # records -> table
            pd.DataFrame(results["Basic Statistics"]),  # records -> table
            str(results["Dropped Rows"]),
            accuracies,
            decode_plot(results["Histogram"]),
            decode_plot(results["Correlation Matrix"])
        )

    uploaded_file.change(fn=update_outputs, inputs=[uploaded_file, model_selection], outputs=[
        dataset_preview_output, basic_stats_output, dropped_rows_output, model_accuracies_output, histogram_output, correlation_output
    ])

# Launch the Gradio app
demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b943d8cb3ba3e5e4e7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
import io
import base64
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Function to get data from the URL
def get_data(url: str = "https://raw.githubusercontent.com/Lexie88rus/bank-marketing-analysis/master/bank.csv") -> pd.DataFrame:
    """Load a CSV dataset into a DataFrame.

    Args:
        url: Location of the CSV file (URL or local path). Defaults to the
            bank-marketing dataset used throughout this notebook.

    Returns:
        The parsed DataFrame.

    Raises:
        ValueError: If the CSV cannot be read. The underlying exception is
            chained as ``__cause__``.
    """
    try:
        return pd.read_csv(url)
    except Exception as e:
        # Chain the original exception so the real cause stays in the traceback.
        raise ValueError(f"Error loading data: {str(e)}") from e

# Function to get summary statistics
def summary_statistics(df):
    """Summarise ``df`` with pandas' ``describe()`` and return the table."""
    summary = df.describe()
    return summary

# Function to plot distribution
def plot_distribution(df, column):
    """Render a histogram with a KDE overlay for one column as a PNG data URI.

    Args:
        df: DataFrame holding the data.
        column: Name of the column in ``df`` to plot.

    Returns:
        A ``data:image/png;base64,...`` string containing the rendered plot.
    """
    # Explicit figure/axes instead of pyplot's implicit current figure; the
    # figure is closed in `finally` so a failing plot does not leak it.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.histplot(df[column], kde=True, ax=ax)
        ax.set_title(f'Histogram of {column}')

        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        plt.close(fig)

    img_str = base64.b64encode(buf.getvalue()).decode()
    return f"data:image/png;base64,{img_str}"

# Function to plot correlation matrix
def plot_correlation_matrix(df):
    """Render an annotated correlation heatmap as a PNG data URI.

    Only numeric and boolean columns are correlated; text columns are left out
    so that ``corr()`` does not fail on them.

    Args:
        df: DataFrame to correlate.

    Returns:
        A ``data:image/png;base64,...`` string containing the rendered plot.

    Raises:
        ValueError: If ``df`` has no numeric or boolean columns.
    """
    numeric_df = df.select_dtypes(include=["number", "bool"])
    if numeric_df.shape[1] == 0:
        raise ValueError("No numeric columns available to compute a correlation matrix.")

    # Explicit figure/axes; closed in `finally` so errors do not leak figures.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
        ax.set_title('Correlation Matrix')

        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        plt.close(fig)

    img_str = base64.b64encode(buf.getvalue()).decode()
    return f"data:image/png;base64,{img_str}"

# Function to train models and return accuracies
def train_models(X_train, y_train, X_test, y_test, selected_models, random_state=42):
    """Fit the selected classifiers and collect their test-set metrics.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for evaluation.
        selected_models: Iterable of model display names to train.
        random_state: Seed handed to the tree-based estimators so repeated
            runs give the same result.

    Returns:
        Dict mapping each selected model name to a dict with the keys
        ``"accuracy"``, ``"confusion_matrix"`` (nested list) and
        ``"classification_report"`` (dict).

    Raises:
        KeyError: If ``selected_models`` contains a name that is not supported.
    """
    # Factories instead of instances: only the requested estimators are built.
    model_factories = {
        "Logistic Regression": lambda: LogisticRegression(),
        "Decision Tree": lambda: DecisionTreeClassifier(random_state=random_state),
        "Random Forest": lambda: RandomForestClassifier(random_state=random_state),
        "K-Nearest Neighbors": lambda: KNeighborsClassifier(n_neighbors=5),
        "Support Vector Machine": lambda: SVC(gamma='auto')
    }

    requested = list(selected_models or [])
    unknown = [name for name in requested if name not in model_factories]
    if unknown:
        # Fail before any training with a message that lists the valid names.
        raise KeyError(f"Unknown model name(s): {unknown}. Available: {sorted(model_factories)}")

    accuracies = {}
    for name in requested:
        model = model_factories[name]()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracies[name] = {
            "accuracy": accuracy_score(y_test, y_pred),
            "confusion_matrix": confusion_matrix(y_test, y_pred).tolist(),
            "classification_report": classification_report(y_test, y_pred, output_dict=True)
        }

    return accuracies

# Main EDA function
def eda_dashboard(selected_models, selected_column, target_column="deposit"):
    """Run the EDA and model-training pipeline on the bank-marketing dataset.

    Args:
        selected_models: Model names forwarded to ``train_models``.
        selected_column: Column to draw the histogram for.
        target_column: Name of the label column. If it is absent, a column
            named 'y' is used when present; otherwise training is skipped and
            a message is returned instead of metrics.

    Returns:
        Dict with the keys "Dataset Preview", "Basic Statistics" (both lists of
        row records), "Dropped Rows", "Model Accuracies", "Histogram" and
        "Correlation Matrix".

    Side effects:
        Rebinds the module-level ``data`` to the cleaned DataFrame.
    """
    global data
    raw_data = get_data()

    # Display dataset details
    dataset_preview = raw_data.head().to_dict(orient="records")  # First few rows of the dataset
    # Keep the statistic names (count, mean, ...) as a column; row records
    # would otherwise lose them together with the index.
    basic_stats = (
        summary_statistics(raw_data)
        .rename_axis("statistic")
        .reset_index()
        .to_dict(orient="records")
    )

    # Data cleaning
    cleaned = raw_data.dropna()
    dropped_rows = raw_data.shape[0] - cleaned.shape[0]
    data = cleaned

    # Prepare data for machine learning. The dataset loaded in this notebook
    # labels its rows in 'deposit'; 'y' is still accepted as a fallback name.
    target = next((name for name in (target_column, 'y') if name in cleaned.columns), None)
    if target is not None:
        # One-hot encode text features so the estimators get numeric input.
        X = pd.get_dummies(cleaned.drop(target, axis=1), drop_first=True)
        y = cleaned[target]

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train the models and get accuracies
        accuracies = train_models(X_train, y_train, X_test, y_test, selected_models)
    else:
        accuracies = "Target column not found for model training."

    # Generate visualizations
    histogram_image = plot_distribution(cleaned, selected_column)  # Use the selected column for the histogram
    correlation_image = plot_correlation_matrix(cleaned)

    return {
        "Dataset Preview": dataset_preview,
        "Basic Statistics": basic_stats,
        "Dropped Rows": dropped_rows,
        "Model Accuracies": accuracies,
        "Histogram": histogram_image,
        "Correlation Matrix": correlation_image
    }

# Create Gradio interface
# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# EDA Dashboard with Machine Learning & Gradio", elem_id="title")
    gr.HTML("<style>#title {text-align: center;}</style>")

    # Sidebar for user inputs
    with gr.Row():
        with gr.Column():
            selected_models = gr.CheckboxGroup(
                label="Select Models to Train",
                choices=["Logistic Regression", "Decision Tree", "Random Forest", "K-Nearest Neighbors", "Support Vector Machine"],
                value=["Logistic Regression", "Decision Tree"]
            )
            selected_column = gr.Dropdown(
                label="Select Column for Histogram",
                choices=["age", "job", "marital", "education", "default", "balance", "housing", "loan", "contact", "day", "month", "duration", "campaign", "pdays", "previous", "poutcome"],  # Populate with actual column names
                value="age"  # Default to the first column
            )
            load_data_button = gr.Button("Load Data")

    # Outputs
    dataset_preview_output = gr.Dataframe(label="Dataset Preview", interactive=False)
    basic_stats_output = gr.Dataframe(label="Basic Statistics")
    dropped_rows_output = gr.Textbox(label="Dropped Rows")
    model_accuracies_output = gr.JSON(label="Model Accuracies")
    histogram_output = gr.Image(label="Histogram")
    correlation_output = gr.Image(label="Correlation Matrix")

    def decode_plot(encoded):
        """Turn a base64 PNG (optionally a ``data:`` URI) into a uint8 pixel array.

        gr.Image treats a string as a file path, so the encoded plots have to be
        decoded before they can be displayed. ``None``/empty input yields ``None``.
        """
        if not encoded:
            return None
        if not isinstance(encoded, str):
            return encoded  # already an image object
        payload = encoded.split(",", 1)[1] if encoded.startswith("data:") else encoded
        pixels = plt.imread(io.BytesIO(base64.b64decode(payload)), format="png")
        if pixels.dtype.kind == "f":
            # Float pixel values are in the 0-1 range; scale to 8-bit.
            pixels = (pixels * 255).round().astype("uint8")
        return pixels

    # Update outputs when the button is clicked
    def update_outputs(selected_models, selected_column):
        try:
            results = eda_dashboard(selected_models, selected_column)
        except Exception as exc:
            # Show the failure reason in the UI instead of a generic error.
            raise gr.Error(str(exc)) from exc

        accuracies = results["Model Accuracies"]
        if isinstance(accuracies, str):
            # gr.JSON parses strings as JSON text; wrap plain messages.
            accuracies = {"message": accuracies}

        return (
            pd.DataFrame(results["Dataset Preview"]),   # records -> table
            pd.DataFrame(results["Basic Statistics"]),  # records -> table
            str(results["Dropped Rows"]),
            accuracies,
            decode_plot(results["Histogram"]),
            decode_plot(results["Correlation Matrix"])
        )

    load_data_button.click(fn=update_outputs, inputs=[selected_models, selected_column], outputs=[
        dataset_preview_output, basic_stats_output, dropped_rows_output, model_accuracies_output, histogram_output, correlation_output
    ])

# Launch the Gradio app
demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c6afa966bcc337cda3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [38]:
import time  # to simulate a real time data, time loop
import numpy as np  # np mean, np random
import pandas as pd  # read csv, df manipulation
import plotly.express as px  # interactive charts

def get_data() -> pd.DataFrame:
    """Read the bank-marketing CSV from its GitHub URL into a DataFrame."""
    source_url = "https://raw.githubusercontent.com/Lexie88rus/bank-marketing-analysis/master/bank.csv"
    frame = pd.read_csv(source_url)
    return frame

# Load the dataset once; the cells below inspect this frame.
df = get_data()


In [39]:
# Show the loaded dataset (long frames are abbreviated in the rendered output).
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,blue-collar,single,primary,no,1,yes,no,cellular,20,apr,257,1,-1,0,unknown,no
11158,39,services,married,secondary,no,733,no,no,unknown,16,jun,83,4,-1,0,unknown,no
11159,32,technician,single,secondary,no,29,no,no,cellular,19,aug,156,2,-1,0,unknown,no
11160,43,technician,married,secondary,no,0,no,yes,cellular,8,may,9,2,172,5,failure,no


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0
mean,41.231948,1528.538524,15.658036,371.993818,2.508421,51.330407,0.832557
std,11.913369,3225.413326,8.42074,347.128386,2.722077,108.758282,2.292007
min,18.0,-6847.0,1.0,2.0,1.0,-1.0,0.0
25%,32.0,122.0,8.0,138.0,1.0,-1.0,0.0
50%,39.0,550.0,15.0,255.0,2.0,-1.0,0.0
75%,49.0,1708.0,22.0,496.0,3.0,20.75,1.0
max,95.0,81204.0,31.0,3881.0,63.0,854.0,58.0


# Gradio EDA Dashboard - Customer segmentation

