# In [None]:
import dash
from dash import dcc, html
import plotly.express as px
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
import seaborn as sns

# Initialize Dash app
app = dash.Dash(__name__)

# Load the dataset; the path is resolved relative to the working directory.
df = pd.read_csv("preprocessed_ecommerce_dataset.csv")

# Fill missing values with each column's mean. numeric_only=True restricts the
# mean to numeric columns: on pandas >= 2.0 a bare df.mean() raises TypeError
# when a non-numeric column is present, whereas here such columns are left
# untouched. For an all-numeric frame the result is unchanged.
df_clean = df.fillna(df.mean(numeric_only=True))

# Name of the regression target column in the CSV.
TARGET_COLUMN = "0"

# Split the cleaned frame into features (everything but the target) and target.
X = df_clean.drop(columns=[TARGET_COLUMN])
y = df_clean[TARGET_COLUMN]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regressors, keyed by the display name used in the dashboard.
models = {
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
}

# Evaluate models
def evaluate_model(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        X_train: Training features passed to ``model.fit``.
        X_test: Held-out features passed to ``model.predict``.
        y_train: Training targets passed to ``model.fit``.
        y_test: Held-out targets the predictions are compared against.
        model: Estimator exposing ``fit`` and ``predict``; it is fitted
            in place.

    Returns:
        A ``(mse, r2)`` tuple: the mean squared error and the R-squared
        score of the predictions on the test split.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

# Fit and score every candidate model on the same train/test split.
# Each value is [mse, r2], matching the row labels of metrics_df below.
metrics_values = {}
for model_name, model in models.items():
    mse, r2 = evaluate_model(X_train, X_test, y_train, y_test, model)
    metrics_values[model_name] = [mse, r2]

# Create a DataFrame for metrics comparison (rows: metrics, columns: models)
metrics_df = pd.DataFrame(metrics_values, index=["Mean Squared Error", "R-squared"])

# Correlation Heatmap
correlation_matrix = df_clean.corr()
# Correlations lie in [-1, 1]; pinning the colour range makes the neutral
# midpoint of the diverging RdBu scale correspond to zero correlation.
fig_corr = px.imshow(
    correlation_matrix,
    color_continuous_scale='RdBu',
    zmin=-1,
    zmax=1,
    title="Correlation Heatmap",
)

# Feature Importance using Random Forest
# The forest in `models` was fitted on (X_train, y_train) by the loop above
# with the same random_state, so reuse it instead of training a second,
# identical model.
rf_model = models["Random Forest"]
importance = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

fig_feature_importance = px.bar(feature_importance_df, x='Feature', y='Importance', title="Feature Importance - Random Forest")

# PCA Plot: project the full feature matrix onto its first two components.
# NOTE(review): PCA is applied to X as-is; if the columns are not already
# standardized upstream, high-variance columns will dominate -- confirm.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
pca_df = pd.DataFrame(X_pca, columns=["PCA1", "PCA2"])
# fit_transform returns rows in the same order as X, so attach the target
# positionally rather than relying on the two indexes happening to align.
pca_df['Target'] = y.to_numpy()

fig_pca = px.scatter(pca_df, x="PCA1", y="PCA2", color="Target", title="PCA Plot")

# App layout
def _graph_section(heading, graph_id, figure):
    """Return a Div containing an H3 heading followed by one Graph."""
    return html.Div([
        html.H3(heading),
        dcc.Graph(id=graph_id, figure=figure),
    ])


# (heading, graph id, figure) for each dashboard section, in display order.
_dashboard_sections = [
    (
        "Model Performance Metrics",
        "model-metrics",
        px.bar(metrics_df.T, barmode='group', title="Model Performance Comparison"),
    ),
    (
        "Data Distribution - Before Preprocessing",
        "before-preprocessing-histogram",
        px.histogram(df, x=df.columns[0], nbins=30, title="Histogram of First Column (Before Preprocessing)"),
    ),
    (
        "Data Distribution - After Preprocessing",
        "after-preprocessing-histogram",
        px.histogram(df_clean, x=df_clean.columns[0], nbins=30, title="Histogram of First Column (After Preprocessing)"),
    ),
    ("Correlation Heatmap", "correlation-heatmap", fig_corr),
    ("Feature Importance - Random Forest", "feature-importance", fig_feature_importance),
    ("PCA Plot", "pca-plot", fig_pca),
]

# Page title first, then one heading-plus-graph section per entry above.
app.layout = html.Div(
    [html.H1("E-commerce Data Analysis Dashboard")]
    + [_graph_section(*section) for section in _dashboard_sections]
)

# Run the app
if __name__ == '__main__':
    # Dash 3 removed ``run_server`` in favour of ``run``, while some older 2.x
    # releases only provide ``run_server``. Look up ``run`` first so the
    # obsolete attribute is never touched on versions that reject it.
    run_app = getattr(app, "run", None) or app.run_server
    # debug=True enables the dev tools and hot reloading; not for production.
    run_app(debug=True)
