# Notebook cell In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
import mlflow
import mlflow.sklearn

# Streamlit page setup: browser-tab title, wide layout, sidebar open on load.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_title="Jewellery Price Optimization",
)

# Helper Functions
def split_data(df, target_column='Price_USD', test_size=0.3, random_state=42):
    """Separate the target column from the features and split into train/test sets.

    Args:
        df: DataFrame that contains ``target_column`` alongside the feature columns.
        target_column: Name of the column to predict.
        test_size: Passed through to ``train_test_split``.
        random_state: Passed through to ``train_test_split``.

    Returns:
        The result of ``train_test_split(features, target, ...)``, which callers
        unpack as ``X_train, X_test, y_train, y_test``.
    """
    features = df.drop(columns=[target_column])
    target = df[target_column]
    splits = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return splits

def create_pipeline():
    """Build the (unfitted) column-wise preprocessing transformer.

    The numerical column is mean-imputed and standardized; the categorical
    columns are imputed with the most frequent value and one-hot encoded,
    ignoring categories not seen during fitting.

    Returns:
        A ``ColumnTransformer`` with a ``'num'`` and a ``'cat'`` branch.
    """
    numeric_branch = (
        'num',
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ]),
        ['Brand_ID'],
    )
    categorical_branch = (
        'cat',
        Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ]),
        ['Category', 'Target_Gender', 'Main_Color', 'Main_Metal', 'Main_Gem'],
    )
    return ColumnTransformer(transformers=[numeric_branch, categorical_branch])

def preprocess_data(X_train, X_test, preprocessor):
    """Fit ``preprocessor`` on the training features and transform both splits.

    Args:
        X_train: Training features; used to fit the preprocessor.
        X_test: Test features; only transformed, never fitted on.
        preprocessor: Object exposing ``fit_transform`` and ``transform``.

    Returns:
        A ``(transformed_train, transformed_test)`` pair.
    """
    # Fit must happen before the test split is transformed.
    train_features = preprocessor.fit_transform(X_train)
    test_features = preprocessor.transform(X_test)
    return train_features, test_features

def evaluate_model(y_test, preds, model_name):
    """Score predictions against the true targets.

    Args:
        y_test: True target values.
        preds: Predicted target values.
        model_name: Label stored under the ``"Model"`` key of the result.

    Returns:
        A dict with keys ``"Model"``, ``"MAE"``, ``"MSE"`` and ``"R2"``.
    """
    scores = {
        "Model": model_name,
        "MAE": mean_absolute_error(y_test, preds),
        "MSE": mean_squared_error(y_test, preds),
        "R2": r2_score(y_test, preds),
    }
    return scores

def log_model_with_mlflow(model, model_name, metrics):
    """Log a fitted model and its evaluation results to a new MLflow run.

    ``mlflow.log_metrics`` only accepts numeric values, but callers pass the
    dict produced by ``evaluate_model``, which also carries the string entry
    ``"Model"``. Numeric entries are therefore logged as metrics and every
    other entry is recorded as a run tag instead of aborting the run.

    Args:
        model: Fitted estimator to store with ``mlflow.sklearn.log_model``.
        model_name: Used both as the run name and as the model artifact path.
        metrics: Mapping of result names to values; may mix numbers and
            non-numeric labels.
    """
    import numbers

    numeric_metrics = {}
    tags = {}
    for key, value in metrics.items():
        # bool is a numbers.Real subclass but is not a meaningful metric value.
        if isinstance(value, numbers.Real) and not isinstance(value, bool):
            numeric_metrics[key] = float(value)
        else:
            tags[key] = str(value)

    with mlflow.start_run(run_name=model_name):
        if numeric_metrics:
            mlflow.log_metrics(numeric_metrics)
        if tags:
            mlflow.set_tags(tags)
        mlflow.sklearn.log_model(model, model_name)

# Main App
def _render_overview(df):
    """Render the "Overview" page: project blurb, data preview and summary."""
    import io

    st.subheader("Project Overview")
    st.write("""
        This application optimizes the pricing of jewellery products using machine learning models, 
        integrated with MLflow for tracking model performance.
        """)

    st.subheader("Dataset Preview")
    st.dataframe(df.head())

    st.subheader("Dataset Information")
    st.write(f"Shape: {df.shape}")
    # DataFrame.info() prints to stdout and returns None, so capture its
    # output in a buffer and show that text instead of rendering "None".
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    st.text(info_buffer.getvalue())


def _render_data_analysis(df):
    """Render the "Data Analysis" page: category charts split by gender."""
    st.subheader("Exploratory Data Analysis")

    # Category Distribution
    st.write("### Category Distribution by Gender")
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(16, 6))
    if 'Count' in df.columns:
        # A pre-aggregated 'Count' column is available: plot it directly.
        sns.barplot(data=df, x='Category', y='Count', hue='Target_Gender',
                    ax=bar_ax, palette="viridis")
    else:
        # No 'Count' column: count rows per category/gender instead of
        # failing on a missing column.
        sns.countplot(data=df, x='Category', hue='Target_Gender',
                      ax=bar_ax, palette="viridis")
    bar_ax.set_title("Bar Chart: Jewelry Category by Gender")

    category_counts = df['Category'].value_counts()
    pie_ax.pie(category_counts.values, labels=category_counts.index,
               autopct='%1.1f%%', startangle=140,
               colors=sns.color_palette("viridis", len(category_counts)))
    pie_ax.set_title("Pie Chart: Jewelry Category Distribution")

    st.pyplot(fig)
    # Streamlit reruns the script on every interaction; close the figure so
    # open matplotlib figures do not accumulate.
    plt.close(fig)


def _render_model_training(df, target_column='Price_USD'):
    """Render the "Model Training and Evaluation" page.

    Splits and preprocesses ``df``, fits each regressor, logs it to MLflow,
    and shows a metrics table plus an actual-vs-predicted scatter plot.
    """
    st.subheader("Model Training and Evaluation")

    # Split Data
    X_train, X_test, y_train, y_test = split_data(df, target_column)

    # Preprocess Data
    preprocessor = create_pipeline()
    X_train, X_test = preprocess_data(X_train, X_test, preprocessor)

    # Models and Fixed Parameters
    models = [
        (RandomForestRegressor(random_state=42, n_estimators=200, max_depth=30, min_samples_split=2), "Random Forest"),
        (XGBRegressor(random_state=42, learning_rate=0.2, max_depth=10, n_estimators=300), "XGBoost"),
        (GradientBoostingRegressor(random_state=42, learning_rate=0.2, max_depth=7, n_estimators=300), "Gradient Boosting"),
        (LGBMRegressor(random_state=42, max_depth=31, learning_rate=0.3, n_estimators=300), "LightGBM"),
    ]

    # Train and Evaluate Models
    results = []
    predictions = {}
    for model, model_name in models:
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        metrics = evaluate_model(y_test, preds, model_name)
        # MLflow metrics must be numeric; leave out string entries such as
        # the "Model" label before logging.
        loggable = {key: value for key, value in metrics.items()
                    if not isinstance(value, str)}
        log_model_with_mlflow(model, model_name, loggable)
        results.append(metrics)
        # Keep the predictions so the plot below does not predict again.
        predictions[model_name] = preds

    # Display Results
    st.write("### Model Performance Metrics")
    st.dataframe(pd.DataFrame(results))

    st.write("### Actual vs Predicted Plot")
    fig, ax = plt.subplots(figsize=(12, 6))
    for model_name, preds in predictions.items():
        ax.scatter(y_test, preds, alpha=0.5, label=model_name)
    # Diagonal reference line: points on it are perfect predictions.
    lowest, highest = y_test.min(), y_test.max()
    ax.plot([lowest, highest], [lowest, highest], color="red", linestyle="--", linewidth=2)
    ax.set_xlabel("Actual Prices")
    ax.set_ylabel("Predicted Prices")
    ax.set_title("Actual vs Predicted Prices")
    ax.legend()
    st.pyplot(fig)
    plt.close(fig)


def main():
    """Entry point: draw the title and sidebar, load the data, render a page."""
    st.title("Jewellery Price Optimization")
    st.sidebar.header("App Navigation")
    page = st.sidebar.radio("Choose a page", ["Overview", "Data Analysis", "Model Training and Evaluation"])

    # st.cache is deprecated in favour of st.cache_data; prefer the new
    # decorator when this Streamlit version provides it, else fall back.
    cache_decorator = getattr(st, "cache_data", None) or st.cache

    # Load and Display Dataset
    @cache_decorator
    def load_data():
        return pd.read_csv('jewellery_cleaned_data.csv')

    df = load_data()

    if page == "Overview":
        _render_overview(df)
    elif page == "Data Analysis":
        _render_data_analysis(df)
    elif page == "Model Training and Evaluation":
        _render_model_training(df, target_column='Price_USD')

# Run the App
# Only call main() when this file is executed as a script (not when imported).
if __name__ == "__main__":
    main()


# Notebook output: ModuleNotFoundError: No module named 'streamlit'