<a href="https://colab.research.google.com/github/KashifAliLashari/datalysis/blob/main/datalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import streamlit as st
import sys
import importlib
import io

# Page setup: the page configuration is issued first, then the main heading.
st.set_page_config(page_title="Dataset Visualizer")
st.title("Dataset Visualizer")

# Sidebar panel showing interpreter details for troubleshooting.
with st.sidebar:
    st.title("Debug Info")
    st.write(f"Python version: {sys.version}")

def import_and_log(module_name):
    """Import ``module_name`` and report its version in the sidebar.

    The version is read from the module's ``__version__`` attribute. When a
    submodule (for example ``matplotlib.pyplot``) does not define one, the
    version of its top-level package is reported instead, labelled with the
    package name. If neither defines ``__version__`` the version is shown as
    ``unknown``.

    Args:
        module_name: Dotted import path of the module to load.

    Returns:
        The imported module object.

    If the import raises ``ImportError`` the failure is shown with
    ``st.error`` and ``st.stop()`` is called; no module is returned then.
    """
    # Keep the try body to the one call that is expected to fail.
    try:
        module = importlib.import_module(module_name)
    except ImportError as e:
        st.error(f"Failed to import {module_name}: {str(e)}")
        st.stop()
        return None

    label = module_name
    version = getattr(module, "__version__", None)

    if version is None and "." in module_name:
        # Submodules often carry no version of their own; fall back to the
        # top-level package, which the import above has already loaded.
        root_name = module_name.partition(".")[0]
        root_version = getattr(
            importlib.import_module(root_name), "__version__", None
        )
        if root_version is not None:
            label, version = root_name, root_version

    shown_version = "unknown" if version is None else version
    st.sidebar.write(f"{label} version: {shown_version}")
    return module

# Load each third-party dependency through import_and_log so its version is
# listed in the sidebar. Names are bound in the same order as the paths.
np, pd, matplotlib, plt, sns, px = [
    import_and_log(dotted_path)
    for dotted_path in (
        "numpy",
        "pandas",
        "matplotlib",
        "matplotlib.pyplot",
        "seaborn",
        "plotly.express",
    )
]

# Warn when the installed NumPy/Pandas pair differs from the tested pair.
tested_pair = ("1.23.5", "1.5.3")
if (np.__version__, pd.__version__) != tested_pair:
    st.warning(f"Warning: You are using NumPy {np.__version__} and Pandas {pd.__version__}. "
               f"This app was tested with NumPy 1.23.5 and Pandas 1.5.3. "
               f"If you encounter issues, please try updating these libraries.")

# CSV upload widget; the main logic below runs only once a file is provided.
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

# Function to check file size
def check_file_size(file, max_size=5 * 1024 * 1024):
    """Return whether ``file`` is within the allowed upload size.

    The size is measured by seeking to the end of the stream; the stream is
    rewound to position 0 before returning so it can be read afterwards.

    Args:
        file: Seekable file-like object supporting ``seek`` and ``tell``.
        max_size: Largest accepted size in bytes. Defaults to 5 MB
            (5 * 1024 * 1024 bytes).

    Returns:
        True when the size does not exceed ``max_size``. Otherwise an error
        is shown with ``st.error`` and False is returned.
    """
    file.seek(0, 2)  # whence=2: move to the end of the stream
    file_size = file.tell()
    file.seek(0)

    if file_size <= max_size:
        return True

    bytes_per_mb = 1024 * 1024
    st.error(
        f"File size exceeds the limit of {max_size / bytes_per_mb:g} MB. "
        f"Your file is {file_size / bytes_per_mb:.2f} MB."
    )
    return False

# Enhanced data exploration function
def explore_dataset(df, preview_type="First 10 Rows"):
    """Render an exploratory summary of ``df`` on the Streamlit page.

    Sections, in order:
    1. Data preview
    2. Column details (dtype, non-null count, unique values, missing values)
    3. Missing-data table (percentage per column, highest first) and, when
       at least one value is missing, a bar chart of those percentages.

    Args:
        df: The pandas DataFrame to describe.
        preview_type: ``"First 10 Rows"`` (default) shows ``df.head(10)``,
            ``"Last 10 Rows"`` shows ``df.tail(10)``; any other value shows
            the whole frame.

    Returns:
        None. All output is written through Streamlit.
    """
    # Data preview
    st.subheader("📊 Dataset Preview")
    if preview_type == "First 10 Rows":
        preview = df.head(10)
    elif preview_type == "Last 10 Rows":
        preview = df.tail(10)
    else:
        preview = df
    st.dataframe(preview)

    # Computed once; used by both the column table and the percentages.
    missing_counts = df.isnull().sum()

    # Column details
    st.subheader("📝 Column Details")
    column_details = pd.DataFrame({
        'Column Name': df.columns,
        # Show dtype names as plain strings rather than dtype objects.
        'Data Type': df.dtypes.astype(str),
        'Non-Null Count': df.count(),
        'Unique Values': df.nunique(),
        'Missing Values': missing_counts,
    })
    st.dataframe(column_details)

    # Missing data overview
    st.subheader("❓ Missing Data Analysis")

    row_count = len(df)
    if row_count:
        missing_percentages = (missing_counts / row_count * 100).round(2)
    else:
        # An empty frame has nothing missing; avoid 0/0 producing NaN.
        missing_percentages = missing_counts.astype(float)

    missing_df = pd.DataFrame({
        'Column': missing_percentages.index,
        'Missing %': missing_percentages.values
    }).sort_values('Missing %', ascending=False)

    st.dataframe(missing_df)

    # Bar chart only when there is something to show.
    if missing_df['Missing %'].sum() > 0:
        fig, ax = plt.subplots(figsize=(10, 6))
        missing_percentages.plot(kind='bar', ax=ax)
        # Label through the explicit Axes/Figure instead of pyplot's
        # implicit "current figure" state.
        ax.set_title('Missing Values Percentage by Column')
        ax.set_xlabel('Columns')
        ax.set_ylabel('Missing %')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        st.pyplot(fig)
        # NOTE(review): the figure is not closed here, so it stays open in
        # pyplot after this call; a caller that cares about figure lifetime
        # has to close it.

# Main app logic
if uploaded_file is None:
    st.write("Please upload a CSV file to begin.")
elif check_file_size(uploaded_file):
    # Matplotlib figures produced during this script run. They are tracked
    # explicitly so the download uses a figure that was really drawn (never a
    # blank or stale "current" figure) and so every figure gets closed.
    run_figures = []
    try:
        # Read the CSV file
        df = pd.read_csv(uploaded_file)

        # Display basic information about the dataset
        st.write("Dataset Info:")
        st.write(f"Number of rows: {df.shape[0]}")
        st.write(f"Number of columns: {df.shape[1]}")

        # On-demand data exploration
        if st.button("Explore Dataset"):
            figs_before = set(plt.get_fignums())
            explore_dataset(df)
            # Adopt any figure the exploration step left open so it can be
            # offered for download and is closed with the others.
            run_figures.extend(
                plt.figure(num)
                for num in plt.get_fignums()
                if num not in figs_before
            )

        # Visualization options
        viz_option = st.radio(
            "Choose a visualization:",
            ("Data Overview", "Correlation Heatmap", "Distribution Plot", "Scatter Plot")
        )

        # Every numeric dtype, not only 64-bit ints/floats.
        numeric_cols = df.select_dtypes(include="number").columns

        if viz_option in ["Distribution Plot", "Scatter Plot"]:
            if numeric_cols.empty:
                # Without numeric columns the axis selectors have no options.
                st.warning("This visualization needs at least one numeric column.")
            else:
                x_col = st.selectbox("Select X-axis:", numeric_cols, index=0)
                y_col = st.selectbox(
                    "Select Y-axis:",
                    numeric_cols,
                    index=1 if len(numeric_cols) > 1 else 0,
                )

                if st.button("Submit"):
                    if viz_option == "Distribution Plot":
                        fig = px.histogram(df, x=x_col, y=y_col, marginal="box")
                    else:
                        fig = px.scatter(df, x=x_col, y=y_col, trendline="ols")
                    # Plotly figures are rendered here and are not part of
                    # the PNG download below.
                    st.plotly_chart(fig)

        elif viz_option == "Data Overview":
            if st.button("Generate Visualization"):
                st.write(df.describe())
                fig, ax = plt.subplots(figsize=(10, 6))
                run_figures.append(fig)
                df.plot(kind='box', ax=ax)
                st.pyplot(fig)

        elif viz_option == "Correlation Heatmap":
            if st.button("Generate Visualization"):
                fig, ax = plt.subplots(figsize=(10, 8))
                run_figures.append(fig)
                # Restrict to numeric/bool columns explicitly: calling
                # corr() on a frame with text columns raises on newer pandas.
                corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()
                sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
                st.pyplot(fig)

        # Download option: offered only when a matplotlib figure was drawn.
        if run_figures:
            buf = io.BytesIO()
            run_figures[-1].savefig(buf, format='png')
            st.download_button(
                label="Download Visualization",
                data=buf.getvalue(),
                file_name="visualization.png",
                mime="image/png"
            )
    except Exception as e:
        # Top-level boundary of the app: report the problem to the user.
        st.error(f"An error occurred: {str(e)}")
        st.error("Please check your CSV file and try again.")
    finally:
        # Figures have already been rendered; release them so they do not
        # accumulate across reruns.
        for open_fig in run_figures:
            plt.close(open_fig)