# In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Page chrome: wide layout, title, and a short introduction.
st.set_page_config(layout="wide")
st.title("CROP YIELD PREDICTION APP")

# The two spaces before each "\n" are Markdown hard line breaks; they are
# spelled out explicitly so editors that trim trailing whitespace cannot
# remove them.
st.markdown(
    "\n"
    "Welcome to the Crop Yield Prediction App!  \n"
    "This tool guides you from dataset upload to model prediction through interactive analysis, cleaning, and visualization.  \n"
    "Upload your dataset or use a sample to begin your journey!\n"
)

# Sidebar: the selected step drives the if/elif chain further down.
st.sidebar.title("Navigation")
options = st.sidebar.radio(
    label="Select Step:",
    options=[
        "Upload Data",
        "Data Cleaning",
        "EDA",
        "Visualization",
        "Prediction",
        "Insights",
    ],
)

# Make sure every key the steps rely on exists in session state. Keys that
# are already present (set during an earlier rerun) are left untouched.
for key in ("df", "cleaned_df", "target", "model_type", "model", "report"):
    if key in st.session_state:
        continue
    st.session_state[key] = None

# Columns a dataset must contain to be accepted by the upload step.
expected_columns = [
    "Area", "Item", "Year",
    "hg/ha_yield",
    "average_rain_fall_mm_per_year",
    "pesticides_tonnes",
    "avg_temp",
]

# Step 1: Upload or Load Dataset
# Reads a user-uploaded CSV/Excel file or the default "yield_df.csv",
# checks it against `expected_columns`, stores the accepted frame in
# st.session_state.df, and renders summary tables for it.
if options == "Upload Data":
    st.header("Upload or Select Dataset")

    df = None  # Stays None until a dataset has been read successfully

    # Two sources: the user's own file, or the default CSV in the working folder
    data_source = st.radio("Choose a data source:", ['Upload your dataset', 'Use default dataset'])

    # CASE 1: Uploading a custom dataset
    if data_source == 'Upload your dataset':
        uploaded_file = st.file_uploader("Upload CSV or Excel file", type=["csv", "xlsx"])

        if uploaded_file:
            try:
                # Compare the extension case-insensitively so a file named
                # "DATA.CSV" is parsed as CSV rather than passed to the
                # Excel reader.
                if uploaded_file.name.lower().endswith(".csv"):
                    df = pd.read_csv(uploaded_file)
                else:
                    df = pd.read_excel(uploaded_file)
            except Exception as e:
                # UI boundary: report any read/parse failure to the user
                # instead of crashing the app.
                st.error(f"Error reading file: {e}")
                df = None  # Prevent later usage if file fails
            else:
                st.success("File uploaded and read successfully.")

    # CASE 2: Load default dataset from project directory
    else:
        try:
            df = pd.read_csv("yield_df.csv")  # Must be present in the working folder
            st.success("Default dataset loaded successfully.")
        except FileNotFoundError:
            st.error("Default dataset not found in the directory.")
            df = None

        # Link to an external example file
        st.markdown("Don't have a dataset? [Download Example CSV](https://raw.githubusercontent.com/datasciencedojo/datasets/master/Agricultural%20Production.csv)")

        # Header-only CSV built from expected_columns (no data rows)
        st.download_button(
            label="Download Example Dataset",
            data=pd.DataFrame(columns=expected_columns).to_csv(index=False),
            file_name='example_crop_data.csv',
            mime='text/csv'
        )

    # Validate and summarise only when a dataset was actually loaded
    if df is not None:
        missing_columns = [col for col in expected_columns if col not in df.columns]

        if missing_columns:
            # Reject the dataset; nothing is written to session state
            st.error("Dataset is missing the following required columns:")
            st.write(missing_columns)
        else:
            # Drop anything that is not part of the expected schema
            extra_columns = [col for col in df.columns if col not in expected_columns]
            if extra_columns:
                df = df[expected_columns]
                st.warning(f"Extra columns dropped: {extra_columns}")

            # Persist the accepted frame for the other steps
            st.session_state.df = df

            st.success("Dataset is valid and ready for analysis!")

            # First five rows
            st.subheader("Data Preview")
            st.dataframe(df.head())

            # Column dtypes, rendered as plain strings so the table does not
            # depend on how the display layer serialises dtype objects.
            st.subheader("Data Types")
            st.dataframe(pd.DataFrame(df.dtypes.astype(str), columns=["Data Type"]))

            # Row / column counts
            st.subheader("Dataset Overview")
            st.write(f"Rows: {df.shape[0]} | Columns: {df.shape[1]}")

            # Descriptive statistics for numeric and non-numeric columns
            st.subheader("Descriptive Statistics")
            st.dataframe(df.describe(include='all'))

            # Missing values per column, absolute and as a percentage of rows
            st.subheader("Missing Values")
            missing_df = pd.DataFrame(df.isna().sum(), columns=['Missing Values'])
            missing_df["Percentage"] = (missing_df['Missing Values'] / len(df)) * 100
            st.dataframe(missing_df)

            # Number of fully duplicated rows
            st.subheader("Duplicate Rows")
            st.write(f"Number of duplicate rows: {df.duplicated().sum()}")
            
# Step 2: Data Cleaning
elif options == "Data Cleaning":
    st.header("Data Cleaning")

    # Ensure a dataset is loaded before proceeding
    if st.session_state.df is not None:
        df = st.session_state.df

        st.subheader("Data Issues Detected")

        issues = []  # List to hold all detected data quality issues

        # 1. Check for Missing Values
        total_missing = df.isna().sum().sum()
        if total_missing > 0:
            issues.append(f"Missing values detected: {total_missing} total")

        # 2. Check for Duplicate Rows
        duplicates = df.duplicated().sum()
        if duplicates > 0:
            issues.append(f"Duplicate rows detected: {duplicates}")

        # 3. Outlier Detection using IQR method
        numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
        outlier_cols = []  # Keep track of columns that contain outliers

        for col in numeric_cols:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            outliers = df[(df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))]

            if not outliers.empty:
                issues.append(f"Potential outliers detected in '{col}'")
                outlier_cols.append(col)

        # Visualize detected outliers using boxplots
        if outlier_cols:
            st.write("The following columns have potential outliers:")
            for col in outlier_cols:
                fig, ax = plt.subplots(figsize=(5, 3))
                sns.boxplot(df[col], color='skyblue', ax=ax)
                ax.set_title(f'Outliers in {col}', fontsize=12)
                st.pyplot(fig)

        # 4. Check for numeric data stored as text
        for col in df.select_dtypes(include='object').columns:
            try:
                pd.to_numeric(df[col])  # Attempt conversion
                issues.append(f"Column '{col}' contains numeric data stored as text")
            except:
                pass  # If conversion fails, ignore

        # Display detected issues
        if issues:
            st.warning("The following data issues were found:")
            for issue in issues:
                st.markdown(f"- {issue}")
        else:
            st.success("No major data issues detected!")


        # Data Cleaning Interface
        st.subheader("Data Cleaning Options")

        # Let user choose multiple cleaning actions
        cleaning_options = st.multiselect(
            "Select cleaning actions to apply:",
            [
                "Remove duplicate rows",
                "Fill missing values (numeric)",
                "Fill missing values (categorical)",
                "Remove rows with missing values",
                "Remove columns with high missing values (>30%)",
                "Convert text to numeric where possible",
                "Remove outliers (for numeric columns)",
                "Standardize column names"
            ]
        )

        # Button to apply selected cleaning steps
        if st.button("Clean Data"):
            cleaned_df = df.copy()  # Work on a copy of the dataset

            # Remove duplicates
            if "Remove duplicate rows" in cleaning_options:
                cleaned_df = cleaned_df.drop_duplicates()

            # Fill missing numeric values with column mean
            if "Fill missing values (numeric)" in cleaning_options:
                from sklearn.impute import SimpleImputer
                numeric_cols = cleaned_df.select_dtypes(include=['int64', 'float64']).columns
                imputer = SimpleImputer(strategy='mean')
                cleaned_df[numeric_cols] = imputer.fit_transform(cleaned_df[numeric_cols])

            # Fill missing categorical values with mode
            if "Fill missing values (categorical)" in cleaning_options:
                cat_cols = cleaned_df.select_dtypes(include=['object']).columns
                for col in cat_cols:
                    if cleaned_df[col].isnull().sum() > 0:
                        cleaned_df[col].fillna(cleaned_df[col].mode()[0], inplace=True)

            # Drop any row that still contains missing values
            if "Remove rows with missing values" in cleaning_options:
                cleaned_df.dropna(inplace=True)

            # Drop columns where more than 30% of values are missing
            if "Remove columns with high missing values (>30%)" in cleaning_options:
                threshold = len(cleaned_df) * 0.3
                cleaned_df.dropna(axis=1, thresh=threshold, inplace=True)

            # Convert any column with numeric text to actual numeric type
            if "Convert text to numeric where possible" in cleaning_options:
                for col in cleaned_df.columns:
                    if cleaned_df[col].dtype == 'object':
                        try:
                            cleaned_df[col] = pd.to_numeric(cleaned_df[col])
                        except:
                            pass  # Ignore conversion errors

            # Remove outliers using the 3-standard-deviation rule
            if "Remove outliers (for numeric columns)" in cleaning_options:
                for col in cleaned_df.select_dtypes(include=['int64', 'float64']).columns:
                    mean = cleaned_df[col].mean()
                    std = cleaned_df[col].std()
                    cleaned_df = cleaned_df[
                        (cleaned_df[col] <= mean + 3 * std) &
                        (cleaned_df[col] >= mean - 3 * std)
                    ]

            # Rename all columns to lowercase with underscores (standard format)
            if "Standardize column names" in cleaning_options:
                cleaned_df.columns = cleaned_df.columns.str.lower().str.strip().str.replace(" ", "_")

            # Save cleaned data to session state for reuse in later steps
            st.session_state.cleaned_df = cleaned_df

            # Display cleaning result
            st.success("Data cleaning completed successfully!")

            # Preview cleaned dataset
            st.subheader("Cleaned Data Preview")
            st.dataframe(cleaned_df.head())

            # Cleaning summary statistics
            st.subheader("Cleaning Summary")
            st.write(f"Original shape: {df.shape}")
            st.write(f"New shape: {cleaned_df.shape}")
            st.write(f"Rows removed: {df.shape[0] - cleaned_df.shape[0]}")
            st.write(f"Columns removed: {df.shape[1] - cleaned_df.shape[1]}")

    else:
        # Message shown if user tries to access this step before uploading data
        st.warning("Please upload a dataset first in the 'Upload Data' section.")


# # STEP 3: Data Visualization
# elif options == "Data Visualization":
#     st.header("Data Visualization")

#     if st.session_state.cleaned_df is not None:
#         df = st.session_state.cleaned_df
        
# st.subheader("📈 Exploratory Data Analysis")
# st.write("Select columns to visualize")

# numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
# selected_col = st.selectbox("Choose a numeric column", numeric_cols)

# if selected_col:
#     fig, ax = plt.subplots()
#     sns.histplot(df[selected_col], kde=True, ax=ax)
#     st.pyplot(fig)

# # STEP 4: Preprocessing
# st.subheader("⚙️ Data Preprocessing")

# target_col = st.selectbox("🎯 Select Target Column", df.columns)
# X = df.drop(columns=[target_col])
# y = df[target_col]

# # Encode categorical features
# for col in X.select_dtypes(include="object").columns:
#     le = LabelEncoder()
#     X[col] = le.fit_transform(X[col])

# # Encode target if needed
# if y.dtype == 'object':
#     y = LabelEncoder().fit_transform(y)

# # Scaling features
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# st.success("✅ Preprocessing complete!")

# # STEP 5: Model Training
# st.subheader("🤖 Model Development")

# test_size = st.slider("Select test size", 0.1, 0.5, 0.2)
# random_state = st.number_input("Random state (for reproducibility)", value=42)

# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_size, random_state=int(random_state))

# model = RandomForestClassifier()
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)

# # STEP 6: Evaluation
# st.subheader("📋 Model Evaluation")
# st.write("Accuracy:", accuracy_score(y_test, y_pred))
# st.text("Classification Report:")
# st.text(classification_report(y_test, y_pred))

# # Predict with user input
# st.subheader("📝 Make a Prediction")
# input_data = {}
# for col in df.drop(columns=[target_col]).columns:
#     value = st.text_input(f"Enter value for {col}")
#     input_data[col] = value

# if st.button("Predict"):
#     input_df = pd.DataFrame([input_data])

#     for col in input_df.columns:
#         if input_df[col].dtype == 'object':
#             input_df[col] = LabelEncoder().fit(df[col]).transform(input_df[col])

#     input_df_scaled = scaler.transform(input_df)
#     prediction = model.predict(input_df_scaled)
#     st.success(f"🎉 Predicted class: {prediction[0]}")
