In [3]:
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Page setup: wide layout for the side-by-side charts, seaborn grid theme.
st.set_page_config(layout="wide")
sns.set(style="whitegrid")

# Load dataset
df = pd.read_csv("yield_df.csv")
# errors='ignore': the 'Unnamed: 0' column only exists when the CSV was saved
# with its index; a file written without it should still load.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df = df.rename(columns={'Item': 'Crop'})

# Sidebar filters
st.sidebar.header("Filters")
# Drop missing labels before sorting: sorted() cannot compare NaN with str.
country_options = sorted(df['Area'].dropna().unique())
crop_options = sorted(df['Crop'].dropna().unique())
selected_country = st.sidebar.selectbox("Select a Country", country_options)
# NOTE(review): selected_crop is not read by any chart in the visible part of
# this script — confirm whether a crop filter was meant to be applied.
selected_crop = st.sidebar.selectbox("Select a Crop", crop_options)

# Title
st.title("🌾 Crop Yield & Environmental Dashboard")

# Section 1: Pesticide Usage
st.header("1️⃣ Pesticide Usage Overview")

with st.expander("Top Countries by Pesticide Use"):
    # Summed pesticide tonnage per country, keeping the ten largest totals.
    pesticide_by_country = (
        df.groupby('Area')['pesticides_tonnes']
        .sum()
        .sort_values(ascending=False)
        .head(10)
    )
    fig, ax = plt.subplots()
    sns.barplot(x=pesticide_by_country.values, y=pesticide_by_country.index, palette="Reds_r", ax=ax)
    ax.set_title("Top 10 Countries by Pesticide Use")
    ax.set_xlabel("Total Pesticides Used (tonnes)")
    ax.set_ylabel("Country")
    st.pyplot(fig)
    # pyplot keeps every figure it creates registered until it is closed, and
    # this script is re-executed on each widget interaction; close the figure
    # once rendered so they do not pile up.
    plt.close(fig)

with st.expander("Global Pesticide Use Over Time"):
    # Summed pesticide tonnage across all rows for each year.
    pesticide_by_year = df.groupby('Year')['pesticides_tonnes'].sum()
    fig, ax = plt.subplots()
    sns.lineplot(x=pesticide_by_year.index, y=pesticide_by_year.values, marker='o', ax=ax)
    ax.set_title("Pesticide Use Over Time")
    ax.set_xlabel("Year")
    ax.set_ylabel("Pesticide Tonnes")
    st.pyplot(fig)
    plt.close(fig)  # release the rendered figure (see note above)

# Section 2: Yield & Environment
st.header("2️⃣ Yield vs Environment")

with st.expander("Effect of Pesticide on Crop Yield"):
    # One point per dataset row, coloured by crop.
    fig, ax = plt.subplots()
    sns.scatterplot(data=df, x='pesticides_tonnes', y='hg/ha_yield', hue='Crop', ax=ax)
    ax.set_title("Pesticide vs Crop Yield")
    ax.set_xlabel("Pesticides (tonnes)")
    ax.set_ylabel("Yield (hg/ha)")
    st.pyplot(fig)
    # pyplot keeps figures registered until closed and the script reruns on
    # every interaction, so close each figure after it has been rendered.
    plt.close(fig)

with st.expander("Effect of Temperature on Yield"):
    fig, ax = plt.subplots()
    sns.scatterplot(data=df, x='avg_temp', y='hg/ha_yield', hue='Crop', ax=ax)
    ax.set_title("Temperature vs Yield")
    ax.set_xlabel("Temperature (°C)")
    ax.set_ylabel("Yield (hg/ha)")
    st.pyplot(fig)
    plt.close(fig)  # release the rendered figure (see note above)

# Section 3: Rainfall & Temperature
st.header("3️⃣ Rainfall and Temperature")

# Two equal-width columns: temperature on the left, rainfall on the right.
col1, col2 = st.columns(2)

with col1:
    st.subheader("Average Temperature by Year")
    # Mean of avg_temp over all rows of each year.
    avg_temp_year = df.groupby('Year')['avg_temp'].mean()
    fig, ax = plt.subplots()
    sns.lineplot(x=avg_temp_year.index, y=avg_temp_year.values, marker='o', ax=ax)
    ax.set_title("Avg. Temperature Over Years")
    ax.set_xlabel("Year")
    ax.set_ylabel("Temperature (°C)")
    st.pyplot(fig)
    # pyplot keeps figures registered until closed and the script reruns on
    # every interaction, so close each figure after it has been rendered.
    plt.close(fig)

with col2:
    st.subheader("Average Rainfall by Year")
    # Mean of the rainfall column over all rows of each year.
    avg_rain_year = df.groupby('Year')['average_rain_fall_mm_per_year'].mean()
    fig, ax = plt.subplots()
    sns.lineplot(x=avg_rain_year.index, y=avg_rain_year.values, marker='o', ax=ax)
    ax.set_title("Avg. Rainfall Over Years")
    ax.set_xlabel("Year")
    ax.set_ylabel("Rainfall (mm)")
    st.pyplot(fig)
    plt.close(fig)  # release the rendered figure (see note above)

# Section 4: Country-specific Yield Trends
st.header("4️⃣ Country-Specific Yield Trends")

# Rows for the country picked in the sidebar; one line per crop.
filtered = df[df['Area'] == selected_country]
fig, ax = plt.subplots()
sns.lineplot(data=filtered, x='Year', y='hg/ha_yield', hue='Crop', marker='o', ax=ax)
ax.set_title(f"Crop Yield Over Time in {selected_country}")
ax.set_xlabel("Year")
ax.set_ylabel("Yield (hg/ha)")
st.pyplot(fig)
# pyplot keeps figures registered until closed and the script reruns on every
# interaction, so close the figure after it has been rendered.
plt.close(fig)

# Section 5: Correlation Heatmap
st.header("5️⃣ Correlation Heatmap")

st.write("Explore correlations between environmental factors and crop yield.")
# Restrict to the four numeric columns of interest and drop incomplete rows
# before computing pairwise correlations.
corr_columns = ['hg/ha_yield', 'pesticides_tonnes', 'average_rain_fall_mm_per_year', 'avg_temp']
corr_df = df[corr_columns].dropna()
fig, ax = plt.subplots()
sns.heatmap(corr_df.corr(), annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
st.pyplot(fig)
# pyplot keeps figures registered until closed and the script reruns on every
# interaction, so close the figure after it has been rendered.
plt.close(fig)

# Section 6: Crop with Most Pesticide
st.header("6️⃣ Crops With Most Pesticide Used")
# Summed pesticide tonnage per crop, keeping the ten largest totals.
pesticide_by_crop = (
    df.groupby('Crop')['pesticides_tonnes']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
fig, ax = plt.subplots()
sns.barplot(x=pesticide_by_crop.values, y=pesticide_by_crop.index, palette="YlOrBr", ax=ax)
ax.set_title("Top 10 Crops by Pesticide Use")
# Label the axes explicitly, matching the per-country bar chart in Section 1
# (the x values are a bare array, so seaborn has no name to label them with).
ax.set_xlabel("Total Pesticides Used (tonnes)")
ax.set_ylabel("Crop")
st.pyplot(fig)
# pyplot keeps figures registered until closed and the script reruns on every
# interaction, so close the figure after it has been rendered.
plt.close(fig)

# Section 7: Highlight Hottest Country
# Compute the per-country mean temperature once; both the winning label and
# its value are read from the same Series.
mean_temp_by_country = df.groupby('Area')['avg_temp'].mean()
hottest_country = mean_temp_by_country.idxmax()
max_temp = mean_temp_by_country.max()
st.success(f"🔥 The hottest country on average is **{hottest_country}** with **{max_temp:.2f}°C**.")





DeltaGenerator()

In [None]:
# STEP 4: Preprocessing
elif options == "Preprocessing":
    st.header("Data Preprocessing")

    # Ensure the cleaned dataframe exists in session state
    if st.session_state.cleaned_df is not None:
        df = st.session_state.cleaned_df.copy()
        st.subheader("Encoding Categorical Features")
        # Identify categorical columns
        categorical_columns = df.select_dtypes(include="object").columns.tolist()
        # Encode them
        df = encode_categorical(df, categorical_columns)
        st.success("Categorical features encoded successfully!")
        st.dataframe(df.head())

        st.subheader("Correlation Analysis")
        # Compute correlation matrix
        correlation_matrix = df.corr()
        # Visualize correlation matrix
        fig, ax = plt.subplots(figsize=(7, 7))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
        ax.set_title('Heatmap of Correlation Matrix', fontsize=18)
        st.pyplot(fig)

        # Check for highly correlated feature pairs (above or below ±0.5)
        high_corr_columns = find_high_correlation_pairs(df.corr(), threshold=0.5)
        st.subheader("Highly Correlated Feature Pairs ≥ ±0.5")

        if high_corr_columns:
            for col1, col2, corr_val in high_corr_columns:
                st.write(f"Correlation between `{col1}` and `{col2}` is `{corr_val:.2f}`")
        else:
            st.info("No highly correlated feature pairs found.")

        # Drop correlated features
        df, dropped_cols = drop_highly_correlated_features(df, correlation_matrix, threshold=0.5)

        # Display dropped columns if any
        if dropped_cols:
            st.subheader("Dropped Highly Correlated Columns")
            st.write(f"Columns dropped due to high correlation (>|0.5|): `{', '.join(dropped_cols)}`")
        else:
            st.info("No highly correlated features were dropped.")

        st.subheader("Target and Feature Separation")
        # Ensure target column exists
        if 'Hg/ha_yield' in df.columns:
            st.write("Target Column Selected: `'Hg/ha_yield'`")
            # Split into features and target
            X = df.drop('Hg/ha_yield', axis=1)
            Y = df['Hg/ha_yield']

            st.write("Feature Columns:")
            st.write(X.columns)
        else:
            st.error("Target column `'Hg/ha_yield'` not found. Please check your dataset.")
            st.stop()

       
        st.success("Features normalized successfully!")
        # Store processed features and target in session state
        st.session_state.X_processed = X
        st.session_state.Y_processed = Y

           # Define numerical columns (you might want to make this configurable)
        numerical_columns = ['Average_rain_fall_mm_per_year', 'Pesticides_tonnes', 'Avg_temp', 'Year']
        
        # Process data
        splits = process_data(X, Y, numerical_columns)
        
        # Store splits in session state
        st.session_state.data_splits = splits
        
        # Display information about splits
        st.subheader("Data Splits Created")
        for split_name, data in splits.items():
            if split_name != 'transfer_learning':
                st.write(f"**{split_name}**: Train size: {len(data['X_train'])}, Test size: {len(data['X_test'])}")
            else:
                st.write(f"**{split_name}**: Adaptation size: {len(data['X_adapt'])}, Final test size: {len(data['X_test_unseen_final'])}")
        
        st.success("Data processing complete! All splits are ready for modeling.")

    else:
        # Data not available warning
        st.warning("Please upload a dataset first in the 'Upload Data' section.")

# Fallback: Prevents error if accessed before cleaning
elif options == "Preprocessing" and st.session_state.cleaned_df is None:
    st.warning("Please clean your data first in the 'Data Cleaning' section.")


# # STEP 5: Model Training
# --- Modeling Section ---
elif options == "Modeling":
    st.subheader("Model Training and Transfer Learning")
    
    # Check if data is available
    if 'data_splits' not in st.session_state:
        st.warning("Please process data first in the Data Processing section.")
        st.stop()
        
    # Get data splits
    splits = st.session_state.data_splits
    
    # Prepare data for transfer learning
    X_train_final = splits['unseen_countries']['X_train']
    y_train_final = splits['unseen_countries']['y_train']
    sample_weights1 = splits['unseen_countries']['sample_weights']
    
    X_adapt = splits['transfer_learning']['X_adapt']
    y_adapt = splits['transfer_learning']['y_adapt']
    X_test_unseen_final = splits['transfer_learning']['X_test_unseen_final']
    y_test_unseen_final = splits['transfer_learning']['y_test_unseen_final']
    
    # Display data info
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Training Samples", len(X_train_final))
    with col2:
        st.metric("Adaptation Samples", len(X_adapt))
    with col3:
        st.metric("Test Samples", len(X_test_unseen_final))
    
    # Fixed best parameters from prototyping
    st.subheader("Model Parameters")
    
    # Random Forest parameters
    rf_params = {
        'n_estimators': 50,
        'max_depth': 10,
        'min_samples_split': 2,
        'random_state': 42}
    
    # XGBoost parameters
    xgb_params = {
        'n_estimators': 50,
        'max_depth': 6,
        'learning_rate': 0.5,
        'random_state': 42}
    
    st.write("**Random Forest Parameters**")
    st.write(f"- n_estimators: {rf_params['n_estimators']}")
    st.write(f"- max_depth: {rf_params['max_depth']}")
    st.write(f"- min_samples_split: {rf_params['min_samples_split']}")
    
    st.write("**XGBoost Parameters**")
    st.write(f"- n_estimators: {xgb_params['n_estimators']}")
    st.write(f"- max_depth: {xgb_params['max_depth']}")
    st.write(f"- learning_rate: {xgb_params['learning_rate']}")
    
    # Train source models
    if st.button("Train Source Models", type="primary"):
        st.info("Training source models with best parameters...")
        
        source_models = {}
        
        # Train Random Forest
        st.write("Training Random Forest...")
        rf_model = RandomForestRegressor(**rf_params)
        rf_model.fit(X_train_final, y_train_final, sample_weight=sample_weights1)
        source_models['Random Forest'] = rf_model
        st.success("Random Forest trained successfully!")
        
        # Train XGBoost
        st.write("Training XGBoost...")
        xgb_model = XGBRegressor(**xgb_params)
        xgb_model.fit(X_train_final, y_train_final, sample_weight=sample_weights1)
        source_models['XGBoost'] = xgb_model
        st.success("XGBoost trained successfully!")
        
        # Store source models in session state
        st.session_state.source_models = source_models
        st.success("All models trained successfully!")
        
        # Show source model performance
        st.subheader("Source Model Performance")
        
        for model_name, model in source_models.items():
            # Make predictions
            y_pred = model.predict(X_train_final)
            
            # Calculate metrics
            from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error
            
            r2 = r2_score(y_train_final, y_pred)
            mae = mean_absolute_error(y_train_final, y_pred)
            mape = mean_absolute_percentage_error(y_train_final, y_pred) * 100
            
            st.write(f"**{model_name}**: R² = {r2:.4f}, MAE = {mae:.2f}, MAPE = {mape:.2f}%")
    
    # Run transfer learning experiments (only if source models are available)
    if 'source_models' in st.session_state and st.session_state.source_models:
        st.subheader("Transfer Learning Experiments")
        
        if st.button("Run Transfer Learning Experiments", type="primary"):
            st.info("Running transfer learning experiments. This may take a while...")
            
            # Prepare models configuration
            models_config = []
            source_models = st.session_state.source_models
            
            # Add Random Forest
            models_config.append((
                'Random Forest', 
                source_models['Random Forest'], 
                RandomForestRegressor(**rf_params)
            ))
            
            # Add XGBoost
            models_config.append((
                'XGBoost', 
                source_models['XGBoost'], 
                XGBRegressor(**xgb_params)
            ))
            
            # Run experiments
            results_df, transfer_models = run_all_transfer_experiments(
                models_config, 
                X_train_final, y_train_final, 
                X_adapt, y_adapt, 
                X_test_unseen_final, y_test_unseen_final, 
                sample_weights1
            )
            
            # Store results and models
            st.session_state.transfer_results = results_df
            st.session_state.transfer_models = transfer_models
            
            # Display results
            st.subheader("Transfer Learning Results")
            st.dataframe(results_df.style.format({
                'No Transfer R²': '{:.4f}',
                'Transfer R²': '{:.4f}',
                'R² Improvement (pp)': '{:.2f}',
                'R² Improvement (%)': '{:.2f}',
                'No Transfer MAE': '{:.2f}',
                'Transfer MAE': '{:.2f}',
                'MAE Improvement': '{:.2f}',
                'MAE Reduction (%)': '{:.2f}',
                'No Transfer MAPE (%)': '{:.2f}',
                'Transfer MAPE (%)': '{:.2f}',
                'MAPE Improvement (%)': '{:.2f}',
                'MAPE Reduction (%)': '{:.2f}'
            }))
            
            # Show best model
            best_model_idx = results_df['Transfer R²'].idxmax()
            best_model = results_df.loc[best_model_idx, 'Model']
            st.success(f"Best performing model: {best_model} (R²: {results_df.loc[best_model_idx, 'Transfer R²']:.4f})")
            
            # Visualizations in expanders
            with st.expander("Model Performance Visualizations", expanded=True):
                st.subheader("Model Performance Comparison")
                
                # R² Improvement
                fig_r2_improvement = plot_metric_comparison(
                    results_df, 'R² Improvement (%)', 
                    'Percentage R² Improvement from Transfer Learning', 
                    'R² Improvement (%)', 'viridis'
                )
                st.pyplot(fig_r2_improvement)
                
                # MAE Reduction
                fig_mae_reduction = plot_metric_comparison(
                    results_df, 'MAE Reduction (%)', 
                    'Percentage MAE Reduction from Transfer Learning', 
                    'MAE Reduction (%)', 'viridis'
                )
                st.pyplot(fig_mae_reduction)
                
                # MAPE Reduction
                fig_mape_reduction = plot_metric_comparison(
                    results_df, 'MAPE Reduction (%)', 
                    'Percentage MAPE Reduction from Transfer Learning', 
                    'MAPE Reduction (%)', 'viridis'
                )
                st.pyplot(fig_mape_reduction)
                
                # Before/After Comparisons
                st.subheader("Before and After Transfer Learning")
                
                # R² Comparison
                fig_r2_comparison = plot_before_after_comparison(
                    results_df, 'No Transfer R²', 'Transfer R²',
                    'R² Comparison: Transfer Learning vs No Transfer',
                    'R² Score'
                )
                st.pyplot(fig_r2_comparison)
                
                # MAE Comparison
                fig_mae_comparison = plot_before_after_comparison(
                    results_df, 'No Transfer MAE', 'Transfer MAE',
                    'MAE Comparison: Transfer Learning vs No Transfer',
                    'MAE'
                )
                st.pyplot(fig_mae_comparison)
                
                # MAPE Comparison
                fig_mape_comparison = plot_before_after_comparison(
                    results_df, 'No Transfer MAPE (%)', 'Transfer MAPE (%)',
                    'MAPE Comparison: Transfer Learning vs No Transfer',
                    'MAPE (%)'
                )
                st.pyplot(fig_mape_comparison)
            
            # Feature Importance
            with st.expander("Feature Importance Analysis", expanded=False):
                st.subheader("Feature Importance")
                
                # For tree-based models
                if 'Random Forest' in transfer_models:
                    rf_model = transfer_models['Random Forest']
                    feature_names = X_train_final.columns.tolist()
                    # Add source_pred feature for transfer models
                    feature_names.append('source_pred')
                    
                    fig_rf_importance = plot_feature_importance(
                        rf_model, feature_names, 'Random Forest'
                    )
                    st.pyplot(fig_rf_importance)
                
                if 'XGBoost' in transfer_models:
                    xgb_model = transfer_models['XGBoost']
                    feature_names = X_train_final.columns.tolist()
                    # Add source_pred feature for transfer models
                    feature_names.append('source_pred')
                    
                    fig_xgb_importance = plot_feature_importance(
                        xgb_model, feature_names, 'XGBoost'
                    )
                    st.pyplot(fig_xgb_importance)
            
            # Individual Model Performance
            with st.expander("Individual Model Performance", expanded=False):
                st.subheader("Detailed Model Performance")
                
                for model_name in transfer_models:
                    # Get predictions for this model
                    model = transfer_models[model_name]
                    
                    # Create enhanced test set for transfer model
                    X_test_transfer = X_test_unseen_final.copy()
                    source_model = source_models[model_name]
                    X_test_transfer['source_pred'] = source_model.predict(X_test_unseen_final)
                    
                    # Get predictions
                    y_pred = model.predict(X_test_transfer)
                    
                    # Plot performance
                    fig_performance = plot_model_performance(
                        y_test_unseen_final, y_pred, model_name
                    )
                    st.pyplot(fig_performance)
            
            # Download results
            csv = results_df.to_csv(index=False)
            st.download_button(
                label="Download Results as CSV",
                data=csv,
                file_name="transfer_learning_results.csv",
                mime="text/csv"
            )
    else:
        st.info("Please train source models first before running transfer learning experiments.")
