In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # Make sure numpy is imported

# Load the uploaded CSV file
# Exploratory data analysis of the gridded dataset: prints descriptive
# statistics, then saves histograms, box plots, a mean-target time series and
# a correlation heatmap as PNG files in the current working directory.
# All errors are reported via print() rather than propagated, so running this
# cell never raises.

# Single source of truth for the input path, so the load call and the
# "file not found" message cannot drift apart.
csv_path = "data/full.csv"

try:
    df_full = pd.read_csv(csv_path)
    df_full['time'] = pd.to_datetime(df_full['time'])
    print(f"Data loaded successfully. Shape: {df_full.shape}")
    print(f"Time range: {df_full['time'].min()} to {df_full['time'].max()}")
    print(f"Columns: {df_full.columns.tolist()}")

    # --- EDA Steps ---

    # 1. Descriptive statistics
    print("\n--- Descriptive Statistics ---")
    print(df_full.describe())

    # 2. Distribution of the target variable and a few key predictors
    target_variable = 'spei'
    predictor_cols_for_dist_analysis = ['tmp', 'pre', 'vap', 'dtr', 'soi', 'cld']

    # Keep the *requested* list unfiltered here: filtering it up front would
    # make the missing-column warning below impossible to trigger.
    cols_to_analyze = [target_variable] + predictor_cols_for_dist_analysis

    print(f"\n--- Analyzing distributions for: {cols_to_analyze} ---")

    # Restrict to the columns that actually exist and report the rest.
    actual_cols_to_analyze = [col for col in cols_to_analyze if col in df_full.columns]
    if len(actual_cols_to_analyze) < len(cols_to_analyze):
        missing_cols_dist = set(cols_to_analyze) - set(actual_cols_to_analyze)
        print(f"Warning: The following columns for distribution EDA are not in the DataFrame and will be skipped: {missing_cols_dist}")

    if not actual_cols_to_analyze:
        print("No valid columns found for distribution analysis. Skipping this EDA step.")
    else:
        # Two plots per row; round the row count up so an odd number of
        # columns still gets a slot for its last plot.
        num_cols_hist = len(actual_cols_to_analyze)
        num_rows_hist = (num_cols_hist + 1) // 2

        # Histograms (with KDE overlay), one subplot per column
        plt.figure(figsize=(15, num_rows_hist * 4))
        for i, col in enumerate(actual_cols_to_analyze):
            plt.subplot(num_rows_hist, 2, i + 1)
            sns.histplot(df_full[col], kde=True, bins=50)
            plt.title(f'Histogram of {col}')
            plt.xlabel(col)
            plt.ylabel('Frequency')
        plt.tight_layout(pad=2.0)
        plt.savefig("histograms_eda.png")
        print("Histograms saved to histograms_eda.png")
        plt.close()  # release the figure once it is written to disk

        # Box plots, same grid layout as the histograms
        plt.figure(figsize=(15, num_rows_hist * 4))
        for i, col in enumerate(actual_cols_to_analyze):
            plt.subplot(num_rows_hist, 2, i + 1)
            sns.boxplot(y=df_full[col])
            plt.title(f'Box Plot of {col}')
            plt.ylabel(col)
        plt.tight_layout(pad=2.0)
        plt.savefig("boxplots_eda.png")
        print("Box plots saved to boxplots_eda.png")
        plt.close()

    # 3. Time series of the target, averaged over all rows sharing a timestamp
    if target_variable in df_full.columns and 'time' in df_full.columns:
        print("\n--- Analyzing time series of mean SPEI ---")
        # Group directly on the 'time' column (no index/resample needed);
        # timestamps whose mean is NaN are dropped before plotting.
        mean_spei_over_time = df_full.groupby('time')[target_variable].mean().dropna()

        plt.figure(figsize=(15, 6))
        mean_spei_over_time.plot()
        plt.title(f'Mean {target_variable.upper()} Over Time (Averaged Across All Locations)')
        plt.xlabel('Time')
        plt.ylabel(f'Mean {target_variable.upper()}')
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(f"mean_{target_variable}_timeseries_eda.png")
        print(f"Mean {target_variable} time series plot saved to mean_{target_variable}_timeseries_eda.png")
        plt.close()
    else:
        print(f"Could not generate {target_variable} time series plot: '{target_variable}' or 'time' column missing.")

    # 4. Correlation matrix for the target and the candidate predictors
    potential_predictors_for_corr = ['tmp', 'dtr', 'cld', 'tmx', 'tmn', 'wet', 'vap', 'soi', 'dmi', 'pdo', 'nino4', 'nino34', 'nino3', 'pre', 'pet']

    valid_predictors_for_corr = [col for col in potential_predictors_for_corr if col in df_full.columns]

    if target_variable in df_full.columns and valid_predictors_for_corr:
        cols_for_corr_analysis = [target_variable] + valid_predictors_for_corr
        print(f"\n--- Analyzing correlation matrix for: {cols_for_corr_analysis} ---")

        # .corr() is only meaningful on numeric data, so drop anything else
        # and tell the user which columns were excluded.
        numeric_df_for_corr = df_full[cols_for_corr_analysis].select_dtypes(include=np.number)
        if len(numeric_df_for_corr.columns) != len(cols_for_corr_analysis):
            missing_corr_cols = set(cols_for_corr_analysis) - set(numeric_df_for_corr.columns)
            print(f"Warning: Some columns selected for correlation are not numeric and will be excluded: {missing_corr_cols}")

        if not numeric_df_for_corr.empty and target_variable in numeric_df_for_corr.columns:
            correlation_matrix = numeric_df_for_corr.corr()

            plt.figure(figsize=(14, 12))
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, annot_kws={"size": 8})
            plt.title(f'Correlation Matrix of {target_variable.upper()} and Potential Predictors')
            plt.xticks(rotation=45, ha='right', fontsize=9)
            plt.yticks(rotation=0, fontsize=9)
            plt.tight_layout(pad=2.0)
            plt.savefig("correlation_matrix_eda.png")
            print("Correlation matrix heatmap saved to correlation_matrix_eda.png")
            plt.close()

            # Ranked view of each column's correlation with the target
            print(f"\nCorrelation with {target_variable.upper()}:")
            print(correlation_matrix[target_variable].sort_values(ascending=False))
        else:
            print("No numeric columns or target variable not found for correlation analysis.")

    else:
        print(f"Could not generate correlation matrix: '{target_variable}' or predictor columns missing/invalid.")

    print("\nEDA Script finished.")

except FileNotFoundError as e:
    # Report the path that actually failed; fall back to the input path when
    # the exception carries no filename.
    print(f"ERROR: The file '{e.filename or csv_path}' was not found. Please ensure it has been uploaded correctly.")
except ImportError as e:
    print(f"ERROR: A required library is not installed: {e}")
except Exception as e:
    # Top-level boundary for the cell: print the full traceback so the failure
    # is still diagnosable even though it is not re-raised.
    print(f"An error occurred during EDA: {e}")
    import traceback
    print(traceback.format_exc())

Data loaded successfully. Shape: (264201, 19)
Time range: 1901-01-16 00:00:00 to 2023-12-16 00:00:00
Columns: ['lon', 'lat', 'time', 'tmp', 'dtr', 'cld', 'tmx', 'tmn', 'pre', 'wet', 'vap', 'spei', 'soi', 'dmi', 'pdo', 'nino4', 'nino34', 'nino3', 'pet']

--- Descriptive Statistics ---
                 lon            lat                           time  \
count  264201.000000  264201.000000                         264201   
mean      100.909193      14.979038  1962-07-01 13:23:44.883327424   
min        97.750000       6.250000            1901-01-16 00:00:00   
25%        99.250000      13.750000            1931-09-16 00:00:00   
50%       100.250000      15.750000            1962-07-16 00:00:00   
75%       102.250000      17.250000            1993-04-16 00:00:00   
max       105.250000      20.250000            2023-12-16 00:00:00   
std         1.897804       3.368135                            NaN   

                 tmp            dtr            cld            tmx  \
count  264201.0