# In [None]:
'''
RESHAPE DATA (CRITICAL STEP) - COMPLETE CLEANING SOLUTION
'''

import pandas as pd
import numpy as np

def clean_and_reshape_maize_data(df, years=None):
    """
    Clean the raw maize production sheet and reshape it from wide to long format.

    The first two rows of ``df`` are discarded as header/metadata rows. The
    columns are then interpreted positionally as ``County`` followed by one
    (Area, Production, Yield) triple per entry of ``years``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with exactly ``1 + 3 * len(years)`` columns.
    years : sequence of int, optional
        Years in the order their column triples appear. Defaults to
        2012-2018 plus 2020 (2019 appears to be missing from the sheet).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(clean_df, data_df)``. ``clean_df`` is the long format with columns
        ``County, Year, Area_HA, Production_MT, Yield_MT_HA``; rows whose
        three measurements are all NaN are dropped. ``data_df`` is the wide
        format with columns ``County, Area_<year>, Production_<year>,
        Yield_<year>, ...`` and numeric measurement columns.

    Raises
    ------
    ValueError
        If ``years`` is empty or contains duplicates, or if the number of
        columns in ``df`` does not match the expected layout.
    """
    if years is None:
        years = [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2020]
    else:
        years = list(years)

    if not years:
        raise ValueError("years must contain at least one year")
    if len(set(years)) != len(years):
        # Duplicate years would produce duplicate column labels downstream.
        raise ValueError(f"years must be unique, got {years}")

    # Expected layout: County, then Area/Production/Yield for each year.
    metrics = ('Area', 'Production', 'Yield')
    column_names = ['County'] + [
        f'{metric}_{year}' for year in years for metric in metrics
    ]

    # Fail early with an explicit message instead of pandas' generic
    # "Length mismatch" error when the sheet does not match the layout.
    if df.shape[1] != len(column_names):
        raise ValueError(
            f"Expected {len(column_names)} columns (County + 3 per year for "
            f"{len(years)} years), got {df.shape[1]}"
        )

    # Drop the two header rows and attach the proper column names.
    data_df = df.iloc[2:].reset_index(drop=True).copy()
    data_df.columns = column_names

    # Non-numeric cells become NaN rather than raising.
    for col in column_names[1:]:
        data_df[col] = pd.to_numeric(data_df[col], errors='coerce')

    # Wide -> long: one frame per year, stacked in the order of `years`.
    long_names = {
        'Area': 'Area_HA',
        'Production': 'Production_MT',
        'Yield': 'Yield_MT_HA',
    }
    yearly_frames = []
    for year in years:
        renames = {f'{metric}_{year}': long_names[metric] for metric in metrics}
        yearly = data_df[['County', *renames]].rename(columns=renames)
        yearly_frames.append(yearly.assign(Year=year))

    clean_df = pd.concat(yearly_frames, ignore_index=True)

    # Keep a row as long as at least one of its measurements is present.
    clean_df = clean_df.dropna(
        subset=['Area_HA', 'Production_MT', 'Yield_MT_HA'], how='all'
    ).reset_index(drop=True)

    clean_df = clean_df[['County', 'Year', 'Area_HA', 'Production_MT', 'Yield_MT_HA']]

    return clean_df, data_df

# Apply the cleaning function to the raw frame (`maize_df` is presumably
# loaded in an earlier cell — it is not defined in this section).
clean_maize_df, wide_format_df = clean_and_reshape_maize_data(maize_df)

print("=" * 80)
print("CLEANED DATA PREVIEW (Long Format - Ready for Analysis)")
print("=" * 80)
print(f"Shape: {clean_maize_df.shape}")
print("\nFirst 10 rows:")
print(clean_maize_df.head(10))
print("\n" + "=" * 80)
print("DATA INFO:")
print("=" * 80)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
clean_maize_df.info()
print("\n" + "=" * 80)
print("STATISTICAL SUMMARY:")
print("=" * 80)
print(clean_maize_df.describe())

# Optional: save the cleaned long-format data, without the positional index
clean_maize_df.to_csv('maize_production_clean_2012_2020.csv', index=False)
print("\nCleaned data saved to 'maize_production_clean_2012_2020.csv'")

'''
ALTERNATIVE APPROACH: If you want to keep the wide format but with proper column names
'''

def clean_wide_format(df, years=None):
    """
    Clean the raw maize sheet but keep it in wide format with proper column names.

    The first two rows of ``df`` are discarded as header rows. The columns are
    interpreted positionally as ``County`` followed by one
    (Area_HA, Production_MT, Yield_MT_HA) triple per entry of ``years``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with exactly ``1 + 3 * len(years)`` columns.
    years : sequence of int, optional
        Years in the order their column triples appear. Defaults to
        2012-2018 plus 2020.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``County, Area_HA_<year>, Production_MT_<year>,
        Yield_MT_HA_<year>, ...``; every column except ``County`` is numeric,
        with unparseable cells set to NaN.

    Raises
    ------
    ValueError
        If ``years`` contains duplicates or the number of columns in ``df``
        does not match the expected layout.
    """
    if years is None:
        years = [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2020]
    else:
        years = list(years)

    if len(set(years)) != len(years):
        # Duplicate years would produce duplicate column labels.
        raise ValueError(f"years must be unique, got {years}")

    column_names = ['County']
    for year in years:
        column_names.extend([
            f'Area_HA_{year}',
            f'Production_MT_{year}',
            f'Yield_MT_HA_{year}',
        ])

    # Fail early with an explicit message instead of pandas' generic
    # "Length mismatch" error when the sheet does not match the layout.
    if df.shape[1] != len(column_names):
        raise ValueError(
            f"Expected {len(column_names)} columns (County + 3 per year for "
            f"{len(years)} years), got {df.shape[1]}"
        )

    # Drop the two header rows and attach the proper column names.
    clean_wide = df.iloc[2:].reset_index(drop=True).copy()
    clean_wide.columns = column_names

    # Non-numeric cells become NaN rather than raising.
    for col in column_names[1:]:
        clean_wide[col] = pd.to_numeric(clean_wide[col], errors='coerce')

    return clean_wide

# Build the wide-format variant from the same raw frame
clean_wide_df = clean_wide_format(maize_df)

# Banner (blank line, rule, title, rule) followed by the first rows
print("\n" + "=" * 80, "CLEANED WIDE FORMAT PREVIEW", "=" * 80, sep="\n")
print(clean_wide_df.head())

'''
ADDITIONAL DATA QUALITY CHECKS
'''

def check_data_quality(df):
    """
    Print a short data quality report for a cleaned dataframe.

    The report lists columns that contain missing values, the number of
    fully duplicated rows, and the rows whose ``Yield_MT_HA`` is below 0.1
    or above 10. The input frame is returned unchanged.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("DATA QUALITY CHECKS")
    print(rule)

    # Only columns with at least one missing value are shown
    null_counts = df.isnull().sum()
    print("Missing values per column:")
    print(null_counts[null_counts > 0])

    # Rows that are exact repeats of an earlier row
    n_duplicates = df.duplicated().sum()
    print(f"\nNumber of duplicate rows: {n_duplicates}")

    # Rows with a NaN yield match neither comparison, so they are not listed
    print("\nYield outliers (outside 0.1 to 10 MT/HA range):")
    yields = df['Yield_MT_HA']
    too_low = yields < 0.1
    too_high = yields > 10
    print(df[too_low | too_high])

    return df

# Run quality checks on the long format data. The function prints its report
# and returns the frame it was given; that return value is not used here.
check_data_quality(clean_maize_df)

'''
PREPARE DATA FOR MODELING
'''

def prepare_for_modeling(df):
    """
    Return a modeling table built from the cleaned data.

    The result is a copy of ``df`` with one-hot encoded ``County_*`` columns
    appended on the right; the original ``County`` column is kept. A short
    summary (shape and column list) is printed. ``df`` itself is not modified.
    """
    features = df.copy()

    # One-hot encode the county name and append the indicator columns
    county_dummies = pd.get_dummies(features['County'], prefix='County')
    features = pd.concat([features, county_dummies], axis=1)

    # Further features could be added here, for example:
    # - Year squared for non-linear relationships
    # - Lag features for time series analysis
    # - Interaction terms

    rule = "=" * 80
    print("\n" + rule)
    print("DATA READY FOR MODELING")
    print(rule)
    print(f"Final shape: {features.shape}")
    print(f"Columns: {list(features.columns)}")

    return features

# Prepare data for modeling: a copy of the long-format frame with one-hot
# County_* columns appended
modeling_df = prepare_for_modeling(clean_maize_df)

# Save modeling-ready data; index=False leaves the positional row index out
# of the CSV
modeling_df.to_csv('maize_data_modeling_ready.csv', index=False)
print("\nModeling-ready data saved to 'maize_data_modeling_ready.csv'")