# AgroGraphNet: Data Collection and Initial Exploration

This notebook handles data acquisition and initial exploration for the AgroGraphNet project.

## Objectives:
1. Set up the data directory structure
2. Create sample datasets (or load real data)
3. Perform initial data exploration
4. Validate data quality and completeness

In [None]:
# Import required libraries
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from config import *
from data_utils import *
from visualization import *

# Seed NumPy's global random generator so repeated runs draw the same numbers
np.random.seed(RANDOM_SEED)

# Confirm the imports worked and show the resolved project paths
startup_report = "\n".join([
    "Libraries imported successfully!",
    f"Project root: {PROJECT_ROOT}",
    f"Data directory: {DATA_DIR}",
])
print(startup_report)

## 1. Data Directory Setup

First, let's ensure all necessary directories exist and check the current data structure.

In [None]:
# Report which project directories are present and create the missing ones.
# Keys are display labels; values are the path objects to check.
directories = {
    'Raw Data': RAW_DATA_DIR,
    'Satellite': SATELLITE_DIR,
    'Weather': WEATHER_DIR,
    'Disease Labels': DISEASE_LABELS_DIR,
    'Farm Locations': FARM_LOCATIONS_DIR,
    'Processed': PROCESSED_DATA_DIR,
    'Graphs': GRAPHS_DIR,
    'Labels': LABELS_DIR,
    'Models': MODELS_DIR,
    'Results': RESULTS_DIR
}

print("Directory Structure:", "=" * 50, sep="\n")
for name, path in directories.items():
    already_present = path.exists()
    marker = "✓" if already_present else "✗"
    print(f"{marker} {name}: {path}")

    if already_present:
        continue

    # Missing directory: create it together with any missing parents
    path.mkdir(parents=True, exist_ok=True)
    print(f"  → Created directory: {path}")

## 2. Data Collection Options

You have two options for data:

### Option A: Use Real Data
If you have real datasets, place them in the following locations:
- **Satellite imagery**: `data/raw/satellite/` (GeoTIFF files)
- **Weather data**: `data/raw/weather/weather_data.csv`
- **Disease labels**: `data/raw/disease_labels/disease_data.csv`
- **Farm locations**: `data/raw/farm_locations/farm_locations.csv`

### Option B: Generate Sample Data
If any of the three CSV files listed above is missing, the next cell generates realistic sample data for demonstration purposes.

In [None]:
# Decide between real and generated data: the real datasets are used only
# when every expected CSV file is present.
real_data_files = {
    'Weather': WEATHER_DIR / 'weather_data.csv',
    'Disease': DISEASE_LABELS_DIR / 'disease_data.csv',
    'Farms': FARM_LOCATIONS_DIR / 'farm_locations.csv'
}

# Probe each file once and reuse the result for both the report and the decision
file_found = {name: file_path.exists() for name, file_path in real_data_files.items()}
real_data_exists = all(file_found.values())

print("Real Data Check:", "=" * 30, sep="\n")
for name, file_path in real_data_files.items():
    marker = "✓" if file_found[name] else "✗"
    print(f"{marker} {name}: {file_path.name}")

if real_data_exists:
    print("\n✅ Real data found. Using existing datasets.")
else:
    print("\n🔄 Real data not found. Generating sample data...")

    # Generate sample data under the raw data directory
    create_sample_data(
        output_dir=str(RAW_DATA_DIR),
        num_farms=100,
        num_time_steps=12
    )

    print("✅ Sample data generated successfully!")

## 3. Load and Explore Data

Now let's load the datasets and perform initial exploration.

In [None]:
# Load datasets.
# Each directory is scanned for CSV files and the alphabetically first one is
# loaded. Path.glob() yields entries in arbitrary order, so the candidates are
# sorted to make the choice repeatable when a directory holds several CSVs.
print("Loading datasets...")

# Farm locations
farm_files = sorted(FARM_LOCATIONS_DIR.glob('*.csv'))
if farm_files:
    if len(farm_files) > 1:
        print(f"⚠️ {len(farm_files)} farm location CSV files found; using {farm_files[0].name}")
    farms_df = pd.read_csv(farm_files[0])
    print(f"✅ Loaded farm locations: {len(farms_df)} farms")
else:
    print("❌ No farm location files found")

# Weather data
weather_files = sorted(WEATHER_DIR.glob('*.csv'))
if weather_files:
    if len(weather_files) > 1:
        print(f"⚠️ {len(weather_files)} weather CSV files found; using {weather_files[0].name}")
    weather_df = load_weather_data(str(weather_files[0]))
    print(f"✅ Loaded weather data: {len(weather_df)} records")
else:
    print("❌ No weather files found")

# Disease data
disease_files = sorted(DISEASE_LABELS_DIR.glob('*.csv'))
if disease_files:
    if len(disease_files) > 1:
        print(f"⚠️ {len(disease_files)} disease CSV files found; using {disease_files[0].name}")
    disease_df = load_disease_labels(str(disease_files[0]))
    print(f"✅ Loaded disease data: {len(disease_df)} records")
else:
    print("❌ No disease files found")

In [None]:
# Show shape, column names and a five-row preview for every dataset that was loaded
print("Dataset Overview:", "=" * 50, sep="\n")

if 'farms_df' in locals():
    print(
        "\n📍 FARM LOCATIONS:",
        f"Shape: {farms_df.shape}",
        f"Columns: {list(farms_df.columns)}",
        "\nFirst 5 rows:",
        sep="\n",
    )
    display(farms_df.head())

    crop_counts = farms_df['crop_type'].value_counts()
    print("\nCrop type distribution:")
    print(crop_counts)

if 'weather_df' in locals():
    print(
        "\n🌤️ WEATHER DATA:",
        f"Shape: {weather_df.shape}",
        f"Columns: {list(weather_df.columns)}",
        sep="\n",
    )
    print(f"Date range: {weather_df['date'].min()} to {weather_df['date'].max()}")
    print("\nFirst 5 rows:")
    display(weather_df.head())

if 'disease_df' in locals():
    print(
        "\n🦠 DISEASE DATA:",
        f"Shape: {disease_df.shape}",
        f"Columns: {list(disease_df.columns)}",
        sep="\n",
    )
    print(f"Date range: {disease_df['date'].min()} to {disease_df['date'].max()}")
    print("\nDisease type distribution:")
    print(disease_df['disease_type'].value_counts())
    print("\nFirst 5 rows:")
    display(disease_df.head())

## 4. Data Quality Assessment

In [None]:
# Report missing values and fully duplicated rows for each loaded dataset
print("Data Quality Assessment:", "=" * 40, sep="\n")

# A dataset that was never loaded is represented by None and skipped below
datasets = {
    'Farms': farms_df if 'farms_df' in locals() else None,
    'Weather': weather_df if 'weather_df' in locals() else None,
    'Disease': disease_df if 'disease_df' in locals() else None
}

for name, df in datasets.items():
    if df is None:
        continue

    print(f"\n{name} Dataset:")
    print(f"  Total records: {len(df)}")
    print("  Missing values:")

    # Per-column null counts; only columns with at least one null are listed
    missing = df.isnull().sum()
    for col, count in missing[missing > 0].items():
        share = count / len(df) * 100
        print(f"    {col}: {count} ({share:.1f}%)")

    if missing.sum() == 0:
        print("    ✅ No missing values")

    # Rows that are exact copies of an earlier row
    duplicates = df.duplicated().sum()
    if duplicates <= 0:
        print("  ✅ No duplicate records")
    else:
        print(f"  ⚠️ Duplicate records: {duplicates}")

## 5. Exploratory Data Analysis

In [None]:
# Build a 2x2 exploration figure; each panel is drawn only if its dataset was loaded
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
ax_locations, ax_area = axes[0, 0], axes[0, 1]
ax_temperature, ax_disease = axes[1, 0], axes[1, 1]

if 'farms_df' in locals():
    # Farm positions, coloured by an integer code per crop type
    crop_codes = farms_df['crop_type'].astype('category').cat.codes
    ax_locations.scatter(farms_df['lon'], farms_df['lat'],
                         c=crop_codes, cmap='Set1', alpha=0.7)
    ax_locations.set(title='Farm Locations', xlabel='Longitude', ylabel='Latitude')

    # Histogram of farm areas
    ax_area.hist(farms_df['area_hectares'], bins=20, alpha=0.7, color='skyblue')
    ax_area.set(title='Farm Area Distribution', xlabel='Area (hectares)', ylabel='Frequency')

if 'weather_df' in locals():
    # Mean temperature per calendar month
    month_key = weather_df['date'].dt.to_period('M')
    monthly_temp = weather_df.groupby(month_key)['temperature'].mean()
    ax_temperature.plot(monthly_temp.index.astype(str), monthly_temp.values, 'o-', color='red')
    ax_temperature.set(title='Average Temperature Over Time',
                       xlabel='Month', ylabel='Temperature (°C)')
    ax_temperature.tick_params(axis='x', rotation=45)

if 'disease_df' in locals():
    # Share of records per disease type
    disease_counts = disease_df['disease_type'].value_counts()
    ax_disease.pie(disease_counts.values, labels=disease_counts.index, autopct='%1.1f%%')
    ax_disease.set_title('Disease Type Distribution')

plt.tight_layout()
plt.savefig(RESULTS_DIR / '01_data_exploration.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Create interactive map of farm locations.
# Runs only when both the farm and disease datasets were loaded.
if 'farms_df' in locals() and 'disease_df' in locals():
    print("Creating interactive map of farm locations...")

    # Get latest disease data for each farm. Rows without a date are dropped
    # first so idxmax never produces a missing label for .loc to look up
    # (which would happen for a farm whose dates are all missing).
    dated_disease = disease_df.dropna(subset=['date'])
    latest_disease = dated_disease.loc[dated_disease.groupby('farm_id')['date'].idxmax()]

    # Create map
    map_path = RESULTS_DIR / '01_farm_locations_map.html'
    farm_map = plot_farm_locations(
        farms_df,
        latest_disease,
        save_path=str(map_path)
    )

    print(f"✅ Interactive map saved to: {map_path}")
    print("Open this file in a web browser to view the interactive map.")

    # Display map in notebook (if running in Jupyter). Only ordinary errors are
    # caught, so an interrupt still stops the cell, and the reason is reported
    # instead of being silently discarded.
    try:
        display(farm_map)
    except Exception as exc:
        print("Map created but cannot display inline. Check the saved HTML file.")
        print(f"  Reason: {type(exc).__name__}: {exc}")

## 6. Statistical Summary

In [None]:
# Print headline statistics for each dataset that was loaded
print("Statistical Summary:", "=" * 50, sep="\n")

if 'farms_df' in locals():
    print("\n📊 FARM STATISTICS:")
    print(f"Total farms: {len(farms_df)}")
    print(f"Crop types: {farms_df['crop_type'].nunique()}")

    print("Geographic extent:")
    farm_lat = farms_df['lat']
    print(f"  Latitude: {farm_lat.min():.3f} to {farm_lat.max():.3f}")
    farm_lon = farms_df['lon']
    print(f"  Longitude: {farm_lon.min():.3f} to {farm_lon.max():.3f}")

    print("Farm area statistics:")
    farm_area = farms_df['area_hectares']
    print(f"  Mean: {farm_area.mean():.1f} hectares")
    print(f"  Median: {farm_area.median():.1f} hectares")
    print(f"  Range: {farm_area.min():.1f} - {farm_area.max():.1f} hectares")

if 'weather_df' in locals():
    print("\n🌡️ WEATHER STATISTICS:")
    print(f"Total weather records: {len(weather_df)}")

    first_day = weather_df['date'].min().strftime('%Y-%m-%d')
    last_day = weather_df['date'].max().strftime('%Y-%m-%d')
    print(f"Time period: {first_day} to {last_day}")

    temperature = weather_df['temperature']
    print(f"Temperature range: {temperature.min():.1f}°C to {temperature.max():.1f}°C")
    print(f"Average humidity: {weather_df['humidity'].mean():.1f}%")
    precipitation = weather_df['precipitation']
    print(f"Total precipitation range: {precipitation.min():.1f} to {precipitation.max():.1f} mm")

if 'disease_df' in locals():
    print("\n🦠 DISEASE STATISTICS:")
    print(f"Total disease records: {len(disease_df)}")
    print(f"Unique farms affected: {disease_df['farm_id'].nunique()}")
    print(f"Disease types: {disease_df['disease_type'].nunique()}")

    # Record counts per (month, disease type), months as rows and types as columns
    record_month = disease_df['date'].dt.to_period('M')
    disease_by_month = (
        disease_df
        .groupby([record_month, 'disease_type'])
        .size()
        .unstack(fill_value=0)
    )
    print("\nDisease prevalence by month:")
    display(disease_by_month)

    # Severity figures are reported only over records not labelled 'Healthy'
    diseased = disease_df[disease_df['disease_type'] != 'Healthy']
    if len(diseased):
        severity = diseased['severity']
        print("\nDisease severity statistics (non-healthy cases):")
        print(f"  Mean severity: {severity.mean():.3f}")
        print(f"  Median severity: {severity.median():.3f}")
        print(f"  Severity range: {severity.min():.3f} - {severity.max():.3f}")

## 7. Data Validation and Next Steps

In [None]:
# Validate data consistency and relationships.
# Sets `validation_passed` to False as soon as any check fails or cannot run.
print("Data Validation:")
print("=" * 30)

validation_passed = True

# A dataset that was never loaded means its checks are skipped; count that as a
# failure so the validation cannot pass without having checked anything.
loaded_datasets = {
    'Farm location': 'farms_df' in locals(),
    'Weather': 'weather_df' in locals(),
    'Disease': 'disease_df' in locals(),
}
for dataset_label, is_loaded in loaded_datasets.items():
    if not is_loaded:
        print(f"⚠️ Warning: {dataset_label} data is not loaded; related checks were skipped")
        validation_passed = False

if 'farms_df' in locals() and 'disease_df' in locals():
    # Check if all farms in disease data exist in farm locations
    farm_ids_in_disease = set(disease_df['farm_id'].unique())
    farm_ids_in_locations = set(farms_df['farm_id'].unique())

    missing_farms = farm_ids_in_disease - farm_ids_in_locations
    if missing_farms:
        print(f"⚠️ Warning: {len(missing_farms)} farms in disease data not found in farm locations")
        validation_passed = False
    else:
        print("✅ All farms in disease data have corresponding location data")

if 'weather_df' in locals() and 'disease_df' in locals():
    # Check temporal alignment. Missing dates are dropped first so that a
    # missing value present in both datasets is not counted as a shared date.
    weather_dates = set(weather_df['date'].dropna().dt.date)
    disease_dates = set(disease_df['date'].dropna().dt.date)

    common_dates = weather_dates & disease_dates
    if common_dates:
        print(f"✅ Common dates between weather and disease data: {len(common_dates)}")
    else:
        print("⚠️ Warning: No overlapping dates between weather and disease data")
        validation_passed = False

# Check coordinate validity
if 'farms_df' in locals():
    # between() is False for missing values, so a farm with a missing latitude
    # or longitude is reported as invalid instead of slipping through.
    lat_ok = farms_df['lat'].between(-90, 90)
    lon_ok = farms_df['lon'].between(-180, 180)
    invalid_coords = farms_df[~(lat_ok & lon_ok)]

    if len(invalid_coords) > 0:
        print(f"⚠️ Warning: {len(invalid_coords)} farms have invalid coordinates")
        validation_passed = False
    else:
        print("✅ All farm coordinates are valid")

print(f"\nOverall validation: {'✅ PASSED' if validation_passed else '❌ FAILED'}")

if validation_passed:
    print("\n🎉 Data collection and validation completed successfully!")
    print("\nNext steps:")
    print("1. Run notebook 02_data_preprocessing.ipynb")
    print("2. Process satellite imagery and environmental data")
    print("3. Create temporal features and handle missing values")
else:
    print("\n⚠️ Please address the validation issues before proceeding.")

## Summary

This notebook has completed the following tasks:

1. ✅ Set up the project directory structure
2. ✅ Created/loaded datasets (farm locations, weather, disease data)
3. ✅ Performed initial data exploration and quality assessment
4. ✅ Generated visualizations and statistical summaries
5. ✅ Validated data consistency and relationships
6. ✅ Created interactive map of farm locations

The data is now ready for preprocessing in the next notebook. If you're using real satellite imagery, make sure to place your GeoTIFF files in the `data/raw/satellite/` directory before proceeding to the next notebook.