# Feature Engineering
## Agriculture Crop Production Dataset

This notebook focuses on feature engineering, creating new features from existing data to improve model performance.

**Objectives:**
- Create temporal features
- Generate derived features
- Encode categorical variables
- Perform feature selection
- Prepare data for model training


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')
from utils.data_loader import load_data, preprocess_data
from utils.preprocessing import FeatureEncoder, prepare_model_features

# Set the global plotting style for every figure in this notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names exist in matplotlib >= 3.6;
# older releases call this style 'seaborn-darkgrid' -- confirm the pinned
# matplotlib version if this line raises.
plt.style.use('seaborn-v0_8-darkgrid')
# Use seaborn's "husl" palette as the default colour cycle.
sns.set_palette("husl")

# Reaching this line means all imports above succeeded.
print("Libraries imported successfully!")


## 1. Load and Preprocess Data


In [None]:
# Load the raw dataset through the project helper, called with its default
# arguments (the data location is decided inside utils.data_loader).
df = load_data()
print(f"Original data shape: {df.shape}")

# Run the project's preprocessing helper. The original author's note says it
# handles missing values and outliers; that logic lives in utils.data_loader
# and is not visible here -- confirm there before relying on it.
df_processed = preprocess_data(df)
print(f"Processed data shape: {df_processed.shape}")

# Last expression of the cell: the notebook renders the first rows as a table.
df_processed.head()


## 2. Temporal Features


In [None]:
# Create temporal features from the 'Year' column.
#
# Adds three columns to df_processed (in this order):
#   - Year_Squared:       Year ** 2, lets a model fit non-linear trends
#   - Year_Normalized:    Year rescaled to the 0-1 range
#   - Years_Since_Start:  Year minus the earliest year in the data
# and then plots them. Nothing is added when 'Year' is absent.
if 'Year' in df_processed.columns:
    year = df_processed['Year']
    min_year = year.min()
    max_year = year.max()
    year_span = max_year - min_year
    # Computed once and reused for both the normalised and the offset feature.
    years_since_start = year - min_year

    # Year squared (captures non-linear trends)
    df_processed['Year_Squared'] = year ** 2

    # Year normalized (0-1 scale)
    if year_span == 0:
        # Every row has the same year: the plain formula would be 0 / 0 and
        # turn the whole column into NaN. Multiplying by 0.0 yields 0.0 for
        # valid rows while rows with a missing Year stay NaN.
        df_processed['Year_Normalized'] = years_since_start * 0.0
    else:
        df_processed['Year_Normalized'] = years_since_start / year_span

    # Years since start
    df_processed['Years_Since_Start'] = years_since_start

    print("Temporal features created:")
    print(f"  - Year_Squared: {df_processed['Year_Squared'].min():.0f} to {df_processed['Year_Squared'].max():.0f}")
    print(f"  - Year_Normalized: {df_processed['Year_Normalized'].min():.2f} to {df_processed['Year_Normalized'].max():.2f}")
    print(f"  - Years_Since_Start: {df_processed['Years_Since_Start'].min():.0f} to {df_processed['Years_Since_Start'].max():.0f}")

    # Visualize temporal features: one panel per new column
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    axes[0].scatter(df_processed['Year'], df_processed['Year_Squared'], alpha=0.6)
    axes[0].set_title('Year vs Year Squared', fontweight='bold')
    axes[0].set_xlabel('Year')
    axes[0].set_ylabel('Year Squared')
    axes[0].grid(True, alpha=0.3)

    axes[1].hist(df_processed['Year_Normalized'], bins=20, edgecolor='black')
    axes[1].set_title('Year Normalized Distribution', fontweight='bold')
    axes[1].set_xlabel('Year Normalized')
    axes[1].set_ylabel('Frequency')
    axes[1].grid(True, alpha=0.3)

    axes[2].hist(df_processed['Years_Since_Start'], bins=20, edgecolor='black', color='green')
    axes[2].set_title('Years Since Start Distribution', fontweight='bold')
    axes[2].set_xlabel('Years Since Start')
    axes[2].set_ylabel('Frequency')
    axes[2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("⚠️ 'Year' column not found")


## 3. Derived Features (Cost and Production Related)


In [None]:
# Create derived cost/production ratio features.
#
# Adds four columns to df_processed when 'Cost', 'Production' and 'Quantity'
# are all present, prints their value ranges and plots their distributions.
required_cols = ['Cost', 'Production', 'Quantity']
if all(col in df_processed.columns for col in required_cols):
    # Small constant added to every denominator: a zero denominator then
    # produces a very large finite value instead of a division by zero.
    EPS = 1e-6

    # Cost per unit production
    df_processed['Cost_per_Unit'] = df_processed['Cost'] / (df_processed['Production'] + EPS)

    # Production per cost (efficiency metric)
    df_processed['Production_per_Cost'] = df_processed['Production'] / (df_processed['Cost'] + EPS)

    # NOTE: despite its name this is Cost / Quantity -- no area column is used.
    # 'Quantity' is also the modelling target chosen in Section 5, so this
    # column is computed from the target and must not be used as a predictor
    # of Quantity. The column name is kept because later cells refer to it.
    df_processed['Cost_per_Hectare'] = df_processed['Cost'] / (df_processed['Quantity'] + EPS)

    # Profitability indicator (higher production with lower cost):
    # max-scaled production divided by max-scaled cost.
    relative_production = df_processed['Production'] / df_processed['Production'].max()
    relative_cost = df_processed['Cost'] / df_processed['Cost'].max()
    df_processed['Profitability_Indicator'] = relative_production / (relative_cost + EPS)

    # Report the range of every feature created above, including
    # Profitability_Indicator.
    print("Derived features created:")
    for feature_name in ['Cost_per_Unit', 'Production_per_Cost',
                         'Cost_per_Hectare', 'Profitability_Indicator']:
        feature_values = df_processed[feature_name]
        print(f"  - {feature_name}: {feature_values.min():.2f} to {feature_values.max():.2f}")

    # Visualize derived features
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    df_processed['Cost_per_Unit'].hist(bins=50, ax=axes[0, 0], edgecolor='black')
    axes[0, 0].set_title('Cost per Unit Distribution', fontweight='bold')
    axes[0, 0].set_xlabel('Cost per Unit')
    axes[0, 0].set_ylabel('Frequency')

    df_processed['Production_per_Cost'].hist(bins=50, ax=axes[0, 1], edgecolor='black', color='green')
    axes[0, 1].set_title('Production per Cost Distribution', fontweight='bold')
    axes[0, 1].set_xlabel('Production per Cost')
    axes[0, 1].set_ylabel('Frequency')

    axes[1, 0].scatter(df_processed['Cost'], df_processed['Production'], alpha=0.5)
    axes[1, 0].set_title('Cost vs Production', fontweight='bold')
    axes[1, 0].set_xlabel('Cost')
    axes[1, 0].set_ylabel('Production')
    axes[1, 0].grid(True, alpha=0.3)

    df_processed['Profitability_Indicator'].hist(bins=50, ax=axes[1, 1], edgecolor='black', color='orange')
    axes[1, 1].set_title('Profitability Indicator Distribution', fontweight='bold')
    axes[1, 1].set_xlabel('Profitability Indicator')
    axes[1, 1].set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()
else:
    print("⚠️ Required columns (Cost, Production, Quantity) not found")


## 4. Categorical Encoding


In [None]:
# Encode categorical variables with the project's FeatureEncoder.
#
# Every candidate column that is present gets encoded. A missing column no
# longer disables encoding for the others; it is reported and skipped.
candidate_cols = ['Crop', 'State', 'Season']
categorical_cols = [col for col in candidate_cols if col in df_processed.columns]
missing_cols = [col for col in candidate_cols if col not in df_processed.columns]

if categorical_cols:
    if missing_cols:
        print(f"⚠️ Categorical columns not found and skipped: {missing_cols}")

    # Create FeatureEncoder
    encoder = FeatureEncoder()

    # Fit encoder (requires categorical_cols parameter)
    encoder.fit(df_processed, categorical_cols)

    # Transform a copy so df_processed keeps the raw labels until the final
    # reassignment at the end of this cell.
    df_encoded = encoder.transform(df_processed.copy(), categorical_cols)

    # Cardinality comes from the raw labels, the value range from the encoded
    # column of the same name.
    print("Categorical encoding completed:")
    for col in categorical_cols:
        print(f"  - {col}: {df_processed[col].nunique()} unique values")
        print(f"    Encoded values: {df_encoded[col].min()} to {df_encoded[col].max()}")

    # Show encoding mapping
    # NOTE(review): this assumes each fitted encoder assigns codes in the
    # order of `classes_` -- confirm in utils.preprocessing.
    print("\nEncoding mappings (sample):")
    for col in categorical_cols[:2]:  # Show first 2 columns
        if col in encoder.label_encoders:
            le = encoder.label_encoders[col]
            print(f"\n{col} encoding (first 5):")
            for i, label in enumerate(le.classes_[:5]):
                print(f"  {label} -> {i}")

    # Visualize encoded distributions. squeeze=False always returns a 2-D
    # array of axes, so a single column needs no special handling.
    fig, axes = plt.subplots(1, len(categorical_cols), figsize=(15, 5), squeeze=False)
    axes = axes[0]

    for idx, col in enumerate(categorical_cols):
        if col in df_encoded.columns:
            # Only the 20 lowest codes are drawn to keep the chart readable.
            df_encoded[col].value_counts().sort_index().head(20).plot(kind='bar', ax=axes[idx])
            axes[idx].set_title(f'{col} Encoded Distribution', fontweight='bold')
            axes[idx].set_xlabel(f'{col} (Encoded)')
            axes[idx].set_ylabel('Frequency')
            axes[idx].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    # From here on the notebook works with the encoded frame.
    df_processed = df_encoded
else:
    print("⚠️ Categorical columns not found")


## 5. Feature Selection and Importance


In [None]:
# Rank candidate features by Random Forest importance.
#
# Picks the target column, collects the numeric predictor columns (excluding
# anything computed from the target), fits a RandomForestRegressor on the full
# data and reports/plots the resulting importances.
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor

# Prepare features and target: 'Quantity' when present, otherwise the last
# numeric column.
target_col = 'Quantity' if 'Quantity' in df_processed.columns else df_processed.select_dtypes(include=[np.number]).columns[-1]

# Engineered columns computed directly from a target column. Section 3 builds
# Cost_per_Hectare as Cost / Quantity, so together with Cost it reveals
# Quantity exactly. The name-based 'Quantity' filter below cannot catch it, so
# it is listed explicitly.
TARGET_DERIVED_FEATURES = {'Quantity': ['Cost_per_Hectare']}
leaky_cols = set(TARGET_DERIVED_FEATURES.get(target_col, []))

# Select numerical features, dropping the target, any column whose name
# contains 'Quantity', and the known target-derived columns.
feature_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [
    col for col in feature_cols
    if col != target_col and 'Quantity' not in col and col not in leaky_cols
]

excluded_leaky = sorted(leaky_cols & set(df_processed.columns))
if excluded_leaky:
    print(f"Excluded target-derived features (leakage): {excluded_leaky}")

# NOTE(review): missing values, including missing targets, are replaced by 0
# here; confirm that is intended rather than dropping those rows.
X = df_processed[feature_cols].fillna(0)
y = df_processed[target_col].fillna(0)

print(f"Target variable: {target_col}")
print(f"Number of features: {len(feature_cols)}")

# Calculate feature importance using Random Forest (fixed seed for
# reproducible rankings)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# Get feature importance, highest first
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10).to_string(index=False))

# Visualize feature importance
plt.figure(figsize=(12, 8))
top_features = feature_importance.head(15)
sns.barplot(data=top_features, y='Feature', x='Importance', palette='viridis')
plt.title('Top 15 Feature Importance (Random Forest)', fontsize=16, fontweight='bold')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()


## 6. Feature Correlation Analysis


In [None]:
# Correlation analysis over every numeric column of df_processed.
numerical_features = df_processed.select_dtypes(include=[np.number]).columns.tolist()
correlation_matrix = df_processed[numerical_features].corr()

# Heatmap of the full correlation matrix (cell values are not annotated).
plt.figure(figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    annot=False,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Collect strongly correlated pairs (potential redundancy). Only the upper
# triangle is visited, so each unordered pair is reported once and the
# diagonal is skipped.
corr_columns = correlation_matrix.columns
n_columns = len(corr_columns)
high_corr_pairs = [
    {
        'Feature 1': corr_columns[row],
        'Feature 2': corr_columns[col],
        'Correlation': correlation_matrix.iloc[row, col],
    }
    for row in range(n_columns)
    for col in range(row + 1, n_columns)
    if abs(correlation_matrix.iloc[row, col]) > 0.8  # High correlation threshold
]

if not high_corr_pairs:
    print("\n✅ No highly correlated feature pairs found (|r| > 0.8)")
else:
    print("\nHighly Correlated Feature Pairs (|r| > 0.8):")
    print(pd.DataFrame(high_corr_pairs).to_string(index=False))


## 7. Final Feature Set Preparation


In [None]:
# Build the modelling matrices: pick the final columns, split into train and
# test sets, then standardise.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Columns kept out of the final feature set: the target plus the columns the
# original author treats as identifiers.
exclude_cols = [
    target_col,
    'Year',
    'Season_Duration',
    'Recommended_Zone',
    'Variety',
    'Unit',
]
final_features = [name for name in feature_cols if name not in exclude_cols]

# Missing values in both predictors and target are replaced by 0.
X_final = df_processed[final_features].fillna(0)
y_final = df_processed[target_col].fillna(0)

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=42,
)

print("Final Feature Set:")
print(f"  Total features: {len(final_features)}")
print(f"  Training samples: {len(X_train)}")
print(f"  Test samples: {len(X_test)}")
print(f"\nFeatures included:")
for position, feature_name in enumerate(final_features, start=1):
    print(f"  {position}. {feature_name}")

# Optional: scale features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n✅ Features scaled and ready for modeling!")


## 8. Summary and Next Steps

### Features Created:
1. **Temporal Features:**
   - Year_Squared
   - Year_Normalized
   - Years_Since_Start

2. **Derived Features:**
   - Cost_per_Unit
   - Production_per_Cost
   - Cost_per_Hectare
   - Profitability_Indicator

3. **Encoded Features** (label-encoded by `FeatureEncoder`; Section 4 reads the encoded values from the original column names):
   - Crop
   - State
   - Season

### Key Insights:
- [Add insights from feature importance analysis]
- [Add observations about feature correlations]
- [Add recommendations for feature selection]

### Next Steps:
- Proceed to model training (Week 3)
- Use prepared features for Random Forest and XGBoost
- Consider feature selection based on importance scores
