In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import ExtraTreesRegressor, BaggingRegressor
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data
train = pd.read_csv('/kaggle/input/innovative-ai-challenge-2024/train.csv')
test = pd.read_csv('/kaggle/input/innovative-ai-challenge-2024/test.csv')
sub = pd.read_csv('/kaggle/input/innovative-ai-challenge-2024/sample_submission.csv')


In [3]:
# Drop unnecessary columns
train.drop(columns=['State'], inplace=True)
test.drop(columns=['State'], inplace=True)

In [4]:
# Handle categorical features
categorical_features = ['Crop_Type', 'Soil_Type']
encoders = {}

for feature in categorical_features:
    encoders[feature] = LabelEncoder()
    train[feature] = encoders[feature].fit_transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])


In [5]:
def create_cyclical_features(df):
    """Create enhanced cyclical features for temporal data"""
    df = df.copy()
    
    # Normalize year
    year_norm = (df['Year'] - df['Year'].min()) / (df['Year'].max() - df['Year'].min())
    
    # Enhanced cyclical features
    df['year_sin'] = np.sin(2 * np.pi * year_norm)
    df['year_cos'] = np.cos(2 * np.pi * year_norm)
    df['year_sin2'] = np.sin(4 * np.pi * year_norm)
    df['year_sin_half'] = np.sin(np.pi * year_norm)
    df['year_trend'] = year_norm  # Linear trend
    df['year_quadratic'] = year_norm ** 2  # Quadratic trend
    df['year_combined'] = df['year_sin'] * df['year_cos']
    
    return df

In [6]:
# Create features
train = create_cyclical_features(train)
test = create_cyclical_features(test)

# Prepare features and target
X = train.drop(columns=['id', 'Crop_Yield (kg/ha)'])
y = train['Crop_Yield (kg/ha)']

In [7]:
# Scale features
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index
)
test_scaled = pd.DataFrame(
    scaler.transform(test.set_index('id')),
    columns=X.columns
)


In [8]:
# Initialize cross-validation
loo = LeaveOneOut()
test_preds = []
mse_scores = []
r2_scores = []

In [9]:
# Create optimized base model
base_model = ExtraTreesRegressor(
    n_estimators=100,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
)

In [10]:
# Create ensemble with bagging
model = BaggingRegressor(
    estimator=base_model,
    n_estimators=50,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

In [11]:
# Perform Leave-One-Out cross validation
print("Starting training with Leave-One-Out cross validation...")
for fold, (train_idx, val_idx) in enumerate(loo.split(X_scaled), 1):
    # Split data
    X_train, X_val = X_scaled.iloc[train_idx], X_scaled.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Train model
    model.fit(X_train, y_train)
    
    # Make predictions
    val_pred = model.predict(X_val)
    test_pred = model.predict(test_scaled)
    
    # Calculate metrics
    mse = mean_squared_error(y_val, val_pred)
    r2 = r2_score(y_val, val_pred)
    
    mse_scores.append(mse)
    r2_scores.append(r2)
    test_preds.append(test_pred)
    
    # Print progress every 10 folds
    if fold % 10 == 0:
        print(f"Fold {fold} - MSE: {mse:.4f}, R2: {r2:.4f}")



Starting training with Leave-One-Out cross validation...
Fold 10 - MSE: 30711.9515, R2: nan
Fold 20 - MSE: 709313.1539, R2: nan
Fold 30 - MSE: 4299.1751, R2: nan
Fold 40 - MSE: 111498.5352, R2: nan
Fold 50 - MSE: 33627.1566, R2: nan


In [12]:
# Calculate and print final metrics
print("\nFinal Metrics:")
print(f"Mean MSE: {np.mean(mse_scores):.4f}")
print(f"Std MSE: {np.std(mse_scores):.4f}")
print(f"Mean R2: {np.mean(r2_scores):.4f}")
print(f"Std R2: {np.std(r2_scores):.4f}")



Final Metrics:
Mean MSE: 180028.6447
Std MSE: 292747.1095
Mean R2: nan
Std R2: nan


In [13]:
# Generate final predictions
final_predictions = np.mean(test_preds, axis=0)

# Create submission file
sub['Target'] = final_predictions
print("\nSample predictions:")
print(sub.head())



Sample predictions:
     id       Target
0  1001  4195.870388
1  1002  4461.320212
2  1003  3914.554071
3  1004  3904.531255
4  1005  3966.157236


In [14]:
# Save submission
sub.to_csv('submission.csv', index=False)
print("\nSubmission saved to 'submission.csv'")


Submission saved to 'submission.csv'
