In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

##  Load data

In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print('Training data-shape:', train_df.shape)
print('Test data_shape:', test_df.shape)
print('\nTraining data coloums:', train_df.columns.tolist())

Training data-shape: (15289, 18)
Test data_shape: (10194, 17)

Training data coloums: ['id', 'clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia', 'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange', 'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange', 'RainingDays', 'AverageRainingDays', 'fruitset', 'fruitmass', 'seeds', 'yield']


In [6]:
def explore_data(df, name = 'Dataset'):
    print(f'\n--- {name} Overview ---')
    print(f'Shape: {df.shape}')
    print('\nData Types:')
    print(df.dtypes)
    print('\nMissing Values:')
    print(df.isnull().sum())
    print('\nBasic Stats:')
    print(df.describe())

    # Check for Categorical Variables

    categorical_cols = df.select_dtypes(include = ['object']).columns
    if len(categorical_cols) > 0:
        print(f'\nCategorical columns: {list(categorical_cols)}')
        for col in categorical_cols:
            print(f'{col} unique values: {df[col].unique()}')

explore_data(train_df, 'Training Data')
explore_data(test_df,   'Test Data')


--- Training Data Overview ---
Shape: (15289, 18)

Data Types:
id                        int64
clonesize               float64
honeybee                float64
bumbles                 float64
andrena                 float64
osmia                   float64
MaxOfUpperTRange        float64
MinOfUpperTRange        float64
AverageOfUpperTRange    float64
MaxOfLowerTRange        float64
MinOfLowerTRange        float64
AverageOfLowerTRange    float64
RainingDays             float64
AverageRainingDays      float64
fruitset                float64
fruitmass               float64
seeds                   float64
yield                   float64
dtype: object

Missing Values:
id                      0
clonesize               0
honeybee                0
bumbles                 0
andrena                 0
osmia                   0
MaxOfUpperTRange        0
MinOfUpperTRange        0
AverageOfUpperTRange    0
MaxOfLowerTRange        0
MinOfLowerTRange        0
AverageOfLowerTRange    0
RainingDays      

In [7]:
# Identify Target Variable 

if 'yield' in train_df.columns:
    target_col = 'yield'

elif 'target' in train_df.columns:
    target_col = 'target'

else: target_col = train_df.columns[-1]

print(f'\nTarget variable: {target_col}')


Target variable: yield


## Data Preparation Pipeline

In [9]:
# Separate features & target

X = train_df.drop(columns=[target_col])
y = train_df[target_col]

# Identify numerical and categorical columns

numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print('Numeric features:', numeric_features)
print('Categorical features:', categorical_features)

Numeric features: ['id', 'clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia', 'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange', 'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange', 'RainingDays', 'AverageRainingDays', 'fruitset', 'fruitmass', 'seeds']
Categorical features: []


In [12]:
# Create preprocessing piplines

numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy = 'median')),
                                     ('scaler', StandardScaler())
                                     ])

categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy = 'most_frequent')),
                                         ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False))
                                         ])

In [14]:
# Combining preprocessing steps

preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [15]:
# Split training data for validation

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

## Model Training and Evaluation

In [18]:
# Defining models to try

models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state = 42),
    'Lasso': Lasso(random_state = 42),
    'ElasticNet': ElasticNet(random_state = 42),
    'Random Forest': RandomForestRegressor(random_state = 42),
    'Gradient Boosting': GradientBoostingRegressor(random_state = 42)
}

In [20]:
# Training and evaluating models

results = {}
best_score = float('inf')
best_model = None
best_model_name = ''

for name, model in models.items():
    pipeline = Pipeline(steps = [      # create pipeline with preprocessor and model
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Fitting the model

    pipeline.fit(X_train,  y_train)

    # Making predictions

    y_pred = pipeline.predict(X_val)

    # Calculating Mean Absolute Error 

    mae = mean_absolute_error(y_val, y_pred)
    results[name] = mae

    # Cross-validation score

    cv_scores = cross_val_score(pipeline, X_train, y_train,
                               scoring = 'neg_mean_absolute_error', cv = 5)
    cv_mae = -cv_scores.mean()

    print(f'{name}:')
    print(f' Validation MAE: {mae:.4f}')
    print(f' Cross-validation MAE: {cv_mae:.4f}')
    print(f' CV Std: {cv_scores.std():.4f}')

    if mae < best_score:
        best_score = mae
        best_model = pipeline
        best_model_name = name


print(f'\nBest model: {best_model_name} with MAE: {best_score:.4f}') 

Linear Regression:
 Validation MAE: 370.4280
 Cross-validation MAE: 372.4198
 CV Std: 9.4181
Ridge:
 Validation MAE: 370.7253
 Cross-validation MAE: 372.3978
 CV Std: 8.8901
Lasso:
 Validation MAE: 370.6488
 Cross-validation MAE: 372.3321
 CV Std: 8.6408
ElasticNet:
 Validation MAE: 444.6067
 Cross-validation MAE: 443.8287
 CV Std: 9.4806
Random Forest:
 Validation MAE: 359.4741
 Cross-validation MAE: 368.4975
 CV Std: 7.6423
Gradient Boosting:
 Validation MAE: 350.8269
 Cross-validation MAE: 357.4666
 CV Std: 8.6204

Best model: Gradient Boosting with MAE: 350.8269


## Hyperparameter Tuning

In [24]:
# Tuning the best performing model

if best_model_name == 'Random Forest':
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [10, 20, None],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }

elif best_model_name == 'Gradient Boosting':
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.05, 0.1, 0.15],
        'model__max_depth': [3, 4, 5],
        'model__min_samples_split': [2, 5, 10]
    }

else: # For linear mmodels
    param_grid = {
        'model__alpha': [0.1, 1.0, 10.0, 100.0]
    }

# Perform grid search
grid_search = GridSearchCV(
    best_model, param_grid, scoring = 'neg_mean_absolute_error', cv = 5, n_jobs = -1, verbose = 1 
)

print(f'\nTuning {best_model_name}...')
grid_search.fit(X_train, y_train)
print('Best parameters:', grid_search.best_params_)
print('Best cross-validation score:', -grid_search.best_score_) 

# Use the tuned model

best_model = grid_search.best_estimator_


Tuning Gradient Boosting...
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters: {'model__learning_rate': 0.05, 'model__max_depth': 4, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Best cross-validation score: 355.59401331939824


## Final model Training and Prediction

In [35]:
# Train final model on entire training data

final_pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', best_model.named_steps['model'])
])

# Fit on entire training data

final_pipeline.fit(X,y)

# Make predictions on the test data

test_predictions = final_pipeline.predict(test_df)

## Submission File

In [38]:
submission_df = pd.DataFrame({
    'Id' : test_df.index if 'Id' not in test_df.columns else test_df['Id'],'Predicted': 
    test_predictions
})

# Ensure the columns names match the requirements

submission_df.columns = ['Id', target_col] if 'Id' in submission_df.columns else [target_col]

submission_df.to_csv('submission.csv', index = False)
print("\nSubmission file saved as 'submission.csv' ")
print(f'Prediction range: {test_predictions.min():.2f} to {test_predictions.max():.2f}')


Submission file saved as 'submission.csv' 
Prediction range: 2458.79 to 8504.58
