Importing Dependencies

In [66]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

In [67]:
# 1. Read the training data from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='train.csv')

In [68]:
# 2. Remove the row identifier; it is not used as a feature
df = df.drop(columns='ID')

In [69]:
# 3. Split the frame into the target column (y) and everything else (X)
target = 'carbon_footprint'
y = df[target]
X = df.drop(columns=[target])

In [70]:
# 4. Replace each categorical column with integer codes.
#    Values are cast to str before being handed to the encoder.
cat_features = ['heating_type', 'diet_type']
for col in cat_features:
    lbl = LabelEncoder()
    raw_labels = X[col].astype(str)
    X[col] = lbl.fit_transform(raw_labels)

In [71]:
# 5. Force every non-categorical column to a numeric dtype;
#    entries that cannot be parsed become NaN (errors='coerce').
numeric_cols = [name for name in X.columns if name not in cat_features]
for col in numeric_cols:
    X[col] = pd.to_numeric(X[col], errors='coerce')

In [72]:
# 6. Replace every remaining missing value with 0
X = X.fillna(0)

In [73]:
# 7. Hyper-parameters passed to every LightGBM regressor trained below
lgb_params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    learning_rate=0.01,
    num_leaves=31,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
)

In [74]:
# 8. 5-fold cross-validation.
#    Each fold trains a fresh regressor, using the held-out fold both as the
#    early-stopping eval set and as the data the R² score is computed on.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = lgb.LGBMRegressor(n_estimators=10000, **lgb_params)
    stop_early = lgb.early_stopping(100)
    silence_eval_log = lgb.log_evaluation(period=0)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[stop_early, silence_eval_log],
    )

    # Score the held-out fold at the iteration early stopping selected
    y_pred = model.predict(X_val, num_iteration=model.best_iteration_)
    r2 = r2_score(y_val, y_pred)
    r2_scores.append(r2)

    # Challenge score is 100 * R², floored at 0
    challenge_score = max(0, 100 * r2)
    print(f"Fold {fold} R² Score: {r2:.4f} | Challenge Score: {challenge_score:.2f}")

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1839]	valid_0's rmse: 58.0896
Fold 1 R² Score: 0.9082 | Challenge Score: 90.82
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1546]	valid_0's rmse: 59.2548
Fold 2 R² Score: 0.9037 | Challenge Score: 90.37
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1391]	valid_0's rmse: 67.5807
Fold 3 R² Score: 0.8711 | Challenge Score: 87.11
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1294]	valid_0's rmse: 63.3158
Fold 4 R² Score: 0.8879 | Challenge Score: 88.79
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1262]	valid_0's rmse: 60.5513
Fold 5 R² Score: 0.9005 | Challenge Score: 90.05


In [75]:
# Mean R² across the folds, expressed as a challenge score (100 * R², floored at 0)
mean_r2 = np.mean(r2_scores)
average_challenge_score = max(0, 100 * mean_r2)
print(f"\nAverage Challenge Score: {average_challenge_score:.2f}")


Average Challenge Score: 89.43


In [76]:
# Read the test data from disk into a DataFrame
test_df = pd.read_csv(filepath_or_buffer='test.csv')

In [77]:
# Feature matrix for the test set: every column except the row identifier
X_test = test_df.drop(columns=['ID'])

In [78]:
# Encode the test features with the SAME mapping the model was trained on.
# (LabelEncoder is already imported in the first cell.)
#
# LabelEncoder assigns codes from the sorted set of values it is fitted on, so
# fitting a fresh encoder on test.csv yields different codes whenever the test
# categories differ from the training ones. Re-fitting on the untouched training
# column in `df`, cast to str exactly as in step 4, reproduces the training mapping.
for col in cat_features:
    le = LabelEncoder().fit(df[col].astype(str))
    code_of = {label: code for code, label in enumerate(le.classes_)}
    # Read from the raw test_df so re-running this cell gives the same result.
    # Labels never seen in training map to -1 rather than raising in transform().
    X_test[col] = test_df[col].astype(str).map(code_of).fillna(-1).astype(int)

# household_size was a plain numeric feature in training (steps 5-6), not a
# label-encoded one, so apply the same coerce-then-zero-fill treatment here.
X_test['household_size'] = pd.to_numeric(test_df['household_size'], errors='coerce').fillna(0)


In [79]:
# Mirror training steps 5-6 on the test features: coerce every non-categorical
# column to numeric (unparseable entries become NaN), then zero-fill, so that
# house_area_sqft and any other column get the same treatment the training data did.
for col in X_test.columns:
    if col not in cat_features:
        X_test[col] = pd.to_numeric(X_test[col], errors='coerce')
X_test = X_test.fillna(0)


In [55]:
# Pin the dtypes of the four preprocessed test columns
test_dtypes = {
    'house_area_sqft': 'float',
    'household_size': 'float',
    'heating_type': 'int',
    'diet_type': 'int',
}
X_test = X_test.astype(test_dtypes)


In [80]:
# Select the test features in the training column order before predicting, so a
# different column order (or an extra column) in test.csv cannot silently feed
# values to the wrong feature; a missing training column raises KeyError here.
# NOTE: `model` is the regressor left over from the last CV fold only; it was
# trained on that fold's training split, not on the full training set.
predictions = model.predict(X_test[X.columns], num_iteration=model.best_iteration_)


Create a submission.csv file for predicted results on test data

In [81]:
# Two-column submission frame: the test IDs alongside their predicted target
submission = test_df[['ID']].copy()
submission['carbon_footprint'] = predictions

In [82]:
# Write the submission to disk without the DataFrame index column
submission.to_csv(path_or_buf='submission.csv', index=False)
