In [36]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation/FutureWarning messages
# from the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [37]:
# Load Data
# Read both competition splits from the working directory in one pass.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

Data cleaning and feature engineering

In [38]:
# Report the raw (rows, columns) of each split before any cleaning.
for split_label, split_frame in (("Train", train), ("Test", test)):
    print(f"Initial {split_label} Shape: {split_frame.shape}")

Initial Train Shape: (27999, 12)
Initial Test Shape: (7041, 11)


In [39]:
# Backup IDs
# Keep the test identifiers aside; the 'Id' column is dropped further down.
test_ids = test['Id'].copy()

# Extract Date Features
print("\nExtracting date features...")


def _add_date_features(frame):
    """Parse ``frame['date']`` and add calendar/time-of-day columns in place.

    Adds, in this order: hour, day, month, dayofweek, quarter, is_weekend,
    is_night and is_peak_hour.
    """
    frame['date'] = pd.to_datetime(frame['date'], format='mixed', dayfirst=True)
    stamp = frame['date'].dt

    frame['hour'] = stamp.hour
    frame['day'] = stamp.day
    frame['month'] = stamp.month
    frame['dayofweek'] = stamp.dayofweek
    frame['quarter'] = stamp.quarter

    # dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend.
    frame['is_weekend'] = frame['dayofweek'].ge(5).astype(int)

    # Night: 22:00 through 06:59. Peak: 17:00 through 21:59.
    hour_of_day = frame['hour']
    late_evening = hour_of_day >= 22
    early_morning = hour_of_day <= 6
    frame['is_night'] = (late_evening | early_morning).astype(int)
    frame['is_peak_hour'] = ((hour_of_day >= 17) & (hour_of_day <= 21)).astype(int)


for frame in (train, test):
    _add_date_features(frame)

print("Date features extracted.")


Extracting date features...
Date features extracted.


In [40]:
# Drop ID and date columns
# The raw identifier and timestamp are no longer needed once the date
# features exist; both frames are modified in place.
for frame in (train, test):
    frame.drop(columns=['Id', 'date'], inplace=True)

In [41]:
# Handle Missing Values
print("\nHandling Missing Values...")

# Numeric predictors are imputed with the TRAIN median (also applied to test, so
# no test statistics are used). The target column is excluded and never imputed.
num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
if 'Usage_kWh' in num_cols:
    num_cols.remove('Usage_kWh')

for col in num_cols:
    median_val = train[col].median()
    # Assign the filled column back rather than calling
    # `train[col].fillna(..., inplace=True)`: that form is chained assignment on
    # a temporary Series, which pandas deprecates and which no longer updates
    # the parent frame once Copy-on-Write is enabled.
    train[col] = train[col].fillna(median_val)
    if col in test.columns:
        test[col] = test[col].fillna(median_val)

# Categorical (object) predictors are imputed with the most frequent TRAIN value.
# `cat_cols` is reused by the encoding cell further down.
cat_cols = train.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
    mode_val = train[col].mode()[0]
    train[col] = train[col].fillna(mode_val)
    if col in test.columns:
        test[col] = test[col].fillna(mode_val)

print("Missing values handled.")

# Skip Outlier Removal- preserving signal for better R²


Handling Missing Values...
Missing values handled.


In [42]:
# Remove Duplicates
# Count fully identical training rows; only touch the frame when some exist.
duplicates = train.duplicated().sum()
if duplicates == 0:
    print("No duplicate rows found.")
else:
    train.drop_duplicates(inplace=True)
    print(f"Removed {duplicates} duplicate rows.")

No duplicate rows found.


In [43]:
# Feature Engineering
print("\nCreating engineered and domain-driven features...")

# The same derived columns are added, in the same order, to both splits.
for frame in (train, test):
    # Short aliases for the raw inputs every derived column is built from.
    nsm = frame['NSM']
    co2 = frame['CO2(tCO2)']
    lag_q = frame['Lagging_Current_Reactive.Power_kVarh']
    lead_q = frame['Leading_Current_Reactive_Power_kVarh']
    lag_pf = frame['Lagging_Current_Power_Factor']
    lead_pf = frame['Leading_Current_Power_Factor']

    # Efficiency ratios (the +1 keeps the denominator away from zero)
    nsm_plus_one = nsm + 1
    frame['CO2_per_NSM'] = co2 / nsm_plus_one
    frame['Lagging_per_NSM'] = lag_q / nsm_plus_one
    frame['Leading_per_NSM'] = lead_q / nsm_plus_one

    # Power factor combinations
    frame['Total_PF'] = lag_pf + lead_pf
    frame['PF_Ratio'] = lag_pf / (lead_pf + 1)
    frame['Total_Reactive'] = lag_q + lead_q

    # Log and squared transforms
    frame['NSM_Log'] = np.log1p(nsm)
    frame['CO2_Log'] = np.log1p(co2)
    frame['NSM_Squared'] = (nsm / 1000) ** 2
    frame['CO2_Squared'] = co2 ** 2

    # Domain-driven features
    frame['Power_Factor_Efficiency'] = (lag_pf + lead_pf) / 2
    frame['Inductive_Load_Index'] = lag_q - lead_q
    frame['Capacitive_Correction_Potential'] = lead_q / (lag_q + 1)
    frame['PF_Variation'] = (lag_pf - lead_pf).abs()

    # Cyclical encodings for time
    hour_angle = 2 * np.pi * frame['hour'] / 24
    month_angle = 2 * np.pi * frame['month'] / 12
    frame['hour_sin'] = np.sin(hour_angle)
    frame['hour_cos'] = np.cos(hour_angle)
    frame['month_sin'] = np.sin(month_angle)
    frame['month_cos'] = np.cos(month_angle)

    # Power factor-inspired physical features
    frame['Apparent_Power_Est'] = np.sqrt((lag_q + lead_q) ** 2 + co2 ** 2)
    frame['Reactive_to_Apparent'] = (lag_q + lead_q) / (frame['Apparent_Power_Est'] + 1)
    frame['PF_Theoretical'] = 1 / np.sqrt(1 + frame['Reactive_to_Apparent'] ** 2)
    frame['Reactive_Imbalance'] = (lag_q - lead_q).abs()

    # Same formula as Power_Factor_Efficiency above.
    frame['Mean_PF'] = (lag_pf + lead_pf) / 2

print("Engineered features created.")


Creating engineered and domain-driven features...
Engineered features created.


In [44]:
# Encode Categorical Data
print("\nEncoding categorical columns...")

# Pin every categorical column to the category levels seen in TRAIN before
# one-hot encoding. Encoding each split independently with drop_first=True
# drops whichever level happens to sort first in that split, so a level missing
# from test would shift the baseline and mis-encode rows. With shared levels
# both splits drop the same baseline and produce the same dummy columns; test
# values never seen in train become NaN and encode as all zeros.
for col in cat_cols:
    train_levels = sorted(train[col].dropna().unique())
    train[col] = pd.Categorical(train[col], categories=train_levels)
    if col in test.columns:
        test[col] = pd.Categorical(test[col], categories=train_levels)

train = pd.get_dummies(train, columns=cat_cols, drop_first=True)
test = pd.get_dummies(test, columns=cat_cols, drop_first=True)

# Align columns
# Give test exactly the train predictors, in train order: columns absent from
# test are created as 0, columns unknown to train are discarded, and the target
# is left out.
feature_columns = [c for c in train.columns if c != 'Usage_kWh']
test = test.reindex(columns=feature_columns, fill_value=0)



Encoding categorical columns...


In [45]:
# Reattach IDs
# Assign by position (raw values) so the index of `test_ids` is irrelevant.
id_values = test_ids.values
test['Id'] = id_values

print("Encoding and alignment complete.")
train_shape, test_shape = train.shape, test.shape
print(f"Final shapes: Train={train_shape}, Test={test_shape}")

Encoding and alignment complete.
Final shapes: Train=(27999, 47), Test=(7041, 47)


In [46]:
# Drop redundant features
print("\nDropping redundant features...")

# Candidate columns to remove from both splits.
drop_cols = [
    "Power_Factor_Efficiency", "Mean_PF", "Total_PF", "PF_Variation",
    "Apparent_Power_Est", "Reactive_to_Apparent",
    "NSM", "CO2(tCO2)",
]

# Only drop candidates that are actually present in the training frame.
train_columns = set(train.columns)
dropped_cols = list(filter(train_columns.__contains__, drop_cols))

for frame in (train, test):
    frame.drop(columns=dropped_cols, inplace=True)

n_dropped = len(dropped_cols)
print(f"Dropped {n_dropped} redundant columns: {dropped_cols}")
print(f"Final optimized shapes: Train={train.shape}, Test={test.shape}")



Dropping redundant features...
Dropped 8 redundant columns: ['Power_Factor_Efficiency', 'Mean_PF', 'Total_PF', 'PF_Variation', 'Apparent_Power_Est', 'Reactive_to_Apparent', 'NSM', 'CO2(tCO2)']
Final optimized shapes: Train=(27999, 39), Test=(7041, 39)


In [47]:
# Save cleaned and engineered data
# Both frames are written without the index; the modelling cell reads them back.
train_out = "train_cleaned_engineered_v6.csv"
test_out = "test_cleaned_engineered_v6.csv"

for frame, out_path in ((train, train_out), (test, test_out)):
    frame.to_csv(out_path, index=False)

print("\nData cleaning and feature engineering completed successfully.")
print("Saved files:")
print(f"  - {train_out}")
print(f"  - {test_out} (includes Id column)")
print("=" * 70)


Data cleaning and feature engineering completed successfully.
Saved files:
  - train_cleaned_engineered_v6.csv
  - test_cleaned_engineered_v6.csv (includes Id column)


Model training and prediction

In [48]:
# Model Training and Prediction
print("\nStarting model training...")

# Load cleaned data
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_cleaned_engineered_v6.csv", "test_cleaned_engineered_v6.csv")
)

# Select features
# Hand-picked subset of predictors fed to the model.
winning_features = [
    'CO2_Log', 'Lagging_Current_Reactive.Power_kVarh', 'CO2_Squared',
    'Total_Reactive', 'Inductive_Load_Index', 'Reactive_Imbalance',
    'PF_Theoretical', 'is_night', 'Lagging_Current_Power_Factor',
    'hour_cos', 'Leading_Current_Power_Factor', 'hour_sin',
]

X, X_test = train[winning_features], test[winning_features]
y = train["Usage_kWh"]
test_ids = test["Id"]

# Box-Cox transformation
# Box-Cox needs strictly positive input, so the target is shifted upwards only
# when it contains a value <= 0; the shift is undone after prediction.
needs_shift = (y <= 0).any()
y_shift = abs(y.min()) + 1 if needs_shift else 0
y_temp = y + y_shift if needs_shift else y

pt = PowerTransformer(method='box-cox', standardize=False)
target_column = y_temp.values.reshape(-1, 1)  # scikit-learn expects a 2-D array
y_transformed = pt.fit_transform(target_column).ravel()



Starting model training...


In [49]:
# Pipeline with Ridge regression
# Steps run in order: standardise the inputs, expand them into polynomial
# terms up to degree 5 (no bias column), then fit a ridge regression.
ridge_steps = [
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=5, include_bias=False)),
    ("ridge", Ridge(alpha=0.006, max_iter=10000)),
]
pipeline = Pipeline(steps=ridge_steps)


In [50]:
print("Training Ridge model with alpha=0.006...")
pipeline.fit(X, y_transformed)

# Predict
# Predictions come out on the Box-Cox scale; map them back to the target scale.
test_pred_transformed = pipeline.predict(X_test)
pred_column = test_pred_transformed.reshape(-1, 1)
test_pred = pt.inverse_transform(pred_column).ravel()

# Undo the positivity shift applied to the target before Box-Cox, if any.
if y_shift > 0:
    test_pred = test_pred - y_shift

# Handle NaN values
# Any NaN left after the inverse transform is replaced by the median of the
# remaining predictions.
nan_mask = np.isnan(test_pred)
if nan_mask.any():
    median_pred = np.nanmedian(test_pred)
    test_pred = np.where(nan_mask, median_pred, test_pred)

# Keep predictions within [0, 1.4 x the largest training target].
upper_cap = y.max() * 1.4
test_pred = np.clip(test_pred, 0, upper_cap)

Training Ridge model with alpha=0.006...


In [51]:
# Create submission
# NOTE(review): the file name says "alpha_004" while the Ridge model in this
# notebook is configured with alpha=0.006 — confirm which one is intended.
submission_path = "submission_alpha_004.csv"

submission = pd.DataFrame({"Id": test_ids, "Usage_kWh": test_pred})
submission.to_csv(submission_path, index=False)

print(f"\nSubmission saved: {submission_path}")

# Quick sanity report: total missing cells and the spread of the predictions.
null_total = submission.isnull().sum().sum()
print(f"Null values: {null_total}")

usage = submission['Usage_kWh']
print(f"Prediction range: [{usage.min():.2f}, {usage.max():.2f}]")


Submission saved: submission_alpha_004.csv
Null values: 0
Prediction range: [2.59, 155.71]
