In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import os
import warnings
warnings.filterwarnings('ignore')


In [3]:
# Function to safely read CSV files with error handling
def safe_read_csv(file_path):
    """Read a CSV file, trying a short list of fallback parser settings.

    The attempts are made in this order: pandas defaults, an explicit
    ``encoding='utf-8'``, then a tab separator. The first attempt that
    succeeds wins.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file, passed straight to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(DataFrame, None)`` on success, or ``(None, message)`` when every
        attempt failed. ``message`` starts with ``"Failed to read <path>: "``
        and lists each distinct error once, so the cause of the first failure
        is not hidden behind the error of the last fallback.
    """
    # NOTE(review): the explicit utf-8 attempt presumably repeats pandas'
    # default encoding and may never succeed where the first one failed — confirm.
    attempts = (
        {},                     # default settings
        {'encoding': 'utf-8'},  # explicit encoding
        {'sep': '\t'},          # different separator
    )

    errors = []
    for options in attempts:
        try:
            return pd.read_csv(file_path, **options), None
        except Exception as exc:
            # Keep every distinct failure reason, in the order it occurred.
            message = str(exc)
            if message not in errors:
                errors.append(message)

    details = "; ".join(errors)
    return None, f"Failed to read {file_path}: {details}"

# Display progress information
print("Starting weather forecasting model...")

# Load both datasets. A read failure is raised as an exception so the cell
# (and any "Run All") stops with the reason in the traceback; exit() is the
# interactive-shell helper and would act on the interpreter/kernel instead.
print("Loading training data...")
train, train_error = safe_read_csv('train.csv')
if train_error:
    raise RuntimeError(train_error)

print("Loading test data...")
test, test_error = safe_read_csv('test.csv')
if test_error:
    raise RuntimeError(test_error)

# Quick sanity check of what was loaded
print(f"Train data shape: {train.shape}")
print(f"Test data shape: {test.shape}")


Starting weather forecasting model...
Loading training data...
Loading test data...
Train data shape: (84960, 10)
Test data shape: (4530, 5)


In [4]:
# Validate the required columns exist
required_train_cols = ['ID', 'Year', 'Month', 'Day', 'kingdom', 'Avg_Temperature',
                       'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction']
required_test_cols = ['ID', 'Year', 'Month', 'Day', 'kingdom']

# Check train columns. A schema problem is raised (not exit()-ed) so the
# notebook stops at this cell with the missing and available columns shown.
missing_train_cols = [col for col in required_train_cols if col not in train.columns]
if missing_train_cols:
    raise ValueError(
        f"Training data missing required columns: {missing_train_cols}. "
        f"Available columns: {train.columns.tolist()}"
    )

# Check test columns
missing_test_cols = [col for col in required_test_cols if col not in test.columns]
if missing_test_cols:
    raise ValueError(
        f"Test data missing required columns: {missing_test_cols}. "
        f"Available columns: {test.columns.tolist()}"
    )


In [5]:
# Clean and prepare data
print("Preparing data...")

# Drop exact duplicate rows, then rows in which every field is missing
train = train.drop_duplicates().dropna(how='all')
test = test.drop_duplicates().dropna(how='all')

# Coerce the date components to numbers; unparseable entries become NaN
for frame in (train, test):
    for part in ('Year', 'Month', 'Day'):
        frame[part] = pd.to_numeric(frame[part], errors='coerce')

# Discard rows whose date could not be parsed, then store the parts as integers
train = train.dropna(subset=['Year', 'Month', 'Day']).astype({'Year': int, 'Month': int, 'Day': int})
test = test.dropna(subset=['Year', 'Month', 'Day']).astype({'Year': int, 'Month': int, 'Day': int})

# Handle the temperature unit issue: readings above 100 are treated as Kelvin
# and shifted to Celsius (train only; test is not required to carry this column)
print("Converting temperature units...")
temp_mask = train['Avg_Temperature'] > 100
train.loc[temp_mask, 'Avg_Temperature'] -= 273.15

# Integer YYYYMMDD-style key built arithmetically (no datetime conversion)
for frame in (train, test):
    frame['DateKey'] = frame['Year'] * 10000 + frame['Month'] * 100 + frame['Day']

# Generate temporal features
print("Creating features...")
for frame in (train, test):
    month, day = frame['Month'], frame['Day']
    # Approximate day of year, treating every month as 30 days long
    frame['DayOfYear'] = (month - 1) * 30 + day
    # Plain copy of the day-of-month
    frame['MonthDay'] = day
    # Season index: 1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov
    frame['Season'] = (month % 12 + 3) // 3
    # Month and day packed into one number (MMDD) for seasonal patterns
    frame['MonthDay_Combined'] = month * 100 + day

# Encode categorical variables
try:
    le = LabelEncoder()
    # Force string type first so mixed-type labels encode consistently
    for frame in (train, test):
        frame['kingdom_str'] = frame['kingdom'].astype(str)
    train['kingdom_encoded'] = le.fit_transform(train['kingdom_str'])
    test['kingdom_encoded'] = le.transform(test['kingdom_str'])
except Exception as exc:
    print(f"Error encoding kingdom: {exc}")
    # Fallback: number the kingdoms in order of first appearance in train
    kingdom_mapping = {name: code for code, name in enumerate(train['kingdom'].unique())}
    train['kingdom_encoded'] = train['kingdom'].map(kingdom_mapping)
    # Kingdoms absent from the mapping are encoded as -1
    test['kingdom_encoded'] = test['kingdom'].map(kingdom_mapping).fillna(-1).astype(int)

# Target columns
target_cols = ['Avg_Temperature', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction']

# Create a submission DataFrame
model_predictions = pd.DataFrame({'ID': test['ID']})

# Per-kingdom mean / median / std of every target
print("Calculating kingdom statistics...")
kingdom_data = train.groupby('kingdom').agg(
    {name: ['mean', 'median', 'std'] for name in target_cols}
)

# Flatten the (target, statistic) column pairs into '<target>_<statistic>' names
kingdom_data.columns = [f"{name}_{stat}".strip() for name, stat in kingdom_data.columns]
kingdom_data = kingdom_data.reset_index()

# Per-(Month, Day) mean of every target (seasonal patterns); the value
# columns keep the plain target names
monthday_data = (
    train.groupby(['Month', 'Day'])
    .agg(dict.fromkeys(target_cols, 'mean'))
    .reset_index()
)

# Merge kingdom statistics to test data
# NOTE(review): only `test` is merged here; `train` never receives the
# statistic columns, so a later step that requires a feature to exist in both
# frames will not see them - confirm this is intended.
print("Merging features...")
test = test.merge(kingdom_data, on='kingdom', how='left')

# Merge month-day statistics to test data
# NOTE(review): monthday_data's value columns carry the plain target names,
# and the only target-derived columns on `test` at this point appear to be the
# '<target>_<stat>' ones from the kingdom merge. With no overlapping names the
# suffixes presumably never apply, so the month-day means would be stored in
# `test` under the plain target names - confirm.
test = test.merge(
    monthday_data,
    on=['Month', 'Day'],
    how='left',
    suffixes=('', '_monthday_mean'),
)

# Fill any missing values from the merges
for name in test.columns:
    column = test[name]
    if not column.isna().any():
        continue
    if name.endswith(('_mean', '_median')):
        # Statistic columns: use the average of that statistic over test
        test[name] = column.fillna(column.mean())
    elif name in target_cols:
        # Columns named after a target: use the global training mean
        test[name] = column.fillna(train[name].mean())

# Define features based on what's available
print("Defining feature sets...")
base_features = ['Year', 'Month', 'Day', 'kingdom_encoded', 'DayOfYear', 'Season']


Preparing data...
Converting temperature units...
Creating features...
Calculating kingdom statistics...
Merging features...
Defining feature sets...


In [6]:
print("Preprocessing data...")
# Maps each target name to its {'X_train', 'y_train', 'X_test'} inputs
preprocessed_data = {}

for target in target_cols:
    print(f"Processing {target}...")

    # Candidate features: the shared base set plus this target's statistic columns
    candidates = base_features + [f"{target}_{stat}" for stat in ('mean', 'median', 'std')]
    candidates.append(f"{target}_monthday_mean")

    # A feature is usable only when both frames provide it
    features = [f for f in candidates if f in train.columns and f in test.columns]

    # Report features that exist in just one frame instead of dropping them silently
    one_sided = [f for f in candidates if (f in train.columns) != (f in test.columns)]
    if one_sided:
        print(f"  Warning: ignoring features not present in both train and test: {one_sided}")

    # Count missing targets before filling, so the warning reflects the raw data
    missing_count = int(train[target].isna().sum())
    if missing_count:
        print(f"  Warning: Found {missing_count} NaN values in {target}. Filling with mean.")

    X_train = train[features].fillna(0)
    y_train = train[target].fillna(train[target].mean())
    X_test = test[features].fillna(0)

    preprocessed_data[target] = {'X_train': X_train, 'y_train': y_train, 'X_test': X_test}

print("Preprocessing complete.")

Preprocessing data...
Processing Avg_Temperature...
Processing Radiation...
Processing Rain_Amount...
Processing Wind_Speed...
Processing Wind_Direction...
Preprocessing complete.


In [9]:
# Function to train and predict with any model
def train_and_predict(model):
    """Fit ``model`` once per target and collect its test-set predictions.

    Reads the notebook-level ``preprocessed_data`` (per-target ``X_train``,
    ``y_train``, ``X_test``) and ``test`` (for the ``ID`` column and the
    ``<target>_mean`` fallback columns). The same ``model`` object is refitted
    for every target.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    pandas.DataFrame
        An ``ID`` column plus one prediction column per target. Radiation,
        Rain_Amount and Wind_Speed are clipped at zero and Wind_Direction is
        reduced modulo 360.
    """
    model_name = model.__class__.__name__
    print(f"Training {model_name} model and making predictions...")

    predictions_df = pd.DataFrame({'ID': test['ID']})
    for target, data in preprocessed_data.items():
        print(f"  Training {model_name} for {target}...")

        try:
            model.fit(data['X_train'], data['y_train'])
            predictions = model.predict(data['X_test'])
        except Exception as exc:
            # Best effort: fall back to the '<target>_mean' column stored on test
            print(f"  Error training {model_name} for {target}: {exc}")
            predictions = test[f"{target}_mean"].values

        predictions_df[target] = predictions

    print("Post-processing predictions...")
    # These quantities are floored at zero
    non_negative = ['Radiation', 'Rain_Amount', 'Wind_Speed']
    predictions_df[non_negative] = predictions_df[non_negative].clip(lower=0)

    # Wrap the direction into [0, 360)
    predictions_df['Wind_Direction'] = predictions_df['Wind_Direction'] % 360

    return predictions_df


In [10]:
# RandomForestRegressor: depth-limited forest, refitted for each target
# inside train_and_predict
random_forest_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=12,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)
random_forest_predictions = train_and_predict(random_forest_model)

# Write the predictions out as the first submission file
print("Saving submission file...")
random_forest_predictions.to_csv(path_or_buf='submission 1.csv', index=False)
print("Submission file created successfully!")

Training RandomForestRegressor model and making predictions...
  Training RandomForestRegressor for Avg_Temperature...
  Training RandomForestRegressor for Radiation...
  Training RandomForestRegressor for Rain_Amount...
  Training RandomForestRegressor for Wind_Speed...
  Training RandomForestRegressor for Wind_Direction...
Post-processing predictions...
Saving submission file...
Submission file created successfully!


In [12]:
# LightGBM regressor, refitted for each target inside train_and_predict
lgbm_model = lgb.LGBMRegressor(
    # boosting schedule
    n_estimators=200,
    learning_rate=0.05,
    # tree shape
    num_leaves=31,
    min_child_samples=20,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # reproducibility and parallelism
    random_state=42,
    n_jobs=-1,
)
lgbm_predictions = train_and_predict(lgbm_model)

# Write the predictions out as the second submission file
print("Saving submission file...")
lgbm_predictions.to_csv(path_or_buf='submission 2.csv', index=False)
print("Submission file created successfully!")

Training LGBMRegressor model and making predictions...
  Training LGBMRegressor for Avg_Temperature...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 344
[LightGBM] [Info] Number of data points in the train set: 84960, number of used features: 6
[LightGBM] [Info] Start training from score 26.340751
  Training LGBMRegressor for Radiation...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002880 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 344
[LightGBM] [Info] Number of data points in the train set: 84960, number of used features: 6
[LightGBM] [Info] Start training from score 20.338598
  Training LGBMRegressor for Rain_Amount...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the ove

In [16]:
# XGBoost regressor with a squared-error objective, refitted for each target
# inside train_and_predict
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    # boosting schedule
    n_estimators=200,
    learning_rate=0.05,
    # tree shape
    max_depth=8,
    min_child_weight=3,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # reproducibility and parallelism
    random_state=42,
    n_jobs=-1,
)
xgb_predictions = train_and_predict(xgb_model)

# Write the predictions out as the third submission file
print("Saving submission file...")
xgb_predictions.to_csv(path_or_buf='submission 3.csv', index=False)
print("Submission file created successfully!")

Training XGBRegressor model and making predictions...
  Training XGBRegressor for Avg_Temperature...
  Training XGBRegressor for Radiation...
  Training XGBRegressor for Rain_Amount...
  Training XGBRegressor for Wind_Speed...
  Training XGBRegressor for Wind_Direction...
Post-processing predictions...
Saving submission file...
Submission file created successfully!


In [18]:
# GradientBoostingRegressor, refitted for each target inside train_and_predict
gradient_boost_model = GradientBoostingRegressor(
    # boosting schedule
    n_estimators=200,
    learning_rate=0.05,
    # tree shape
    max_depth=5,
    min_samples_leaf=20,
    # row subsampling and reproducibility
    subsample=0.8,
    random_state=42,
)
gradient_boost_predictions = train_and_predict(gradient_boost_model)

# Write the predictions out (this run is saved as submission 5)
print("Saving submission file...")
gradient_boost_predictions.to_csv(path_or_buf='submission 5.csv', index=False)
print("Submission file created successfully!")

Training GradientBoostingRegressor model and making predictions...
  Training GradientBoostingRegressor for Avg_Temperature...
  Training GradientBoostingRegressor for Radiation...
  Training GradientBoostingRegressor for Rain_Amount...
  Training GradientBoostingRegressor for Wind_Speed...
  Training GradientBoostingRegressor for Wind_Direction...
Post-processing predictions...
Saving submission file...
Submission file created successfully!


In [25]:
# BaggingRegressor over decision-tree regressors, refitted for each target
# inside train_and_predict
bagging_model = BaggingRegressor(
    estimator=DecisionTreeRegressor(),  # base learner
    n_estimators=200,
    # fractions passed for the per-estimator sample and feature draws
    max_samples=0.8,
    max_features=0.8,
    # reproducibility and parallelism
    random_state=42,
    n_jobs=-1,
)
bagging_predictions = train_and_predict(bagging_model)

# Write the predictions out (this run is saved as submission 6)
print("Saving submission file...")
bagging_predictions.to_csv(path_or_buf='submission 6.csv', index=False)
print("Submission file created successfully!")

Training BaggingRegressor model and making predictions...
  Training BaggingRegressor for Avg_Temperature...
  Training BaggingRegressor for Radiation...
  Training BaggingRegressor for Rain_Amount...
  Training BaggingRegressor for Wind_Speed...
  Training BaggingRegressor for Wind_Direction...
Post-processing predictions...
Saving submission file...
Submission file created successfully!


In [None]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Each element contributes ``|y_true - y_pred| / (|y_true| + |y_pred|)``;
    elements where both values are zero contribute 0. The result is 200 times
    the mean of those contributions.

    Parameters
    ----------
    y_true, y_pred : array-like or scalar
        Reference and predicted values (lists, NumPy arrays, pandas Series or
        plain numbers). They are compared by position, so pandas objects are
        not aligned on their index.

    Returns
    -------
    numpy.floating
        The sMAPE score. NaN inputs propagate to the result, and empty input
        yields NaN.
    """
    true_arr = np.atleast_1d(np.asarray(y_true, dtype=float))
    pred_arr = np.atleast_1d(np.asarray(y_pred, dtype=float))

    denominator = np.abs(true_arr) + np.abs(pred_arr)
    abs_error = np.abs(true_arr - pred_arr)

    # Divide only where the denominator is non-zero; the pre-zeroed output
    # supplies the 0 contribution for the 0/0 case without a runtime warning.
    ratio = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return 200 * np.mean(ratio)

# Score the random forest predictions with sMAPE, one value per target.
# NOTE(review): `test[target]` is used as the reference series, but the test
# file is only validated for ID/Year/Month/Day/kingdom; these columns
# presumably come from the month-day mean merge rather than observed values -
# confirm before relying on the scores.
rf_smapes = [smape(test[name], random_forest_predictions[name]) for name in target_cols]

for name, score in zip(target_cols, rf_smapes):
    print(f"sMAPE for {name}: {score}")

# Average of the per-target scores
mean_rf_smape = np.mean(rf_smapes)
print(f"Mean sMAPE across all targets: {mean_rf_smape}")

sMAPE for Avg_Temperature: 3.6305800561746935
sMAPE for Radiation: 9.536708837289076
sMAPE for Rain_Amount: 70.64783777298747
sMAPE for Wind_Speed: 18.365474823623227
sMAPE for Wind_Direction: 19.736307893407645
Mean sMAPE across all targets: 24.383381876696426


In [13]:
# Score the LightGBM predictions with sMAPE, one value per target.
# NOTE(review): `test[target]` is used as the reference series, but the test
# file is only validated for ID/Year/Month/Day/kingdom; these columns
# presumably come from the month-day mean merge rather than observed values -
# confirm before relying on the scores.
lgbm_scores_by_target = {
    name: smape(test[name], lgbm_predictions[name]) for name in target_cols
}
lgbm_smapes = list(lgbm_scores_by_target.values())

for name, score in lgbm_scores_by_target.items():
    print(f"sMAPE for {name}: {score}")

# Average of the per-target scores
mean_lgbm_smape = np.mean(lgbm_smapes)
print(f"Mean sMAPE across all targets: {mean_lgbm_smape}")

sMAPE for Avg_Temperature: 3.8036742338691982
sMAPE for Radiation: 5.376964820849653
sMAPE for Rain_Amount: 42.1519458841763
sMAPE for Wind_Speed: 16.280610662168634
sMAPE for Wind_Direction: 14.090709629811476
Mean sMAPE across all targets: 16.34078104617505


In [17]:
# For XGBoost model: sMAPE per target, then the average.
# NOTE(review): `test[target]` is used as the reference series, but the test
# file is only validated for ID/Year/Month/Day/kingdom; these columns
# presumably come from the month-day mean merge rather than observed values -
# confirm before relying on the scores.
xgb_smapes = []

for name in target_cols:
    score = smape(test[name], xgb_predictions[name])
    xgb_smapes.append(score)
    print(f"sMAPE for XGBoost {name}: {score}")

# Average of the per-target scores
mean_xgb_smape = np.mean(xgb_smapes)
print(f"Mean sMAPE across all targets for XGBoost: {mean_xgb_smape}")

sMAPE for XGBoost Avg_Temperature: 4.1144389268921815
sMAPE for XGBoost Radiation: 8.598202602659095
sMAPE for XGBoost Rain_Amount: 67.18132755632526
sMAPE for XGBoost Wind_Speed: 19.218896985454315
sMAPE for XGBoost Wind_Direction: 18.660414124513764
Mean sMAPE across all targets for XGBoost: 23.554656039168922


In [19]:
# For Gradient Boosting model: sMAPE per target, then the average.
# NOTE(review): `test[target]` is used as the reference series, but the test
# file is only validated for ID/Year/Month/Day/kingdom; these columns
# presumably come from the month-day mean merge rather than observed values -
# confirm before relying on the scores.
gb_smapes = [
    smape(test[name], gradient_boost_predictions[name])
    for name in target_cols
]

for position, name in enumerate(target_cols):
    print(f"sMAPE for Gradient Boosting {name}: {gb_smapes[position]}")

# Average of the per-target scores
mean_gb_smape = np.mean(gb_smapes)
print(f"Mean sMAPE across all targets for Gradient Boosting: {mean_gb_smape}")


sMAPE for Gradient Boosting Avg_Temperature: 3.680698875864695
sMAPE for Gradient Boosting Radiation: 5.93320036555565
sMAPE for Gradient Boosting Rain_Amount: 46.595133569817676
sMAPE for Gradient Boosting Wind_Speed: 17.14750097301889
sMAPE for Gradient Boosting Wind_Direction: 13.97850797705458
Mean sMAPE across all targets for Gradient Boosting: 17.467008352262297


In [26]:
# For Bagging model: sMAPE per target, then the average.
# NOTE(review): `test[target]` is used as the reference series, but the test
# file is only validated for ID/Year/Month/Day/kingdom; these columns
# presumably come from the month-day mean merge rather than observed values -
# confirm before relying on the scores.
bagging_smapes = []

for name in target_cols:
    reference = test[name]
    predicted = bagging_predictions[name]
    bagging_smapes.append(smape(reference, predicted))
    print(f"sMAPE for Bagging {name}: {bagging_smapes[-1]}")

# Average of the per-target scores
mean_bagging_smape = np.mean(bagging_smapes)
print(f"Mean sMAPE across all targets for Bagging: {mean_bagging_smape}")

sMAPE for Bagging Avg_Temperature: 2.8276818848162777
sMAPE for Bagging Radiation: 5.979694847130214
sMAPE for Bagging Rain_Amount: 43.18520899702037
sMAPE for Bagging Wind_Speed: 14.296864634498768
sMAPE for Bagging Wind_Direction: 12.6984132895259
Mean sMAPE across all targets for Bagging: 15.797572730598304
