# In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Read the three input tables from the current working directory.
print("Loading datasets...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
interactions = pd.read_csv('interactions.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Interactions shape: {interactions.shape}")

# Dates are stored as DD-MM-YYYY strings; parse every date column with that
# explicit format so day and month are never swapped.
DATE_FORMAT = '%d-%m-%Y'
for frame, date_columns in (
    (train, ['service_date']),
    (test, ['service_date']),
    (interactions, ['service_date', 'interaction_date']),
):
    for date_column in date_columns:
        frame[date_column] = pd.to_datetime(frame[date_column], format=DATE_FORMAT)

def _index_interactions_by_service(interactions_df):
    """Group interaction rows by service so each lookup is a dict access.

    The frame is ordered by ``interaction_date`` with a stable sort before
    grouping, so every group is chronological and rows sharing a date keep
    their original relative order.

    Returns:
        dict mapping ``(service_date, origin_hub_id, destination_hub_id)`` to
        the DataFrame holding that service's interaction rows.
    """
    chronological = interactions_df.sort_values('interaction_date', kind='mergesort')
    by_service = chronological.groupby(
        ['service_date', 'origin_hub_id', 'destination_hub_id'], sort=False
    )
    return {service: history for service, history in by_service}


def _no_history_features():
    """Return the default feature values for a service with no usable history."""
    return {
        'latest_commitments': 0,
        'latest_interest': 0,
        'days_before_at_cutoff': 15,
        'max_commitments': 0,
        'max_interest': 0,
        'commitment_velocity_7d': 0,
        'interest_velocity_7d': 0,
        'commitment_to_interest_ratio': 0,
        'num_interactions': 0,
        'days_with_activity': 0,
        'origin_region': 'unknown',
        'destination_region': 'unknown',
        'origin_hub_tier': 'unknown',
        'destination_hub_tier': 'unknown',
    }


def _history_features(history, cutoff_date):
    """Summarise one service's interaction history as of the cutoff date.

    Args:
        history: non-empty, chronologically ordered interaction rows whose
            ``interaction_date`` is on or before ``cutoff_date``.
        cutoff_date: the last date whose data may be used for this service.

    Returns:
        dict of interaction-derived features, keyed by feature name.
    """
    latest = history.iloc[-1]
    latest_commitments = latest['cumulative_commitments']
    latest_interest = latest['cumulative_interest_signals']

    # Velocity: change in the cumulative counters across the rows dated within
    # the 7 days up to the cutoff. At least two rows are needed for a change.
    recent = history[history['interaction_date'] >= cutoff_date - timedelta(days=7)]
    if len(recent) > 1:
        first, last = recent.iloc[0], recent.iloc[-1]
        commitment_velocity = (
            last['cumulative_commitments'] - first['cumulative_commitments']
        ) / 7
        interest_velocity = (
            last['cumulative_interest_signals'] - first['cumulative_interest_signals']
        ) / 7
    else:
        commitment_velocity = 0
        interest_velocity = 0

    return {
        # Latest cumulative values at cutoff
        'latest_commitments': latest_commitments,
        'latest_interest': latest_interest,
        'days_before_at_cutoff': latest['days_before_service'],
        # Maximum values seen
        'max_commitments': history['cumulative_commitments'].max(),
        'max_interest': history['cumulative_interest_signals'].max(),
        'commitment_velocity_7d': commitment_velocity,
        'interest_velocity_7d': interest_velocity,
        # +1 in the denominator avoids dividing by zero when interest is 0
        'commitment_to_interest_ratio': latest_commitments / (latest_interest + 1),
        # Activity features
        'num_interactions': len(history),
        'days_with_activity': history['interaction_date'].nunique(),
        # Categorical attributes are taken from the most recent row
        'origin_region': latest['origin_region'],
        'destination_region': latest['destination_region'],
        'origin_hub_tier': latest['origin_hub_tier'],
        'destination_hub_tier': latest['destination_hub_tier'],
    }


def create_features(df, interactions_df, is_train=True):
    """
    Create features for prediction 15 days before service date.

    For every row of ``df`` (one service), only interactions dated on or before
    ``service_date - 15 days`` are used, so no later information leaks into the
    features.

    Args:
        df: services to featurise. Reads ``service_date``, ``origin_hub_id``
            and ``destination_hub_id``; additionally ``final_service_units``
            when ``is_train`` is true, otherwise ``service_key``.
        interactions_df: interaction snapshots. Reads ``service_date``,
            ``origin_hub_id``, ``destination_hub_id``, ``interaction_date``,
            ``days_before_service``, ``cumulative_commitments``,
            ``cumulative_interest_signals``, ``origin_region``,
            ``destination_region``, ``origin_hub_tier`` and
            ``destination_hub_tier``.
        is_train: when true a ``target`` column is emitted; when false a
            ``service_key`` column is emitted instead.

    Returns:
        DataFrame with one feature row per row of ``df``, in the same order,
        indexed 0..n-1.
    """
    print("\nCreating features...")

    # Group once up front instead of masking the whole interactions table for
    # every service row.
    history_by_service = _index_interactions_by_service(interactions_df)
    total_rows = len(df)
    features_list = []

    # Progress is reported by position, so it stays meaningful when df does
    # not carry a default 0..n-1 index.
    for position, (_, row) in enumerate(df.iterrows()):
        if position % 1000 == 0:
            print(f"Processing row {position}/{total_rows}")

        service_date = row['service_date']
        origin_hub = row['origin_hub_id']
        dest_hub = row['destination_hub_id']

        # Calculate cutoff date (15 days before service)
        cutoff_date = service_date - timedelta(days=15)

        feat = {
            'service_date': service_date,
            'origin_hub_id': origin_hub,
            'destination_hub_id': dest_hub
        }

        if not is_train:
            feat['service_key'] = row['service_key']

        # Interactions for this specific service, restricted to the cutoff
        history = history_by_service.get((service_date, origin_hub, dest_hub))
        if history is not None:
            history = history[history['interaction_date'] <= cutoff_date]

        if history is not None and len(history) > 0:
            feat.update(_history_features(history, cutoff_date))
        else:
            # No interaction data available
            feat.update(_no_history_features())

        # Temporal features
        feat['day_of_week'] = service_date.dayofweek
        feat['month'] = service_date.month
        feat['day_of_month'] = service_date.day
        feat['is_weekend'] = 1 if service_date.dayofweek >= 5 else 0
        feat['quarter'] = service_date.quarter

        # Hub combination features
        feat['route'] = f"{origin_hub}_{dest_hub}"

        if is_train:
            feat['target'] = row['final_service_units']

        features_list.append(feat)

    return pd.DataFrame(features_list)

# Build one feature row per service for the labelled and unlabelled sets.
section_rule = "=" * 50
print("\n" + section_rule)
print("FEATURE ENGINEERING")
print(section_rule)

train_features = create_features(train, interactions, is_train=True)
test_features = create_features(test, interactions, is_train=False)

# Report both shapes only after both frames have been built.
for caption, frame_shape in (
    ("\nTrain features shape:", train_features.shape),
    ("Test features shape:", test_features.shape),
):
    print(caption, frame_shape)

# Turn every categorical column into an integer-coded "<name>_encoded" column.
section_rule = "=" * 50
print("\n" + section_rule)
print("ENCODING CATEGORICAL FEATURES")
print(section_rule)

categorical_cols = [
    'origin_region',
    'destination_region',
    'origin_hub_tier',
    'destination_hub_tier',
    'route',
    'origin_hub_id',
    'destination_hub_id',
]

label_encoders = {}
for col in categorical_cols:
    train_as_text = train_features[col].astype(str)
    test_as_text = test_features[col].astype(str)

    # Fit on train and test together so both frames share one label mapping
    # and no test-only category is unseen at transform time.
    combined = pd.concat([train_as_text, test_as_text])
    le = LabelEncoder()
    le.fit(combined)

    encoded_name = col + '_encoded'
    train_features[encoded_name] = le.transform(train_as_text)
    test_features[encoded_name] = le.transform(test_as_text)

    # Keep the fitted encoder so the mapping can be inspected or reused.
    label_encoders[col] = le
    print(f"Encoded {col}: {len(le.classes_)} unique values")

# Model inputs, listed by family; the concatenation order is the column order
# of the feature matrix.
interaction_feature_cols = [
    'latest_commitments', 'latest_interest', 'max_commitments', 'max_interest',
    'commitment_velocity_7d', 'interest_velocity_7d', 'commitment_to_interest_ratio',
    'num_interactions', 'days_with_activity', 'days_before_at_cutoff',
]
calendar_feature_cols = [
    'day_of_week', 'month', 'day_of_month', 'is_weekend', 'quarter',
]
encoded_feature_cols = [
    'origin_region_encoded', 'destination_region_encoded',
    'origin_hub_tier_encoded', 'destination_hub_tier_encoded',
    'route_encoded', 'origin_hub_id_encoded', 'destination_hub_id_encoded',
]
feature_cols = interaction_feature_cols + calendar_feature_cols + encoded_feature_cols

X_train_full = train_features[feature_cols]
y_train_full = train_features['target']
X_test = test_features[feature_cols]

print(f"\nFeature matrix shape: {X_train_full.shape}")
print(f"Target shape: {y_train_full.shape}")

# Hold out 20% of the labelled rows for validation; the seed is fixed so the
# split is reproducible.
holdout_split = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = holdout_split

print(f"Train set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")

# Fit the three base regressors on the training split.
section_rule = "=" * 50
print("\n" + section_rule)
print("MODEL TRAINING")
print(section_rule)

# Settings that both tree ensembles have in common.
shared_tree_params = dict(
    n_estimators=200,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    verbose=0,
)

print("\n1. Training Gradient Boosting Regressor...")
gb_model = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=5,
    **shared_tree_params,
)
gb_model.fit(X_train, y_train)

print("2. Training Random Forest Regressor...")
rf_model = RandomForestRegressor(
    max_depth=15,
    n_jobs=-1,  # use every available core
    **shared_tree_params,
)
rf_model.fit(X_train, y_train)

print("3. Training Ridge Regression...")
ridge_model = Ridge(alpha=1.0, random_state=42)
ridge_model.fit(X_train, y_train)



# Score each base model and the fixed-weight blend on the held-out split.
section_rule = "=" * 50
print("\n" + section_rule)
print("VALIDATION RESULTS")
print(section_rule)

gb_val_pred = gb_model.predict(X_val)
rf_val_pred = rf_model.predict(X_val)
ridge_val_pred = ridge_model.predict(X_val)

# Weighted blend: 50% gradient boosting, 30% random forest, 20% ridge.
ensemble_val_pred = (0.5 * gb_val_pred + 0.3 * rf_val_pred + 0.2 * ridge_val_pred)

validation_reports = (
    ("\nGradient Boosting:", gb_val_pred),
    ("\nRandom Forest:", rf_val_pred),
    ("\nRidge Regression:", ridge_val_pred),
    ("\nEnsemble (50% GB + 30% RF + 20% Ridge):", ensemble_val_pred),
)
for heading, val_pred in validation_reports:
    print(heading)
    print(f"  MAE: {mean_absolute_error(y_val, val_pred):.4f}")
    # RMSE is the square root of the mean squared error.
    print(f"  RMSE: {np.sqrt(mean_squared_error(y_val, val_pred)):.4f}")
    print(f"  R2: {r2_score(y_val, val_pred):.4f}")


# Refit every model on all labelled rows before predicting the test set.
section_rule = "=" * 50
print("\n" + section_rule)
print("RETRAINING ON FULL DATASET")
print(section_rule)

for fitted_model in (gb_model, rf_model, ridge_model):
    fitted_model.fit(X_train_full, y_train_full)


print("\n" + section_rule)
print("GENERATING TEST PREDICTIONS")
print(section_rule)

gb_test_pred = gb_model.predict(X_test)
rf_test_pred = rf_model.predict(X_test)
ridge_test_pred = ridge_model.predict(X_test)

# Same 50/30/20 blend as used on the validation split.
ensemble_test_pred = (0.5 * gb_test_pred + 0.3 * rf_test_pred + 0.2 * ridge_test_pred)

# Floor the blended predictions at zero.
ensemble_test_pred = np.maximum(0, ensemble_test_pred)

print("\nPrediction statistics:")
lowest = ensemble_test_pred.min()
highest = ensemble_test_pred.max()
average = ensemble_test_pred.mean()
middle = np.median(ensemble_test_pred)
print(f"  Min: {lowest:.2f}")
print(f"  Max: {highest:.2f}")
print(f"  Mean: {average:.2f}")
print(f"  Median: {middle:.2f}")



# Pair each test service key with its blended prediction and write the
# submission file.
submission = pd.DataFrame({
    'service_key': test_features['service_key'],
    'final_service_units': ensemble_test_pred
})

# Single source of truth for the output path, used for both the write and the
# status message below.
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)

print("\n" + "="*50)
print("SUBMISSION FILE CREATED")
print("="*50)
print(f"\nFile: {submission_path}")
print(f"Rows: {len(submission)}")
print("\nFirst 10 predictions:")
print(submission.head(10))
# The check mark was stored as mojibake ("âœ“": UTF-8 bytes read as cp1252);
# restored to the intended character.
print("\n✓ Training complete! Ready for submission.")