# 🏃 Half Marathon Time Prediction - Training Pipeline
## Professional ML Pipeline with Digital Ocean Spaces Integration

In [None]:
# Install required packages
# !pip install boto3 pandas numpy scikit-learn xgboost joblib python-dotenv

In [1]:
import pandas as pd
import numpy as np
import boto3
import joblib
import os
from io import BytesIO, StringIO
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully")

✅ Libraries imported successfully


## 🔐 Configuration & Digital Ocean Connection

In [2]:
# Spaces connection settings. Every value is taken from the environment and
# falls back to the literal default given here when the variable is not set.
DO_SPACES_KEY = os.environ.get('DO_SPACES_KEY', 'your_access_key')
DO_SPACES_SECRET = os.environ.get('DO_SPACES_SECRET', 'your_secret_key')
DO_SPACES_REGION = os.environ.get('DO_SPACES_REGION', 'fra1')
DO_SPACES_BUCKET = os.environ.get('DO_SPACES_BUCKET', 'halfmarathon-ml')

# The endpoint host is derived from the region name.
DO_SPACES_ENDPOINT = f'https://{DO_SPACES_REGION}.digitaloceanspaces.com'

# Spaces is accessed through the S3 API, so a plain boto3 S3 client is
# pointed at the Spaces endpoint with the credentials read above.
s3_client = boto3.client(
    's3',
    aws_access_key_id=DO_SPACES_KEY,
    aws_secret_access_key=DO_SPACES_SECRET,
    endpoint_url=DO_SPACES_ENDPOINT,
    region_name=DO_SPACES_REGION,
)

print(f"✅ Connected to Digital Ocean Spaces: {DO_SPACES_BUCKET}")

✅ Connected to Digital Ocean Spaces: half-marathon-app


## 📥 Data Loading from Digital Ocean Spaces

In [None]:
def load_csv_from_spaces(filename):
    """Read a semicolon-separated CSV object from the Spaces bucket.

    Args:
        filename: Object key inside ``DO_SPACES_BUCKET``. It is used as-is,
            i.e. without a ``data/`` prefix.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when anything goes wrong
        (the error is printed instead of being raised).
    """
    try:
        response = s3_client.get_object(Bucket=DO_SPACES_BUCKET, Key=filename)
        # The object body is downloaded in full and decoded as UTF-8 text.
        csv_text = response['Body'].read().decode('utf-8')
        df = pd.read_csv(StringIO(csv_text), sep=';')
        print(f"✅ Loaded {filename}: {len(df)} rows, {len(df.columns)} columns")
        return df
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"❌ Error loading {filename}: {e}")
        return None

# Load datasets
df_2023 = load_csv_from_spaces('halfmarathon_wroclaw_2023__final.csv')
df_2024 = load_csv_from_spaces('halfmarathon_wroclaw_2024__final.csv')

# load_csv_from_spaces returns None on failure and pd.concat silently drops
# None entries, so without this check a failed download would quietly train
# the model on a single year. Fail fast and name the missing file(s) instead.
missing_datasets = [
    label
    for label, frame in (('2023', df_2023), ('2024', df_2024))
    if frame is None
]
if missing_datasets:
    raise RuntimeError(
        f"Could not load dataset(s) for: {', '.join(missing_datasets)}. "
        "Check the Spaces credentials, bucket name and object keys."
    )

# Combine datasets
df = pd.concat([df_2023, df_2024], ignore_index=True)
print(f"\n📊 Total combined dataset: {len(df)} rows")
df.head()

## 🧹 Data Cleaning & Feature Engineering

In [None]:
def convert_time_to_seconds(time_str):
    """Convert a time in ``HH:MM:SS`` format to a number of seconds.

    Args:
        time_str: Value to parse. Missing values (per ``pd.isnull``), the
            markers ``'DNS'`` / ``'DNF'`` and empty strings mean "no time".
            Surrounding whitespace is ignored.

    Returns:
        The time as an ``int`` number of seconds, or ``None`` when the value
        is missing, is a DNS/DNF marker, does not have exactly three
        colon-separated parts, or has a part that is not an integer.
    """
    if pd.isnull(time_str):
        return None

    text = str(time_str).strip()
    if text in ('DNS', 'DNF', ''):
        return None

    parts = text.split(':')
    if len(parts) != 3:
        return None

    # Only a failed integer conversion is an expected parsing error; anything
    # else (e.g. KeyboardInterrupt) must propagate rather than be swallowed.
    try:
        hours, minutes, seconds = (int(part) for part in parts)
    except ValueError:
        return None
    return hours * 3600 + minutes * 60 + seconds

def calculate_age(birth_year, reference_year=None):
    """Calculate age from birth year.

    Args:
        birth_year: Year of birth; anything ``int()`` accepts (int, float,
            numeric string).
        reference_year: Year the age is computed for. Defaults to the current
            calendar year, which matches the previous behaviour. Pass the race
            year to get an age that does not depend on when the notebook runs.

    Returns:
        ``reference_year - int(birth_year)``, or ``None`` when ``birth_year``
        cannot be converted to an integer (None, NaN, infinity, text).
    """
    if reference_year is None:
        reference_year = datetime.now().year
    try:
        return reference_year - int(birth_year)
    except (TypeError, ValueError, OverflowError):
        # int() raises TypeError for None, ValueError for NaN / bad strings
        # and OverflowError for infinity; all of them mean "age unknown".
        return None

print("🔄 Starting data cleaning...")

# Add a "<column>_seconds" companion for each split / finish time column
# that exists in the combined frame; absent columns are skipped.
time_columns = ['5 km Czas', '10 km Czas', '15 km Czas', '20 km Czas', 'Czas']
for col in time_columns:
    if col not in df.columns:
        continue
    df[f'{col}_seconds'] = df[col].apply(convert_time_to_seconds)

# Age ('Wiek') is derived from the birth year column ('Rocznik').
if 'Rocznik' in df.columns:
    df['Wiek'] = df['Rocznik'].apply(calculate_age)

# Gender ('Płeć') is label-encoded; missing values are filled with 'M' first.
# The fitted encoder is written to disk so it can be shipped with the model.
if 'Płeć' in df.columns:
    le_gender = LabelEncoder()
    gender_filled = df['Płeć'].fillna('M')
    df['Płeć_encoded'] = le_gender.fit_transform(gender_filled)
    joblib.dump(le_gender, 'gender_encoder.pkl')

# Keep only rows that have a finish time (the target variable) ...
has_finish_time = df['Czas_seconds'].notna()
df_clean = df[has_finish_time].copy()

# ... and whose finish time lies between 1 hour and 4 hours (inclusive).
df_clean = df_clean[df_clean['Czas_seconds'].between(3600, 14400)]

print(f"✅ Data cleaning complete: {len(df_clean)} valid rows")
print(f"📉 Removed {len(df) - len(df_clean)} invalid/outlier rows")

## 🎯 Feature Selection & Engineering

In [None]:
# Mandatory model inputs: encoded gender, age and the 5 km split (time + pace).
feature_cols = ['Płeć_encoded', 'Wiek', '5 km Czas_seconds', '5 km Tempo']

# Optional inputs are appended only when the cleaned frame contains them.
# NOTE(review): the prediction demo at the end of the notebook fills features
# it does not supply with 0 - confirm these columns are really available
# at inference time.
optional_features = ['10 km Tempo', '15 km Tempo', 'Tempo Stabilność']
feature_cols += [feat for feat in optional_features if feat in df_clean.columns]

target_col = 'Czas_seconds'

# Modelling frame: the selected columns plus the target, without any row
# that has a missing value in one of them.
df_model = df_clean[[*feature_cols, target_col]].dropna()

X = df_model[feature_cols]
y = df_model[target_col]

print(f"📊 Feature matrix shape: {X.shape}")
print(f"📋 Features used: {feature_cols}")
# The target is stored in seconds; the summary is reported in minutes.
print(f"\n📈 Target variable statistics:")
print(f"   Mean: {y.mean()/60:.2f} minutes")
print(f"   Median: {y.median()/60:.2f} minutes")
print(f"   Std: {y.std()/60:.2f} minutes")

## 🔀 Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"✅ Train set: {X_train.shape[0]} samples")
print(f"✅ Test set: {X_test.shape[0]} samples")

## 🤖 Model Training & Hyperparameter Tuning

In [None]:
print("🎯 Training XGBoost model with hyperparameter tuning...\n")

# Define parameter grid for XGBoost
# 3 x 3 x 3 x 3 = 81 candidates; with 5-fold CV that is 405 fits plus the
# final refit of the best candidate.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 0.9, 1.0],
}

# Initialize XGBoost
# The estimator is kept single-threaded because the grid search below already
# fans out over all cores. Running both levels with n_jobs=-1 oversubscribes
# the CPU and slows the search down through thread contention.
xgb_model = XGBRegressor(
    random_state=42,
    n_jobs=1,
    objective='reg:squarederror'
)

# Grid search with cross-validation
# Candidates are ranked by (negated) mean absolute error, in seconds.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print(f"\n✅ Best parameters: {grid_search.best_params_}")
# best_score_ is the negated MAE in seconds; flip the sign and show minutes.
print(f"✅ Best CV MAE: {-grid_search.best_score_/60:.2f} minutes")

# Use best model
best_model = grid_search.best_estimator_

## 📊 Model Evaluation

In [None]:
# Score both partitions with the tuned model (training partition first).
y_pred_train, y_pred_test = (
    best_model.predict(features) for features in (X_train, X_test)
)

# Calculate metrics
def evaluate_model(y_true, y_pred, dataset_name):
    """Print and return MAE, RMSE and R² for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.
        dataset_name: Label shown in the heading of the printed report.

    Returns:
        Dict with keys ``'mae'``, ``'rmse'`` and ``'r2'``. MAE and RMSE are in
        the unit of the target; the report additionally shows them divided
        by 60 as minutes.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    report_lines = (
        f"\n📊 {dataset_name} Metrics:",
        f"   MAE: {mae/60:.2f} minutes ({mae:.0f} seconds)",
        f"   RMSE: {rmse/60:.2f} minutes ({rmse:.0f} seconds)",
        f"   R² Score: {r2:.4f}",
    )
    for line in report_lines:
        print(line)

    return dict(mae=mae, rmse=rmse, r2=r2)

# Report the same metrics on both partitions; the gap between them shows how
# well the model generalises.
train_metrics = evaluate_model(y_train, y_pred_train, "Training Set")
test_metrics = evaluate_model(y_test, y_pred_test, "Test Set")

# Importance of each input as reported by the fitted model, largest first.
print("\n🎯 Feature Importance:")
feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': best_model.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
print(feature_importance.to_string(index=False))

## 💾 Save Model Locally & Upload to Digital Ocean Spaces

In [None]:
# Describe this training run. The version is a timestamp taken at save time
# and is reused in the artifact file names below.
version_tag = datetime.now().strftime('%Y%m%d_%H%M%S')
model_metadata = dict(
    model_type='XGBoost',
    version=version_tag,
    features=feature_cols,
    train_samples=len(X_train),
    test_samples=len(X_test),
    metrics=dict(train=train_metrics, test=test_metrics),
    best_params=grid_search.best_params_,
)

# Versioned file names for the model and its metadata.
model_filename = f"halfmarathon_model_{model_metadata['version']}.pkl"
metadata_filename = f"model_metadata_{model_metadata['version']}.pkl"

# Both artifacts are serialized with joblib into the working directory.
for artifact, target_path in (
    (best_model, model_filename),
    (model_metadata, metadata_filename),
):
    joblib.dump(artifact, target_path)

print(f"✅ Model saved locally: {model_filename}")
print(f"✅ Metadata saved locally: {metadata_filename}")

# Upload to Digital Ocean Spaces
def upload_to_spaces(local_file, spaces_key):
    """Upload a local file to the bucket under the ``models/`` prefix.

    Args:
        local_file: Path of the file on disk.
        spaces_key: Object name; it is stored as ``models/<spaces_key>`` with
            a private ACL.

    Returns:
        ``True`` when the upload succeeded, ``False`` when it failed. Failures
        are printed rather than raised, so callers that ignore the return
        value behave as before.
    """
    try:
        s3_client.upload_file(
            local_file,
            DO_SPACES_BUCKET,
            f'models/{spaces_key}',
            ExtraArgs={'ACL': 'private'}
        )
    except Exception as e:
        print(f"❌ Upload error: {e}")
        return False
    print(f"✅ Uploaded to Spaces: models/{spaces_key}")
    return True

# (local file, object name) pairs, uploaded in this order. The last two
# entries also publish the current artifacts under fixed "latest" names.
uploads = [
    (model_filename, model_filename),
    (metadata_filename, metadata_filename),
    ('gender_encoder.pkl', 'gender_encoder.pkl'),
    (model_filename, 'halfmarathon_model_latest.pkl'),
    (metadata_filename, 'model_metadata_latest.pkl'),
]

# Every upload is attempted even if an earlier one fails; the failed object
# names are collected so the final message does not claim success wrongly.
failed_uploads = [
    spaces_key
    for local_file, spaces_key in uploads
    if not upload_to_spaces(local_file, spaces_key)
]

if failed_uploads:
    print(f"\n⚠️ Training finished, but {len(failed_uploads)} upload(s) failed: {failed_uploads}")
else:
    print("\n🎉 Training pipeline completed successfully!")

## 🧪 Test Predictions

In [None]:
# Three hand-written runner profiles: (encoded gender, age, 5 km time in
# seconds, 5 km pace).
sample_profiles = [
    (1, 30, 1200, 4.0),
    (0, 25, 1500, 5.0),
    (1, 45, 1350, 4.5),
]
test_cases = [
    {'Płeć_encoded': gender_code, 'Wiek': age, '5 km Czas_seconds': split_seconds, '5 km Tempo': pace}
    for gender_code, age, split_seconds, pace in sample_profiles
]

print("🧪 Test Predictions:\n")
for i, case in enumerate(test_cases, 1):
    # Any model feature the profile does not provide is filled with 0, then
    # the columns are put into the order the model was trained with.
    # NOTE(review): 0 is a placeholder, not a realistic value for these
    # features - confirm this is the intended inference-time behaviour.
    padding = {name: 0 for name in feature_cols if name not in case}
    test_df = pd.DataFrame([case]).assign(**padding)[feature_cols]

    # The model predicts seconds; render them as H:MM:SS.
    prediction_seconds = best_model.predict(test_df)[0]
    prediction_time = f"{int(prediction_seconds//3600)}:{int((prediction_seconds%3600)//60):02d}:{int(prediction_seconds%60):02d}"

    # Display label only; presumes the encoder mapped men to 1 - TODO confirm
    # against the fitted gender encoder.
    if case['Płeć_encoded'] == 1:
        gender = 'Mężczyzna'
    else:
        gender = 'Kobieta'
    print(f"Test case {i}:")
    print(f"   {gender}, {case['Wiek']} lat, 5km: {case['5 km Czas_seconds']//60}:{case['5 km Czas_seconds']%60:02d}")
    print(f"   Predicted half-marathon time: {prediction_time}\n")