In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# -----------------------------
# 1️⃣ Load cleaned data
# -----------------------------
df_scores = pd.read_excel(r"C:\Users\LOQ\Downloads\projet_power_pi\scores_2019_cleaned_dataset.xlsx")
df_athletes = pd.concat([
    pd.read_excel(r"C:\Users\LOQ\Downloads\projet_power_pi\athletes_women_2019_cleaned (1).xlsx"),
    pd.read_excel(r"C:\Users\LOQ\Downloads\projet_power_pi\athletes_men_2019_cleaned (1).xlsx")
])

# -----------------------------
# 2️⃣ Merge datasets
# -----------------------------
df = df_scores.merge(df_athletes, on='competitorid', how='left')

# -----------------------------
# 3️⃣ Compute past performance metrics
# -----------------------------
past_perf = df.groupby('competitorid').agg(
    avg_rank=('rank', 'mean'),
    avg_points=('points', 'mean'),
    top10_finishes=('rank', lambda x: (x <= 10).sum())
).reset_index()

df = df.merge(past_perf, on='competitorid', how='left')

# -----------------------------
# 4️⃣ Select features & target
# -----------------------------
# 🎯 Target: predict points (performance per workout)
target = 'points'

# Features describing athlete characteristics and experience
features = ['age', 'weight', 'height', 'gender', 'division', 'avg_rank', 'avg_points']

df = df.dropna(subset=features + [target])  # Remove rows with missing values

X = df[features]
y = df[target]

# -----------------------------
# 5️⃣ Preprocessing
# -----------------------------
numeric_features = ['age', 'weight', 'height', 'avg_rank', 'avg_points']
categorical_features = ['gender', 'division']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# -----------------------------
# 6️⃣ Train-test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------------
# 7️⃣ Random Forest Pipeline
# -----------------------------
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        max_depth=12
    ))
])

# -----------------------------
# 8️⃣ Train model
# -----------------------------
model.fit(X_train, y_train)

# -----------------------------
# 9️⃣ Evaluate model
# -----------------------------
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R² Score: {r2:.4f}")

# -----------------------------
# 🔟 Predict new workout performance
# -----------------------------
new_athletes = pd.DataFrame([
    {'age': 28, 'weight': 70, 'height': 1.75, 'gender': 'MALE', 'division': 'Individual', 'avg_rank': 15, 'avg_points': 220},
    {'age': 32, 'weight': 60, 'height': 1.65, 'gender': 'FEMALE', 'division': 'Individual', 'avg_rank': 8, 'avg_points': 250},
    {'age': 40, 'weight': 80, 'height': 1.80, 'gender': 'MALE', 'division': 'Team', 'avg_rank': 30, 'avg_points': 180},
    {'age': 25, 'weight': 55, 'height': 1.60, 'gender': 'FEMALE', 'division': 'Team', 'avg_rank': 20, 'avg_points': 190}
])

predicted_points = model.predict(new_athletes)

for athlete, pred in zip(new_athletes.to_dict(orient='records'), predicted_points):
    print(f"Athlete: {athlete} --> Predicted workout points: {pred:.2f}")


Mean Squared Error (MSE): 0.01677846712945972
R² Score: 0.9922304203903664
Athlete: {'age': 28, 'weight': 70, 'height': 1.75, 'gender': 'MALE', 'division': 'Individual', 'avg_points': 220} --> Predicted top 10 finishes: 9
Athlete: {'age': 32, 'weight': 60, 'height': 1.65, 'gender': 'FEMALE', 'division': 'Individual', 'avg_points': 210} --> Predicted top 10 finishes: 11
Athlete: {'age': 40, 'weight': 80, 'height': 1.8, 'gender': 'MALE', 'division': 'Team', 'avg_points': 180} --> Predicted top 10 finishes: 10
Athlete: {'age': 25, 'weight': 55, 'height': 1.6, 'gender': 'FEMALE', 'division': 'Team', 'avg_points': 190} --> Predicted top 10 finishes: 10


In [2]:
import joblib

# After training your pipeline model
joblib.dump(model, "athlete_model.pkl")
print("Model saved!")


Model saved!


In [3]:
pip install flask


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\LOQ\Desktop\Summer Internship\venv310\Scripts\python.exe -m pip install --upgrade pip
