In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from scipy.stats import randint

# === LOAD YOUR DATA ===
df = pd.read_csv(r"C:\Users\HP\Downloads\data.csv")

# === FEATURE ENGINEERING ===
df['Workload_Stress_Index'] = df['Training_Hours_Per_Week'] * df['Stress_Level_Score']
df['Sleep_Efficiency'] = df['Sleep_Hours_Per_Night'] / df['Training_Hours_Per_Week']
df['Recovery_Score'] = (
    0.4 * df['Sleep_Hours_Per_Night'] +
    0.4 * (df['Nutrition_Quality_Score'] / 100) +
    0.2 * df['Warmup_Routine_Adherence']
)
df['Injury_Risk_Score'] = (
    df['Previous_Injury_Count'] * 2 -
    df['Hamstring_Flexibility'] * 0.1 -
    df['Balance_Test_Score'] * 0.1 +
    df['Stress_Level_Score'] * 0.2
)

# === ACWR SIMULATION ===
n_weeks = 4
df['player_id'] = range(len(df))
ts_data = []

for player_id, (_, row) in zip(range(len(df)), df.iterrows()):
    base_load = row['Training_Hours_Per_Week']
    for week in range(1, n_weeks + 1):
        fluctuation = np.random.normal(loc=1.0, scale=0.1)
        session_load = base_load * fluctuation
        ts_data.append({
            'player_id': player_id,
            'week': week,
            'session_load': session_load
        })

df_sessions = pd.DataFrame(ts_data)
df_sessions.sort_values(by=['player_id', 'week'], inplace=True)

df_sessions['acute_load'] = df_sessions.groupby('player_id')['session_load']\
    .transform(lambda x: x.rolling(window=1, min_periods=1).mean())
df_sessions['chronic_load'] = df_sessions.groupby('player_id')['session_load']\
    .transform(lambda x: x.rolling(window=3, min_periods=1).mean())
df_sessions['ACWR'] = df_sessions['acute_load'] / df_sessions['chronic_load']
df_sessions['ACWR'].replace([np.inf, -np.inf], np.nan, inplace=True)
df_sessions['ACWR'].fillna(0, inplace=True)

last_week_acwr = df_sessions[df_sessions['week'] == n_weeks][['player_id', 'ACWR']].reset_index(drop=True)

# Merge ACWR into main dataset
df_model = df.merge(last_week_acwr, on='player_id', how='left')

# === FEATURES & TARGET ===
features = [
    'Training_Hours_Per_Week',
    'Stress_Level_Score',
    'Workload_Stress_Index',
    'Sleep_Efficiency',
    'Recovery_Score',
    'Injury_Risk_Score',
    'ACWR'
]
target = 'Injury_Next_Season'

X = df_model[features]
y = df_model[target]

# === SPLIT ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === HYPERPARAMETER GRID ===
param_dist = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# === RANDOMIZED SEARCH ===
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='f1',
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

# === EVALUATE BEST MODEL ===
best_rf = random_search.best_estimator_
y_pred_best = best_rf.predict(X_test)
y_proba_best = best_rf.predict_proba(X_test)[:, 1]

print("Best Parameters:")
print(random_search.best_params_)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_best))

print(f"\nROC-AUC Score: {roc_auc_score(y_test, y_proba_best):.3f}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_sessions['ACWR'].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_sessions['ACWR'].fillna(0, inplace=True)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


95 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
34 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\HP\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\HP\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\HP\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_const

Best Parameters:
{'bootstrap': True, 'max_depth': 9, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 109}

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89        80
           1       0.91      0.86      0.88        80

    accuracy                           0.89       160
   macro avg       0.89      0.89      0.89       160
weighted avg       0.89      0.89      0.89       160


ROC-AUC Score: 0.967
