In [2]:
import pandas as pd

df = pd.read_csv(r"C:\Users\HP\Downloads\data.csv")

In [3]:
df['Workload_Stress_Index'] = df['Training_Hours_Per_Week'] * df['Stress_Level_Score']
df['Sleep_Efficiency'] = df['Sleep_Hours_Per_Night'] / df['Training_Hours_Per_Week']
df['Recovery_Score'] = (
    0.4 * df['Sleep_Hours_Per_Night'] +
    0.4 * (df['Nutrition_Quality_Score'] / 100) +
    0.2 * df['Warmup_Routine_Adherence']
)
df['Injury_Risk_Score'] = (
    df['Previous_Injury_Count'] * 2 -
    df['Hamstring_Flexibility'] * 0.1 -
    df['Balance_Test_Score'] * 0.1 +
    df['Stress_Level_Score'] * 0.2
)

In [4]:
df.head()

Unnamed: 0,Age,Height_cm,Weight_kg,Position,Training_Hours_Per_Week,Matches_Played_Past_Season,Previous_Injury_Count,Knee_Strength_Score,Hamstring_Flexibility,Reaction_Time_ms,...,Sleep_Hours_Per_Night,Stress_Level_Score,Nutrition_Quality_Score,Warmup_Routine_Adherence,Injury_Next_Season,BMI,Workload_Stress_Index,Sleep_Efficiency,Recovery_Score,Injury_Risk_Score
0,22,173,64,Midfielder,11.575308,36,1,77.460279,79.115738,284.487853,...,8.238293,46.616415,81.472206,1,0,21.383942,539.599365,0.711713,3.821206,-5.709538
1,18,170,67,Midfielder,12.275869,37,2,72.634442,82.541688,250.579249,...,8.983737,49.368037,81.056677,1,0,23.183391,606.035574,0.731821,4.117722,-3.109969
2,22,186,75,Forward,12.254896,12,2,77.06449,75.943631,269.119918,...,7.229193,43.132808,64.877457,0,1,21.678807,528.588062,0.589902,3.151187,-3.31187
3,20,172,62,Defender,9.006678,11,1,82.810232,73.878324,226.376412,...,7.681029,51.528529,89.824744,1,0,20.957274,464.10085,0.852815,3.63171,-3.841316
4,18,172,94,Midfielder,12.683668,10,2,76.772859,76.653043,229.021042,...,6.728091,52.379718,71.569197,0,1,31.773932,664.366932,0.530453,2.977513,-1.501877


In [5]:
import numpy as np

n_weeks = 4
players = df.copy()
player_ids = range(len(players))  # simulate player IDs

ts_data = []

for player_id, (_, row) in zip(player_ids, players.iterrows()):
    base_load = row['Training_Hours_Per_Week']
    for week in range(1, n_weeks + 1):
        fluctuation = np.random.normal(loc=1.0, scale=0.1)  # small random change
        session_load = base_load * fluctuation
        ts_data.append({
            'player_id': player_id,
            'week': week,
            'session_load': session_load
        })

df_sessions = pd.DataFrame(ts_data)
df_sessions.sort_values(by=['player_id', 'week'], inplace=True)

acute_window = 1       # Acute load: current week
chronic_window = 3     # Chronic load: past 3 weeks

df_sessions['acute_load'] = df_sessions.groupby('player_id')['session_load']\
    .transform(lambda x: x.rolling(window=acute_window, min_periods=1).mean())

df_sessions['chronic_load'] = df_sessions.groupby('player_id')['session_load']\
    .transform(lambda x: x.rolling(window=chronic_window, min_periods=1).mean())

df_sessions['ACWR'] = df_sessions['acute_load'] / df_sessions['chronic_load']
df_sessions['ACWR'].replace([np.inf, -np.inf], np.nan, inplace=True)
df_sessions['ACWR'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_sessions['ACWR'].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_sessions['ACWR'].fillna(0, inplace=True)


In [6]:
last_week_acwr = df_sessions[df_sessions['week'] == n_weeks][['player_id', 'ACWR']].reset_index(drop=True)

df_model = df.copy()

df_model['player_id'] = range(len(df_model))

df_model = df_model.merge(last_week_acwr, on='player_id', how='left')


In [7]:
df_model['player_id'] = range(len(df_model))
df_model = df_model.merge(last_week_acwr, on='player_id', how='left')

In [8]:
features = [
    'Training_Hours_Per_Week',
    'Stress_Level_Score',
    'Workload_Stress_Index',
    'Sleep_Efficiency',
    'Recovery_Score',
    'Injury_Risk_Score',
    'ACWR_x',
    'ACWR_y'
]
target = 'Injury_Next_Season'

In [9]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X = df_model[features]
y = df_model[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [10]:
df_model.head()

Unnamed: 0,Age,Height_cm,Weight_kg,Position,Training_Hours_Per_Week,Matches_Played_Past_Season,Previous_Injury_Count,Knee_Strength_Score,Hamstring_Flexibility,Reaction_Time_ms,...,Warmup_Routine_Adherence,Injury_Next_Season,BMI,Workload_Stress_Index,Sleep_Efficiency,Recovery_Score,Injury_Risk_Score,player_id,ACWR_x,ACWR_y
0,22,173,64,Midfielder,11.575308,36,1,77.460279,79.115738,284.487853,...,1,0,21.383942,539.599365,0.711713,3.821206,-5.709538,0,1.040417,1.040417
1,18,170,67,Midfielder,12.275869,37,2,72.634442,82.541688,250.579249,...,1,0,23.183391,606.035574,0.731821,4.117722,-3.109969,1,0.920421,0.920421
2,22,186,75,Forward,12.254896,12,2,77.06449,75.943631,269.119918,...,0,1,21.678807,528.588062,0.589902,3.151187,-3.31187,2,0.952107,0.952107
3,20,172,62,Defender,9.006678,11,1,82.810232,73.878324,226.376412,...,1,0,20.957274,464.10085,0.852815,3.63171,-3.841316,3,1.002575,1.002575
4,18,172,94,Midfielder,12.683668,10,2,76.772859,76.653043,229.021042,...,0,1,31.773932,664.366932,0.530453,2.977513,-1.501877,4,1.056202,1.056202


In [11]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      0.91      0.90        80
           1       0.91      0.88      0.89        80

    accuracy                           0.89       160
   macro avg       0.89      0.89      0.89       160
weighted avg       0.89      0.89      0.89       160



In [12]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier, StackingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [13]:
lr = LogisticRegression(max_iter = 3000, random_state = 1)
lr.fit(X_train, y_train)
print(classification_report(y_test, lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89        80
           1       0.89      0.89      0.89        80

    accuracy                           0.89       160
   macro avg       0.89      0.89      0.89       160
weighted avg       0.89      0.89      0.89       160



In [14]:
abc = AdaBoostClassifier()
abc_model = abc.fit(X_train, y_train)
abc_model
print(classification_report(y_test, abc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89        80
           1       0.89      0.90      0.89        80

    accuracy                           0.89       160
   macro avg       0.89      0.89      0.89       160
weighted avg       0.89      0.89      0.89       160



In [15]:
bc = BaggingClassifier()
bc_model = bc.fit(X_train, y_train)
bc_model
print(classification_report(y_test, bc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85        80
           1       0.86      0.84      0.85        80

    accuracy                           0.85       160
   macro avg       0.85      0.85      0.85       160
weighted avg       0.85      0.85      0.85       160



In [16]:
lr = LogisticRegression(max_iter = 5000, random_state = 1)
nb = GaussianNB()
dtc = DecisionTreeClassifier()
vc = VotingClassifier(estimators = [('tree', dtc), ('linear', lr), ('naive', nb)])
vc_model = vc.fit(X_train, y_train)
vc_model
print(classification_report(y_test, vc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86        80
           1       0.86      0.86      0.86        80

    accuracy                           0.86       160
   macro avg       0.86      0.86      0.86       160
weighted avg       0.86      0.86      0.86       160



In [17]:
sc = StackingClassifier(estimators = [('tree', dtc), ('linear', lr), ('naive', nb)])
sc_model = sc.fit(X_train, y_train)
sc_model
print(classification_report(y_test, sc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88        80
           1       0.87      0.90      0.88        80

    accuracy                           0.88       160
   macro avg       0.88      0.88      0.88       160
weighted avg       0.88      0.88      0.88       160



In [18]:
etsc = ExtraTreesClassifier()
etsc_model = etsc.fit(X_train, y_train)
etsc_model
print(classification_report(y_test, etsc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90        80
           1       0.92      0.86      0.89        80

    accuracy                           0.89       160
   macro avg       0.90      0.89      0.89       160
weighted avg       0.90      0.89      0.89       160



In [19]:
gbc = GradientBoostingClassifier()
gbc_model = gbc.fit(X_train, y_train)
gbc_model
print(classification_report(y_test, gbc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.90      0.91      0.91        80
           1       0.91      0.90      0.91        80

    accuracy                           0.91       160
   macro avg       0.91      0.91      0.91       160
weighted avg       0.91      0.91      0.91       160



In [20]:
hgbc = HistGradientBoostingClassifier()
hgbc_model = hgbc.fit(X_train, y_train)
hgbc_model
print(classification_report(y_test, hgbc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86        80
           1       0.87      0.84      0.85        80

    accuracy                           0.86       160
   macro avg       0.86      0.86      0.86       160
weighted avg       0.86      0.86      0.86       160



In [21]:
dtc = DecisionTreeClassifier(criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    class_weight=None,
    ccp_alpha=0.0,
    monotonic_cst=None,)
dtc_model = dtc.fit(X_train, y_train)
dtc_model
print(classification_report(y_test, dtc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.84      0.79      0.81        80
           1       0.80      0.85      0.82        80

    accuracy                           0.82       160
   macro avg       0.82      0.82      0.82       160
weighted avg       0.82      0.82      0.82       160



In [22]:
etc = ExtraTreeClassifier()
etc_model = etc.fit(X_train, y_train)
etc_model
print(classification_report(y_test, etc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.77      0.76      0.77        80
           1       0.77      0.78      0.77        80

    accuracy                           0.77       160
   macro avg       0.77      0.77      0.77       160
weighted avg       0.77      0.77      0.77       160



In [23]:
nb_model = nb.fit(X_train, y_train)
nb_model
print(classification_report(y_test, nb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85        80
           1       0.87      0.81      0.84        80

    accuracy                           0.84       160
   macro avg       0.85      0.84      0.84       160
weighted avg       0.85      0.84      0.84       160



In [24]:
knc = KNeighborsClassifier()
knc_model = knc.fit(X_train, y_train)
knc_model
print(classification_report(y_test, knc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.79      0.69      0.73        80
           1       0.72      0.81      0.76        80

    accuracy                           0.75       160
   macro avg       0.75      0.75      0.75       160
weighted avg       0.75      0.75      0.75       160



In [25]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.fit_transform(X_test)

In [26]:
X_train_scaled

array([[-0.89709414, -0.48216937, -0.91733414, ..., -0.4312675 ,
        -0.54084945, -0.54084945],
       [-0.04003458, -0.83255123, -0.53419407, ..., -1.05889122,
         0.50787888,  0.50787888],
       [-1.07702458, -0.31408278, -0.96854791, ..., -1.14309396,
         1.79494024,  1.79494024],
       ...,
       [ 0.27746301,  2.34427831,  1.77102571, ...,  1.70826951,
         0.83652029,  0.83652029],
       [ 0.4444138 ,  2.69069788,  2.20294778, ...,  1.03809169,
         0.64289176,  0.64289176],
       [-0.63029431,  0.71496799, -0.11661137, ...,  0.5943815 ,
        -0.93393125, -0.93393125]])

In [27]:
from sklearn.decomposition import PCA
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)

In [28]:
pca = PCA(n_components = 8)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled) 

In [29]:
X_train_pca.shape

(640, 8)

In [30]:
lr = LogisticRegression(max_iter = 3000, random_state = 1)
lr.fit(X_train_pca, y_train)
print(classification_report(y_test, lr.predict(X_test_pca)))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87        80
           1       0.86      0.90      0.88        80

    accuracy                           0.88       160
   macro avg       0.88      0.88      0.87       160
weighted avg       0.88      0.88      0.87       160



In [31]:
df.shape

(800, 23)

In [32]:
# # 1. Check class balance
# df['Injury_Next_Season'].value_counts()

# 2. How did you split?
# Ideally:
from sklearn.model_selection import train_test_split
X1 = df.drop('Injury_Next_Season', axis=1)
y = df['Injury_Next_Season']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# # 3. Did you oversample with SMOTE or similar? If yes, avoid applying it to the test set.
# model = RandomForestClassifier()
# model.fit(X_train, y_train)

# # 4. Print classification report from test set only
# from sklearn.metrics import classification_report
# print(classification_report(y_test, model.predict(X_test)))

# from sklearn.model_selection import cross_val_score
# from sklearn.ensemble import RandomForestClassifier

# model = RandomForestClassifier()
# scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro')
# print("CV F1-macro scores:", scores)
# print("Mean F1-macro:", scores.mean())

import pandas as pd

# Automatically convert all string columns into one-hot encoded columns
X_encoded = pd.get_dummies(X1)

# Then split and train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, stratify=y, random_state=42
)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94        80
           1       0.95      0.93      0.94        80

    accuracy                           0.94       160
   macro avg       0.94      0.94      0.94       160
weighted avg       0.94      0.94      0.94       160



In [35]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)