In [None]:
############
# LIBRARIES
############

import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler

# Notebook display settings: show every row/column, print floats with three
# decimals, and allow wide (500-character) output lines.
DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.float_format': '{:.3f}'.format,
    'display.width': 500,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [None]:
############
# READ DATA
############

# Source CSV locations (mounted Google Drive).
WORKLOAD_CSV = "/content/drive/MyDrive/Injury Prediction/game_workload.csv"
INJURIES_CSV = "/content/drive/MyDrive/Injury Prediction/injuries.csv"
METRICS_CSV = "/content/drive/MyDrive/Injury Prediction/metrics.csv"

# Load the three raw tables in the same order as before.
game_workload_df = pd.read_csv(WORKLOAD_CSV)
injuries_df = pd.read_csv(INJURIES_CSV)
metrics_df = pd.read_csv(METRICS_CSV)

In [None]:
#########
# ASTYPE
#########

# Give the join keys the same dtype in all three tables so the merges on
# ['athlete_id', 'date'] compare like with like.
source_frames = (game_workload_df, injuries_df, metrics_df)

# Parse the date strings into datetime64 values.
for frame in source_frames:
    frame['date'] = pd.to_datetime(frame['date'])

# Store athlete ids as generic Python objects (identifiers, not quantities).
for frame in source_frames:
    frame["athlete_id"] = frame["athlete_id"].astype(object)

In [None]:
#################
# DATASET MERGING
#################

# Outer join: keep athlete-days that appear in only one of the two sources.
merged_df = pd.merge(metrics_df, game_workload_df, on=['athlete_id', 'date'], how='outer')

# Tag every row of the injury log; after the outer join below, rows that did
# not come from the injury log have a missing tag and are labelled "non_injured".
injuries_df["injuries_status"] = "injured"
final_df = pd.merge(merged_df, injuries_df, on=['athlete_id', 'date'], how='outer')
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form is deprecated and, with pandas
# Copy-on-Write enabled, leaves final_df unchanged.
final_df['injuries_status'] = final_df['injuries_status'].fillna("non_injured")

# An earlier check (set(metrics_df['date']) - set(game_workload_df['date']))
# found 24 dates that have metrics but no workload record.
# game_workload is missing on those days because athletes did not train,
# so a missing workload is recorded as 0.
final_df['game_workload'] = final_df['game_workload'].fillna(0)

##############
# PIVOT TABLE
##############

def reshape_athlete_data(df):
    """Pivot long-format metric rows into one row per athlete/date key.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with the columns 'athlete_id', 'date', 'game_workload',
        'injuries_status', 'metric' and 'value'.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (athlete_id, date, game_workload, injuries_status)
        combination, with the columns 'athlete_id', 'date', 'hip_mobility',
        'groin_squeeze', 'game_workload', 'injuries_status' in that order.
        Repeated measurements of the same metric for one key are averaged
        (the pivot_table default aggregation).

    Raises
    ------
    KeyError
        If 'hip_mobility' or 'groin_squeeze' never occurs in df['metric'].
    """
    key_columns = ['athlete_id', 'date', 'game_workload', 'injuries_status']

    wide = df.pivot_table(index=key_columns, columns='metric', values='value')
    wide = wide.reset_index()
    # Drop the leftover "metric" label from the column axis.
    wide.columns.name = None

    output_columns = ['athlete_id', 'date', 'hip_mobility', 'groin_squeeze',
                      'game_workload', 'injuries_status']
    return wide[output_columns]

# Wide table (one row per athlete/date key) used by the feature-engineering steps.
pivot_df = reshape_athlete_data(final_df)

In [None]:
#####################
# FEATURE ENGINEERING
#####################

# Encode the label as a number: "non_injured" -> 0, "injured" -> 1.
# Series.map turns every value that is not a key of the dict into NaN, so
# re-running this line on the already-encoded column would blank it out.
pivot_df["injuries_status"] = pivot_df["injuries_status"].map({"non_injured": 0, "injured": 1})



def calculate_resting_days(group):
    """Return the running length of the current zero-workload streak.

    Walks group['game_workload'] in order. Each entry of the result is the
    number of consecutive zero-workload rows ending at that row, or 0 when the
    row's workload is not equal to 0.

    Parameters
    ----------
    group : mapping or pandas.DataFrame
        Anything where group['game_workload'] is iterable.

    Returns
    -------
    list of int
        One streak length per workload value, in input order.
    """
    streak = 0
    streak_lengths = []
    for load in group['game_workload']:
        # Extend the streak on a rest row, restart it on any other value.
        streak = streak + 1 if load == 0 else 0
        streak_lengths.append(streak)
    return streak_lengths

# Consecutive rest-day streak per athlete, in pivot_df's current row order.
# transform() returns values carrying pivot_df's own index, so each streak
# value lands on the row it was computed from. The previous
# apply -> explode -> reset_index(drop=True) chain discarded the index and was
# only correct while pivot_df had a default RangeIndex sorted by athlete_id.
pivot_df['resting'] = (
    pivot_df.groupby('athlete_id')['game_workload']
    .transform(
        lambda workloads: pd.Series(
            calculate_resting_days(workloads.to_frame()), index=workloads.index
        )
    )
    .astype(int)
)




def _rolling_by_athlete(df, values, window, how):
    """Trailing rolling aggregate of `values` within each athlete, aligned to df's rows."""
    rolled = values.groupby(df['athlete_id']).rolling(window, min_periods=1)
    # groupby-rolling prepends athlete_id to the index; dropping that level
    # restores df's own row labels so the result can be assigned back.
    return getattr(rolled, how)().reset_index(level=0, drop=True)


def create_injury_risk_features(df):
    """Add workload, mobility, injury-history and risk-score columns to `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'athlete_id', 'game_workload', 'hip_mobility',
        'groin_squeeze' and a numeric 'injuries_status' (non-zero = injured).
        Rows are processed in their current order within each athlete.

    Returns
    -------
    pandas.DataFrame
        The same object as `df` (it is modified in place) with the added
        columns workload_7d, acwr, workload_change, rest_days_7d, hip_trend,
        hip_change, groin_trend, groin_change, injuries_30d,
        days_since_injury, workload_risk and overall_risk.

    Notes
    -----
    * Windows are counted in rows, not calendar days; the "7d"/"28d"/"30d"
      names presumably assume one row per athlete per day -- TODO confirm.
    * Ratios and pct_change can produce inf/NaN (division by zero); they are
      left as-is for the caller to handle.
    * NOTE(review): every window includes the current row, so injuries_30d,
      days_since_injury and overall_risk encode the same-row injuries_status.
      If injuries_status is the prediction target, these features leak it --
      confirm against the modelling code.
    """
    athlete_groups = df.groupby('athlete_id')
    workload = df['game_workload']

    # Workload-Based Features
    df['workload_7d'] = _rolling_by_athlete(df, workload, 7, 'sum')

    # Acute/Chronic workload ratio (last-7-row mean / last-28-row mean)
    acute_load = _rolling_by_athlete(df, workload, 7, 'mean')
    chronic_load = _rolling_by_athlete(df, workload, 28, 'mean')
    df['acwr'] = acute_load / chronic_load

    # Workload change rate (row over row)
    df['workload_change'] = athlete_groups['game_workload'].pct_change()

    # Number of rest days (zero-workload rows) in the last 7 rows.
    # Count the days themselves: summing the 'resting' streak counter, as was
    # done before, adds 1+2+3+... and yields values far above 7, which made
    # the rest term of overall_risk negative.
    is_rest_day = workload.eq(0).astype(float)
    df['rest_days_7d'] = _rolling_by_athlete(df, is_rest_day, 7, 'sum')

    # Mobility trend analysis
    df['hip_trend'] = _rolling_by_athlete(df, df['hip_mobility'], 7, 'mean')
    df['hip_change'] = athlete_groups['hip_mobility'].pct_change()

    df['groin_trend'] = _rolling_by_athlete(df, df['groin_squeeze'], 7, 'mean')
    df['groin_change'] = athlete_groups['groin_squeeze'].pct_change()

    # Number of injuries in the last 30 rows
    df['injuries_30d'] = _rolling_by_athlete(df, df['injuries_status'], 30, 'sum')

    # Rows since the last injury (0 on an injury row). Before an athlete's
    # first injury this counts rows since the start of that athlete's data.
    # transform() keeps df's index whatever the pandas version, unlike
    # apply(...).reset_index(0, drop=True).
    df['days_since_injury'] = athlete_groups['injuries_status'].transform(
        lambda status: status.groupby(status.ne(0).cumsum()).cumcount()
    )

    # Workload risk score: z-score of workload_7d squashed into (0, 1).
    workload_spread = df['workload_7d'].std()
    if workload_spread > 0:
        workload_z = (df['workload_7d'] - df['workload_7d'].mean()) / workload_spread
    else:
        # Constant or single-row workload: the z-score is undefined, so use the
        # neutral value 0 (risk 0.5) instead of filling the column with NaN.
        workload_z = pd.Series(0.0, index=df.index)
    df['workload_risk'] = 1 / (1 + np.exp(-workload_z))

    # Overall risk score: equal-weight blend of five components.
    # clip(0, 1) presumably expects mobility scores scaled to [0, 1] -- TODO confirm.
    df['overall_risk'] = (
        df['workload_risk'] * 0.2 +
        (1 - df['hip_trend'].clip(0, 1)) * 0.2 +
        (1 - df['groin_trend'].clip(0, 1)) * 0.2 +
        (df['injuries_30d'] > 0).astype(float) * 0.2 +
        (1 - df['rest_days_7d'] / 7) * 0.2
    )

    return df


# Add the engineered workload, mobility, injury-history and risk-score columns.
pivot_df = create_injury_risk_features(pivot_df)



#def fill_missing_values(df):
    #numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    #for col in numeric_cols:
       # df[col] = df.groupby('athlete_id')[col].transform(lambda x: x.fillna(x.median()))
      #  df[col] = df[col].fillna(df[col].median())
   # return df

# Missing values
# Ratio / pct_change features produce +/-inf on division by zero; treat those as missing.
pivot_df = pivot_df.replace([np.inf, -np.inf], np.nan)
# Back-fill first, then forward-fill whatever is still missing at the end.
# .bfill()/.ffill() are the direct equivalents of fillna(method=...), which is
# deprecated and no longer accepted by newer pandas releases.
# NOTE(review): the fill runs over the whole frame in row order, so a gap at the
# edge of one athlete's rows is filled from the neighbouring athlete, and
# back-filling copies later observations into earlier rows -- confirm this is intended.
pivot_df = pivot_df.bfill().ffill()

In [None]:
########
# EXPORT
########

# Destination of the engineered dataset on the mounted Google Drive.
file_path = "/content/drive/MyDrive/Injury Prediction/final_data.csv"
# index=False keeps the DataFrame's row index out of the CSV.
pivot_df.to_csv(file_path, index=False)