In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE

class RunningDataset:
    """Load a wide-format running/injury time series and prepare it for modelling.

    Each row of the CSV is one window: ``WINDOW_DAYS`` repetitions of the
    ``base_metrics`` columns (suffixed ``.0`` .. ``.WINDOW_DAYS-1`` after loading)
    followed by the fixed columns ``Athlete ID``, ``injury`` and ``Date``.
    As implemented by ``long_form``/``wide_form``, suffix ``.k`` of a window
    dated ``D`` refers to day ``D - (WINDOW_DAYS - k)``.

    The class splits athletes into train/test groups, normalises metrics per
    athlete, resamples the training data and can reshape windows into tensors
    and Gramian Angular Summation Field (GASF) images.
    """

    # Fraction of distinct athletes assigned to the training split.
    TRAIN_FRACTION = 0.865

    def __init__(self, filename='day_approach_maskedID_timeseries.csv', random_state=None):
        """Read ``filename`` and create an initial train/test split.

        Parameters
        ----------
        filename : str
            Path of the CSV file to load.
        random_state : int or None
            Seed used for the athlete split and for row sampling. ``None``
            (the default) keeps using NumPy's global random state.
        """
        self.filename = filename
        self.random_state = random_state
        self.WINDOW_DAYS = 7
        self.base_metrics = ['nr. sessions', 'total km', 'km Z3-4', 'km Z5-T1-T2', 'km sprinting',
                             'strength training', 'hours alternative', 'perceived exertion',
                             'perceived trainingSuccess', 'perceived recovery']
        self.identifiers = ['Athlete ID', 'Date']
        self.class_name = 'injury'
        self.fixed_columns = ['Athlete ID', 'injury', 'Date']
        self.data_types_metrics = [float] * len(self.base_metrics)
        # One integer dtype per fixed column (previously sized from `identifiers`,
        # which left the last fixed column without a dtype).
        self.data_types_fixed_columns = [int] * len(self.fixed_columns)
        self.data = pd.read_csv(self.filename)
        # Append '.0' to the first block of metric columns so that every metric
        # column carries a numeric day suffix (presumably pandas names repeated
        # headers 'name', 'name.1', ... on load -- TODO confirm against the CSV).
        n_metrics = len(self.base_metrics)
        self.data.columns = [f"{col}.0" if i < n_metrics else col
                             for i, col in enumerate(self.data.columns)]
        self.standard_scaler = StandardScaler()
        self.min_max_scaler = MinMaxScaler()
        self.split_data()

    def _make_rng(self, random_state=None):
        """Return a ``RandomState`` for the effective seed, or ``None`` if unseeded."""
        seed = random_state if random_state is not None else getattr(self, 'random_state', None)
        return None if seed is None else np.random.RandomState(seed)

    def split_data(self, random_state=None):
        """Split ``self.data`` into ``self.train`` / ``self.test`` by athlete.

        Whole athletes are assigned to one side, so no athlete appears in both
        splits. ``random_state`` overrides the seed given to the constructor.
        """
        id_col = self.identifiers[0]
        all_ids = self.data[id_col].unique()
        rng = self._make_rng(random_state)
        chooser = np.random if rng is None else rng
        n_train = int(self.TRAIN_FRACTION * len(all_ids))
        train_ids = chooser.choice(all_ids, n_train, replace=False)
        test_ids = np.setdiff1d(all_ids, train_ids)
        self.train = self.data[self.data[id_col].isin(train_ids)]
        self.test = self.data[self.data[id_col].isin(test_ids)]

    def long_form(self, df):
        """Unpack wide windows into one row per (athlete, calendar day).

        Returns a frame with the fixed columns plus one column per base metric.
        Overlapping windows yield the same day several times; exactly one row
        per (athlete, day) is kept.
        """
        id_col, date_col = self.identifiers
        df_long = pd.wide_to_long(df, stubnames=self.base_metrics, i=self.fixed_columns,
                                  j='Offset', sep='.').reset_index()
        # Offset k of a window dated D describes day D - (WINDOW_DAYS - k).
        df_long[date_col] = df_long[date_col] - (self.WINDOW_DAYS - df_long['Offset'])
        # Every long row still carries the `injury` label of the window it came
        # from. Keep the copy with the largest offset, i.e. the window that ends
        # on this day, so that `wide_form` (which rebuilds the window ending on
        # day d and dates it d + 1) recovers that window's own label. Relying on
        # the incidental row order here attached labels of other windows.
        df_long = df_long.sort_values(by=[id_col, date_col, 'Offset'],
                                      ascending=[True, True, False], kind='mergesort')
        df_long = df_long.drop_duplicates(subset=self.identifiers, keep='first')
        return df_long.drop(columns='Offset').reset_index(drop=True)

    def _scale_per_athlete(self, df, scaler):
        """Return a copy of ``df`` with each base metric rescaled within each athlete."""
        df = df.copy()
        id_col = self.identifiers[0]

        def _fit_transform(values):
            # The scaler is refitted on every athlete/metric series.
            return scaler.fit_transform(values.values.reshape(-1, 1)).flatten()

        for metric in self.base_metrics:
            df[metric] = df.groupby([id_col])[metric].transform(_fit_transform)
        return df.reset_index(drop=True)

    def z_score_normalization(self, df):
        """Standardise every base metric per athlete using ``self.standard_scaler``."""
        return self._scale_per_athlete(df, self.standard_scaler)

    def min_max_normalization(self, df):
        """Rescale every base metric per athlete using ``self.min_max_scaler``."""
        return self._scale_per_athlete(df, self.min_max_scaler)

    def wide_form(self, df_long, days):
        """Rebuild ``days``-long windows from a long (athlete, day) frame.

        Missing calendar days are inserted per athlete first; any window that
        touches a missing day is dropped. The window ending on day ``d`` is
        dated ``d + 1`` and takes the ``injury`` value stored on day ``d``.
        Columns are ordered day-major (all metrics of day 0, then day 1, ...)
        followed by the fixed columns.
        """
        id_col, date_col = self.identifiers
        # Explicit iteration keeps the grouping column inside each group on every
        # pandas version (groupby.apply's handling of it has changed over time).
        filled = [self.fill_missing_dates(group) for _, group in df_long.groupby(id_col)]
        df_long = pd.concat(filled, ignore_index=True) if filled else df_long.copy()
        df_long = df_long.sort_values(by=self.identifiers)

        df_rolled = df_long[self.fixed_columns].copy()
        per_athlete = df_long.groupby(id_col)[self.base_metrics]
        for day in range(days):
            # shift(day) looks `day` days back; the most recent day gets the highest suffix.
            shifted = per_athlete.shift(day).add_suffix(f'.{days - 1 - day}')
            df_rolled = df_rolled.join(shifted)

        metric_columns = [f'{metric}.{day}' for day in range(days) for metric in self.base_metrics]
        df_rolled = df_rolled[metric_columns + self.fixed_columns].dropna()
        df_rolled = df_rolled.assign(**{date_col: df_rolled[date_col] + 1})
        df_rolled = df_rolled.sort_values(by=self.identifiers).reset_index(drop=True)

        # Map dtypes by column name so every fixed column (including Date) is cast.
        dtypes = dict.fromkeys(metric_columns, float)
        dtypes.update(dict.fromkeys(self.fixed_columns, int))
        return df_rolled.astype(dtypes)

    def fill_missing_dates(self, group):
        """Insert NaN rows for days missing between an athlete's first and last date."""
        id_col, date_col = self.identifiers
        min_date = int(group[date_col].min())
        max_date = int(group[date_col].max())
        int_range = range(min_date, max_date + 1)
        group = group.set_index(date_col).reindex(int_range).rename_axis(date_col).reset_index()
        # Inserted rows have no athlete id; carry the previous one forward.
        group[id_col] = group[id_col].ffill()
        return group

    def normalise(self, dataset):
        """Normalise a wide dataset per athlete and return it in wide form again."""
        long = self.long_form(dataset)
        # NOTE(review): min-max scaling is invariant to a preceding per-group
        # affine rescale, so the z-score step should not change the final values;
        # it is kept to preserve the existing pipeline.
        long = self.z_score_normalization(long)
        long = self.min_max_normalization(long)
        return self.wide_form(long, self.WINDOW_DAYS)

    def multi_resample(self, dataset, injury_count=650, sampling_ratio=0.136):
        """Run balanced, unbalanced and synthetic resampling in sequence.

        Returns
        -------
        (X_resampled, y_resampled)
            Feature frame and label series produced by ``synthetic_sampling``.
        """
        # Step 1: Balanced Sampling
        balanced_data = self.balanced_sampling(dataset)
        # Step 2: Unbalanced Sampling
        unbalanced_data = self.unbalanced_sampling(balanced_data, injury_count, sampling_ratio)
        # Step 3: Synthetic Sampling
        return self.synthetic_sampling(unbalanced_data, sampling_ratio)

    def balanced_sampling(self, data):
        """Equalise injured/uninjured row counts within each athlete.

        Both classes are sampled up to the size of the larger one. Injured rows
        are always drawn with replacement; uninjured rows are drawn with
        replacement only when there are fewer of them than required. Athletes
        that have rows of a single class cannot be balanced and are passed
        through unchanged instead of raising.
        """
        rng = self._make_rng()
        parts = []
        for _, group in data.groupby(self.identifiers[0]):
            injured = group[group[self.class_name] == 1]
            uninjured = group[group[self.class_name] == 0]

            if injured.empty or uninjured.empty:
                present = uninjured if injured.empty else injured
                if not present.empty:
                    parts.append(present)
                continue

            n_samples = max(len(injured), len(uninjured))
            parts.append(injured.sample(n_samples, replace=True, random_state=rng))
            parts.append(uninjured.sample(n_samples, replace=len(uninjured) < n_samples,
                                          random_state=rng))

        if not parts:
            return data.iloc[0:0].copy()
        return pd.concat(parts)

    def unbalanced_sampling(self, balanced_data, injury_count, sampling_ratio):
        """Draw ``injury_count`` injured rows and ``injury_count / sampling_ratio`` uninjured rows.

        Sampling is without replacement, so pandas raises ``ValueError`` if
        either class has fewer rows than requested.
        """
        rng = self._make_rng()
        injured = balanced_data[balanced_data[self.class_name] == 1]
        uninjured = balanced_data[balanced_data[self.class_name] == 0]

        num_injured = injury_count
        num_uninjured = int(num_injured / sampling_ratio)

        injured_sample = injured.sample(num_injured, replace=False, random_state=rng)
        uninjured_sample = uninjured.sample(num_uninjured, replace=False, random_state=rng)

        return pd.concat([injured_sample, uninjured_sample])

    def synthetic_sampling(self, data, sampling_rate=1):
        """Clean the data with Tomek links, then oversample with SMOTE.

        Returns the resampled feature frame and the matching label series.
        """
        # Drop every fixed column: leaving the target inside X would leak the
        # label into the neighbour search and into the returned features.
        X = data.drop(columns=self.fixed_columns)
        y = data[self.class_name]

        # First, apply Tomek Links with its default sampling strategy
        tl = TomekLinks()
        X_cleaned, y_cleaned = tl.fit_resample(X, y)

        # Then, use SMOTE to oversample based on the new class distribution.
        # NOTE(review): SMOTE is expected to reject a float `sampling_strategy`
        # that is not above the current minority/majority ratio -- confirm that
        # `sampling_rate` exceeds the ratio left after the Tomek step.
        smote = SMOTE(sampling_strategy=sampling_rate, random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_cleaned, y_cleaned)

        return pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled)

    def transform_3d(self, data):
        """Reshape wide windows into a ``(N, n_metrics, WINDOW_DAYS + 1)`` array.

        Returns ``(ids, labels, dates, features)``. Row ``i`` of each sample holds
        metric ``i`` across the window's days, preceded by one zero column.

        Raises
        ------
        ValueError
            If the number of feature columns is not
            ``len(base_metrics) * WINDOW_DAYS``.
        """
        ids = data['Athlete ID']
        labels = data[self.class_name]
        dates = data['Date']
        features = data.loc[:, ~data.columns.isin(self.fixed_columns)]

        n_metrics = len(self.base_metrics)
        window = self.WINDOW_DAYS
        expected = n_metrics * window
        if features.shape[1] != expected:
            raise ValueError(f"The number of feature columns is not {expected}")

        # Columns are laid out day-major (all metrics of day 0, then day 1, ...),
        # so view them as (N, day, metric) and swap to (N, metric, day).
        values = features.to_numpy(dtype=float)
        per_metric = values.reshape(len(features), window, n_metrics).transpose(0, 2, 1)

        # Prepend one zero column per row; the real values occupy columns 1..window.
        expanded_features = np.zeros((len(features), n_metrics, window + 1))
        expanded_features[:, :, 1:] = per_metric

        return ids, labels, dates, expanded_features

    @staticmethod
    def compute_gasf(series):
        """Compute the Gramian Angular Summation Field (GASF) along the last axis.

        ``gasf[..., i, j] = cos(arccos(series[..., i]) + arccos(series[..., j]))``.
        A 1-D input of length ``n`` gives an ``(n, n)`` matrix; leading axes are
        treated as batch dimensions. Values are clipped to ``[-1, 1]`` so small
        floating-point overshoot does not produce NaN.
        """
        values = np.clip(np.asarray(series, dtype=float), -1.0, 1.0)
        # Map the normalized series to an angular representation
        phi = np.arccos(values)
        # Pairwise angle sums, then cosine in place to avoid a second large array.
        gasf = phi[..., :, None] + phi[..., None, :]
        np.cos(gasf, out=gasf)
        return gasf

    def image_encoding(self, data):
        """Transform each 8-element series of an ``(N, rows, 8)`` array into an 8x8 GASF.

        Returns an array of shape ``(N, rows, 8, 8)``.

        Raises
        ------
        ValueError
            If the last axis does not have exactly 8 elements.
        """
        n_samples, n_rows, n_cols = data.shape
        if n_cols != 8:
            raise ValueError("Each inner array must have 8 elements.")
        return self.compute_gasf(data)

    def preprocess(self):
        """Re-split the raw data, normalise both splits and resample the training set.

        The resampled training data are stored on ``self.X_train`` /
        ``self.y_train`` and returned. ``self.test`` is normalised but not
        resampled.
        """
        # Start again from the raw frame so repeated calls do not re-normalise.
        self.split_data()
        self.train = self.normalise(self.train)
        self.test = self.normalise(self.test)
        self.X_train, self.y_train = self.multi_resample(self.train)
        return self.X_train, self.y_train

In [4]:
# Load the dataset, then show any rows of athlete 11 that are labelled as injured.
data = RunningDataset()
is_athlete_11 = data.data['Athlete ID'] == 11
filtered_data = data.data[is_athlete_11]
is_injured = filtered_data['injury'] == 1
print(filtered_data[is_injured])

Empty DataFrame
Columns: [nr. sessions.0, total km.0, km Z3-4.0, km Z5-T1-T2.0, km sprinting.0, strength training.0, hours alternative.0, perceived exertion.0, perceived trainingSuccess.0, perceived recovery.0, nr. sessions.1, total km.1, km Z3-4.1, km Z5-T1-T2.1, km sprinting.1, strength training.1, hours alternative.1, perceived exertion.1, perceived trainingSuccess.1, perceived recovery.1, nr. sessions.2, total km.2, km Z3-4.2, km Z5-T1-T2.2, km sprinting.2, strength training.2, hours alternative.2, perceived exertion.2, perceived trainingSuccess.2, perceived recovery.2, nr. sessions.3, total km.3, km Z3-4.3, km Z5-T1-T2.3, km sprinting.3, strength training.3, hours alternative.3, perceived exertion.3, perceived trainingSuccess.3, perceived recovery.3, nr. sessions.4, total km.4, km Z3-4.4, km Z5-T1-T2.4, km sprinting.4, strength training.4, hours alternative.4, perceived exertion.4, perceived trainingSuccess.4, perceived recovery.4, nr. sessions.5, total km.5, km Z3-4.5, km Z5-T1-T

(42766, 10, 8)  first row: [ 0.   5.8  0.   0.   0.   0.  16.4  0. ]


Unnamed: 0,nr. sessions.0,total km.0,km Z3-4.0,km Z5-T1-T2.0,km sprinting.0,strength training.0,hours alternative.0,perceived exertion.0,perceived trainingSuccess.0,perceived recovery.0,...,km Z5-T1-T2.6,km sprinting.6,strength training.6,hours alternative.6,perceived exertion.6,perceived trainingSuccess.6,perceived recovery.6,Athlete ID,injury,Date
0,1.0,5.8,0.0,0.6,1.2,0.0,0.0,0.11,0.0,0.18,...,0.0,0.0,0.0,1.0,0.1,0.0,0.15,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,...,0.5,1.2,0.0,0.0,0.1,0.0,0.17,0,0,1
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.1,0.0,0.17,...,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,0,0,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,...,0.0,0.0,1.0,0.0,0.1,0.0,0.17,0,0,3
4,1.0,0.0,0.0,0.0,0.0,0.0,1.08,0.08,0.0,0.18,...,0.0,0.0,0.0,0.0,0.11,0.0,0.17,0,0,4
