In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import os

def load_data():
    """Load the raw CSV datasets."""
    player_stats_path = 'D:/Data science classes slides pdf/Football_analysis/football_analytics_injury_prevention/raw/player_statistics.csv'
    match_events_path = 'D:/Data science classes slides pdf/Football_analysis/football_analytics_injury_prevention/raw/match_events.csv'
    positional_data_path = 'D:/Data science classes slides pdf/Football_analysis/football_analytics_injury_prevention/raw/positional_data.csv'
    biomechanical_data_path = 'D:/Data science classes slides pdf/Football_analysis/football_analytics_injury_prevention/raw/biomechanical_data.csv'

    player_stats = pd.read_csv(player_stats_path)
    match_events = pd.read_csv(match_events_path)
    positional_data = pd.read_csv(positional_data_path)
    biomechanical_data = pd.read_csv(biomechanical_data_path)
    
    return player_stats, match_events, positional_data, biomechanical_data

def merge_datasets(player_stats, match_events, biomechanical_data):
    """Merge the datasets based on common columns."""
    player_stats['name'] = player_stats['name'].astype(str).str.lower().str.strip()
    biomechanical_data['Subject'] = biomechanical_data['Subject'].astype(str)
    
    # Example mapping of Subject to player names
    subject_to_name = {
        '1': 'manuel neuer',
        '2': 'yann sommer',
        '3': 'sven ulreich',
        '4': 'johannes schenk',
        '5': 'matthijs de ligt',
        '6': 'dayot upamecano',
        '7': 'lucas hernández',
        '8': 'alphonso davies',
        '9': 'daley blind',
        '10': 'joão cancelo',
        # Add more mappings as needed
    }
    
    biomechanical_data['name'] = biomechanical_data['Subject'].map(subject_to_name).str.lower().str.strip()

    # Merge player statistics with biomechanical data based on player name
    merged_data = pd.merge(player_stats, biomechanical_data, on='name', how='left')
    
    # Merge the result with match events based on the club name and match date
    merged_data = pd.merge(merged_data, match_events, left_on='club', right_on='HOME_TEAM_NAME', how='left', suffixes=('_player', '_match'))

    return merged_data

def add_injury_columns(csv_data):
    """Add injury-related columns."""
    np.random.seed(42)
    csv_data['Injury'] = np.random.choice([0, 1], size=len(csv_data), p=[0.8, 0.2])

    injury_locations = ['Knee', 'Ankle', 'Hamstring', 'Shoulder', np.nan]
    csv_data['InjuryLoc'] = np.where(csv_data['Injury'] == 1, np.random.choice(injury_locations, size=len(csv_data)), np.nan)

    dates = pd.date_range(start='2020-01-01', end='2023-01-01')
    csv_data['InjuryOnDate'] = np.where(csv_data['Injury'] == 1, np.random.choice(dates, size=len(csv_data)), pd.NaT)

    return csv_data

def handle_missing_values(csv_data):
    """Handle missing values in the dataset."""
    # Fill missing values for InjuryLoc with 'None'
    csv_data['InjuryLoc'] = csv_data['InjuryLoc'].fillna('None')

    # Fill NaNs in numerical columns with the median value
    numerical_columns = csv_data.select_dtypes(include=['float64', 'int64']).columns
    for column in numerical_columns:
        if csv_data[column].isna().sum() > 0:
            csv_data[column] = csv_data[column].fillna(csv_data[column].median())

    # Fill NaNs in categorical columns with the mode
    categorical_columns = csv_data.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        if csv_data[column].isna().sum() > 0:
            if csv_data[column].dropna().empty:
                # If the entire column is NaN, fill with a placeholder like 'Unknown'
                csv_data[column] = csv_data[column].fillna('Unknown')
            else:
                # Fill with mode if the column has valid entries
                csv_data[column] = csv_data[column].fillna(csv_data[column].mode()[0])

    # Optionally, drop columns that are still entirely NaN
    csv_data = csv_data.dropna(axis=1, how='all')

    # Print the shape after handling missing values
    print(f"Data shape after handling missing values: {csv_data.shape}")

    return csv_data

def encode_and_scale_features(csv_data):
    """Encode categorical variables and scale numerical features."""
    # Encoding categorical features
    categorical_features = ['club', 'position', 'InjuryLoc']
    for feature in categorical_features:
        encoder = LabelEncoder()
        csv_data[feature] = encoder.fit_transform(csv_data[feature].astype(str))
    
    # Scaling numerical features
    numerical_features = csv_data.select_dtypes(include=['float64', 'int64']).columns
    
    if len(csv_data) > 0 and len(numerical_features) > 0:
        print(f"Scaling {len(numerical_features)} numerical features for {len(csv_data)} samples.")
        scaler = StandardScaler()
        csv_data[numerical_features] = scaler.fit_transform(csv_data[numerical_features])
    else:
        print("No data available for scaling.")
    
    return csv_data

def save_processed_data(csv_data, output_path):
    """Save the processed dataset to a CSV file."""
    csv_data.to_csv(output_path, index=False)
    print(f"Enhanced dataset saved successfully at: {output_path}")

def split_and_save_data(csv_data, output_dir):
    """Split the data into train and test sets and save them to CSV files."""
    X = csv_data.drop(columns=['Injury'])
    y = csv_data['Injury']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Save the split datasets
    X_train.to_csv(f'{output_dir}/X_train.csv', index=False)
    X_test.to_csv(f'{output_dir}/X_test.csv', index=False)
    y_train.to_csv(f'{output_dir}/y_train.csv', index=False)
    y_test.to_csv(f'{output_dir}/y_test.csv', index=False)
    
    print(f"Train and test datasets saved successfully in: {output_dir}")

def main():
    # Specify the output file path
    output_dir = 'D:/Data science classes slides pdf/Football_analysis/football_analytics_injury_prevention/processed'
    merged_data_path = f'{output_dir}/merged_data.csv'
    
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Step 1: Load the data
    player_stats, match_events, positional_data, biomechanical_data = load_data()
    
    # Step 2: Merge the datasets
    merged_data = merge_datasets(player_stats, match_events, biomechanical_data)
    print(f"Data shape after merging: {merged_data.shape}")
    
    # Step 3: Add injury-related columns
    merged_data = add_injury_columns(merged_data)
    print(f"Data shape after adding injury columns: {merged_data.shape}")
    
    # Step 4: Handle missing values
    merged_data = handle_missing_values(merged_data)
    print(f"Data shape after handling missing values: {merged_data.shape}")
    
    # Step 5: Encode categorical variables and scale numerical features
    merged_data = encode_and_scale_features(merged_data)
    print(f"Data shape after encoding and scaling: {merged_data.shape}")
    
    # Step 6: Save the processed data
    save_processed_data(merged_data, merged_data_path)
    
    # Step 7: Split the data into train and test sets and save them
    split_and_save_data(merged_data, output_dir)

if __name__ == "__main__":
    main()


Data shape after merging: (57057, 90)
Data shape after adding injury columns: (57057, 90)


  csv_data[column] = csv_data[column].fillna(csv_data[column].mode()[0])


Data shape after handling missing values: (57057, 90)
Data shape after handling missing values: (57057, 90)
Scaling 45 numerical features for 57057 samples.
Data shape after encoding and scaling: (57057, 90)
Enhanced dataset saved successfully at: D:/Data science classes slides pdf/Football_analysis/football_analytics_injury_prevention/processed/merged_data.csv
Train and test datasets saved successfully in: D:/Data science classes slides pdf/Football_analysis/football_analytics_injury_prevention/processed


In [16]:
print(merged_data[['name', 'Volume', 'Pace', 'RFSI25', 'LFSI25', 'Injury', 'InjuryLoc', 'InjuryOnDate']].head())


               name  Volume  Pace RFSI25 LFSI25 Injury InjuryLoc InjuryOnDate
0      Manuel Neuer       0   NaN    NaN    NaN    NaN       NaN          NaN
1       Yann Sommer       0   NaN    NaN    NaN    NaN       NaN          NaN
2      Sven Ulreich       0   NaN    NaN    NaN    NaN       NaN          NaN
3   Johannes Schenk       0   NaN    NaN    NaN    NaN       NaN          NaN
4  Matthijs de Ligt       0   NaN    NaN    NaN    NaN       NaN          NaN


In [17]:
print(biomechanical_data.head())  # Check if the biomechanical data has values
print(biomechanical_data['Subject'].unique())  # Verify unique subjects in biomechanical data


   Subject                  FileName  Age  Height  Mass Gender Dominance  \
0        1         RBDS001static.txt   22   181.0  62.0      M         R   
1        1  RBDS001runT25markers.txt   22   181.0  62.0      M         R   
2        1   RBDS001runT25forces.txt   22   181.0  62.0      M         R   
3        1  RBDS001runT35markers.txt   22   181.0  62.0      M         R   
4        1   RBDS001runT35forces.txt   22   181.0  62.0      M         R   

         Level  Experience  SessionsPerWk  ...  ROber  LOber  RHIPABD  \
0  Competitive           4              3  ...     43     40     16.8   
1  Competitive           4              3  ...     43     40     16.8   
2  Competitive           4              3  ...     43     40     16.8   
3  Competitive           4              3  ...     43     40     16.8   
4  Competitive           4              3  ...     43     40     16.8   

   LHIPABD  RHIPEXT  LHIPEXT    RHIPER LHIPER     RHIPIR  name  
0   21.575   16.575   21.675  9.666667 

In [18]:
# Example subject-to-name mapping
subject_to_name = {
    '1': 'manuel neuer',
    '2': 'yann sommer',
    '3': 'sven ulreich',
    '4': 'johannes schenk',
    '5': 'matthijs de ligt',
    '6': 'dayot upamecano',
    '7': 'lucas hernández',
    '8': 'alphonso davies',
    '9': 'daley blind',
    '10': 'joão cancelo',
    # Ensure all subjects are mapped here
    # ...
}

biomechanical_data['name'] = biomechanical_data['Subject'].map(subject_to_name).str.lower().str.strip()


In [19]:
merged_data = pd.merge(player_stats, biomechanical_data, on='name', how='left')


In [20]:
print(merged_data[['name', 'Volume', 'Pace', 'RFSI25', 'LFSI25']].head())


               name Volume  Pace RFSI25 LFSI25
0      Manuel Neuer    NaN   NaN    NaN    NaN
1       Yann Sommer    NaN   NaN    NaN    NaN
2      Sven Ulreich    NaN   NaN    NaN    NaN
3   Johannes Schenk    NaN   NaN    NaN    NaN
4  Matthijs de Ligt    NaN   NaN    NaN    NaN


In [9]:
print(biomechanical_data['Subject'].unique())
print(player_stats['name'].unique())


[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28]
['Manuel Neuer' 'Yann Sommer' 'Sven Ulreich' 'Johannes Schenk'
 'Matthijs de Ligt' 'Dayot Upamecano' 'Lucas Hernández' 'Alphonso Davies'
 'Daley Blind' 'João Cancelo' 'Benjamin Pavard' 'Noussair Mazraoui'
 'Josip Stanisic' 'Bouna Sarr' 'Joshua Kimmich' 'Leon Goretzka'
 'Ryan Gravenberch' 'Jamal Musiala' 'Paul Wanner' 'Arijon Ibrahimovic'
 'Kingsley Coman' 'Sadio Mané' 'Leroy Sané' 'Serge Gnabry' 'Thomas Müller'
 'Mathys Tel' 'Eric Maxim Choupo-Moting' 'Gregor Kobel' 'Marcel Lotka'
 'Alexander Meyer' 'Luca Unbehaun' 'Nico Schlotterbeck' 'Niklas Süle'
 'Mats Hummels' 'Soumaïla Coulibaly' 'Antonios Papadopoulos'
 'Raphaël Guerreiro' 'Tom Rothe' 'Nico Schulz' 'Julian Ryerson'
 'Marius Wolf' 'Thomas Meunier' 'Mateu Morey Bauzà' 'Felix Passlack'
 'Salih Özcan' 'Emre Can' 'Abdoulaye Kamara' 'Jude Bellingham'
 'Mahmoud Dahoud' 'Julian Brandt' 'Giovanni Reyna' 'Marco Reus'
 'Göktan Gürpüz' 'Karim Adeyemi' 'Ja

In [10]:
# Example mapping should be verified or updated
subject_to_name = {
    '1': 'manuel neuer',
    '2': 'yann sommer',
    '3': 'sven ulreich',
    '4': 'johannes schenk',
    # Update mappings if necessary
}

biomechanical_data['name'] = biomechanical_data['Subject'].map(subject_to_name).str.lower().str.strip()


In [22]:
import pandas as pd

def fill_missing_values(data):
    # For numerical columns, fill NaNs with the median value
    numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
    for column in numerical_columns:
        median_value = data[column].median()
        data[column] = data[column].fillna(median_value)
    
    # For categorical columns, fill NaNs with the mode value
    categorical_columns = data.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        if not data[column].mode().empty:
            mode_value = data[column].mode()[0]
        else:
            mode_value = 'Unknown'
        data[column] = data[column].fillna(mode_value)
    
    return data

# Apply the function to fill missing values
merged_data_filled = fill_missing_values(merged_data)

# Verify that missing values have been handled
print(merged_data_filled.isnull().sum())


Unnamed: 0      0
name            0
full_name       0
age             0
height          0
             ... 
RHIPEXT       515
LHIPEXT       515
RHIPER        515
LHIPER        515
RHIPIR        515
Length: 67, dtype: int64


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

In [23]:
import pandas as pd

def fill_missing_values(data):
    # For numerical columns, fill NaNs with the median value
    numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
    for column in numerical_columns:
        if data[column].notna().any():  # Check if there are any non-NaN values
            median_value = data[column].median()
            data[column] = data[column].fillna(median_value)
        else:
            print(f"Column '{column}' is entirely NaN; filling with 0.")
            data[column] = data[column].fillna(0)
    
    # For categorical columns, fill NaNs with the mode value
    categorical_columns = data.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        if data[column].notna().any():  # Check if there are any non-NaN values
            mode_value = data[column].mode()[0]
            data[column] = data[column].fillna(mode_value)
        else:
            print(f"Column '{column}' is entirely NaN; filling with 'Unknown'.")
            data[column] = data[column].fillna('Unknown')
    
    return data

# Apply the function to fill missing values
merged_data_filled = fill_missing_values(merged_data)

# Verify that missing values have been handled
print(merged_data_filled.isnull().sum())


Column 'Subject' is entirely NaN; filling with 0.
Column 'Age' is entirely NaN; filling with 0.
Column 'Height' is entirely NaN; filling with 0.
Column 'Mass' is entirely NaN; filling with 0.
Column 'Experience' is entirely NaN; filling with 0.
Column 'SessionsPerWk' is entirely NaN; filling with 0.
Column 'Treadmill' is entirely NaN; filling with 0.
Column 'Aslphalt' is entirely NaN; filling with 0.
Column 'Grass' is entirely NaN; filling with 0.
Column 'Trail' is entirely NaN; filling with 0.
Column 'Sand' is entirely NaN; filling with 0.
Column 'Concrete' is entirely NaN; filling with 0.
Column 'SurfaceAlt' is entirely NaN; filling with 0.
Column 'Pace' is entirely NaN; filling with 0.
Column 'ShoeSize' is entirely NaN; filling with 0.
Column 'ShoePairs' is entirely NaN; filling with 0.
Column 'ShoeComfort' is entirely NaN; filling with 0.
Column 'RThomas' is entirely NaN; filling with 0.
Column 'LThomas' is entirely NaN; filling with 0.
Column 'ROber' is entirely NaN; filling with 

In [24]:
import pandas as pd

def fill_missing_values(data):
    # For numerical columns, fill NaNs with the median value
    numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
    for column in numerical_columns:
        if data[column].notna().any():  # Check if there are any non-NaN values
            median_value = data[column].median()
            data[column] = data[column].fillna(median_value)
        else:
            print(f"Column '{column}' is entirely NaN; filling with 0.")
            data[column] = data[column].fillna(0)
    
    # For categorical columns, fill NaNs with the mode value
    categorical_columns = data.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        if data[column].notna().any():  # Check if there are any non-NaN values
            mode_value = data[column].mode()[0]
            data[column] = data[column].fillna(mode_value)
        else:
            print(f"Column '{column}' is entirely NaN; filling with 'Unknown'.")
            data[column] = data[column].fillna('Unknown')
    
    return data

def drop_empty_columns(data):
    """Drop columns that are entirely NaN."""
    data = data.dropna(axis=1, how='all')
    return data

# Apply the function to fill missing values
merged_data_filled = fill_missing_values(merged_data)

# Drop empty columns
merged_data_cleaned = drop_empty_columns(merged_data_filled)

# Verify the final dataset
print(merged_data_cleaned.isnull().sum())
print(f"Final data shape: {merged_data_cleaned.shape}")


Unnamed: 0    0
name          0
full_name     0
age           0
height        0
             ..
RHIPEXT       0
LHIPEXT       0
RHIPER        0
LHIPER        0
RHIPIR        0
Length: 67, dtype: int64
Final data shape: (515, 67)
