In [2]:
import pandas as pd

In [6]:
# Directory holding the input dataset; the aggregated CSVs are written here too.
base_file_path = 'data/'

# Build the input path from base_file_path so that the read location and the
# write location used later cannot drift apart.
df = pd.read_csv(base_file_path + "polished3_with_moy_gdp.csv")
df.columns

Index(['ID', 'Name', 'Sex', 'Age', 'Height', 'Weight', 'Team', 'NOC', 'Games',
       'Year', 'Season', 'City', 'Sport', 'Event', 'Medal', 'Won Medal', 'BMI',
       'GDP'],
      dtype='object')

In [7]:
# Keep only the athlete attributes that feed the per-sport / per-event profiles.
used_columns = ['NOC', 'Sex', 'Age', 'Height', 'Weight', 'Sport', 'Event']
df = df.loc[:, used_columns]

# Two views of the same rows: one keyed by sport, the other keyed by event.
sport_df = df.drop(columns='Event')
event_df = df.drop(columns='Sport')

In [10]:
# For numerical columns, compute the average value for each sport/event, with separate rows for men and women.
# For categorical columns, take the most common value for each sport/event (here that is only the NOC country code).

def process_sports_data(sports_df, eventOrSport):
    """Summarise athlete rows into one profile per (sport-or-event, sex) pair.

    Parameters
    ----------
    sports_df : pandas.DataFrame
        Athlete-level rows. Must contain the column named by ``eventOrSport``
        plus ``'Sex'`` and ``'NOC'``; every other numeric column is averaged.
    eventOrSport : str
        Name of the column to group by together with ``'Sex'``
        (the notebook passes ``'Sport'`` or ``'Event'``).

    Returns
    -------
    pandas.DataFrame
        One row per (``eventOrSport``, ``'Sex'``) group with columns
        ``[eventOrSport, 'Sex', <numeric means...>, 'NOC']``, where ``'NOC'``
        is the most frequent value in the group (missing when the group has
        no non-null NOC).
    """
    group_keys = [eventOrSport, 'Sex']

    # Average every numeric column except the grouping keys themselves: a
    # numeric key would otherwise appear both as a value column and as an
    # index level, and reset_index() would fail on the duplicate name.
    numerical_columns = [
        column
        for column in sports_df.select_dtypes(include=['number']).columns
        if column not in group_keys
    ]

    def most_common(values):
        # mode() returns the modes sorted, so ties resolve to the first one.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else None

    # Per-column aggregation via agg() instead of groupby.apply(): apply()
    # hands the grouping columns to the callback, which pandas deprecated,
    # and it rebuilds a Series for every group.
    aggregations = {column: 'mean' for column in numerical_columns}
    aggregations['NOC'] = most_common

    result = sports_df.groupby(group_keys).agg(aggregations).reset_index()

    return result

In [11]:
# Collapse the athlete-level rows into one averaged profile per
# (sport, sex) and per (event, sex) pair.
sport_df = process_sports_data(sports_df=sport_df, eventOrSport='Sport')
event_df = process_sports_data(sports_df=event_df, eventOrSport='Event')

# Preview the first rows of both aggregated tables.
display(sport_df.head(), event_df.head())

  .apply(aggregate_func)
  .apply(aggregate_func)


Unnamed: 0,Sport,Sex,Age,Height,Weight,NOC
0,Archery,F,25.694224,167.1812,62.01359,CHN
1,Archery,M,26.795591,178.609218,77.092184,USA
2,Athletics,F,25.271651,169.354536,60.187641,USA
3,Athletics,M,25.635284,180.259659,74.315593,USA
4,Badminton,F,25.2,168.456061,61.586364,CHN


Unnamed: 0,Event,Sex,Age,Height,Weight,NOC
0,Archery Men's Individual,M,27.1231,178.323708,76.656535,USA
1,Archery Men's Team,M,26.161765,179.161765,77.935294,USA
2,Archery Women's Individual,F,26.09434,167.056604,61.758148,CHN
3,Archery Women's Team,F,24.916667,167.423333,62.51,CHN
4,"Athletics Men's 1,500 metres",M,24.43695,178.281525,65.104106,USA


In [12]:
# Persist both aggregated tables next to the input data, without the row index.
for frame, filename in ((sport_df, 'yourSports.csv'), (event_df, 'yourEvents.csv')):
    frame.to_csv(base_file_path + filename, index=False)

In [17]:
# Show the per-sport profiles as a list of plain dicts, one dict per row.
sport_df.to_dict(orient='records')

[{'Sport': 'Archery',
  'Sex': 'F',
  'Age': 25.69422423556059,
  'Height': 167.18120045300114,
  'Weight': 62.013590033975085,
  'NOC': 'CHN'},
 {'Sport': 'Archery',
  'Sex': 'M',
  'Age': 26.795591182364728,
  'Height': 178.60921843687376,
  'Weight': 77.09218436873748,
  'NOC': 'USA'},
 {'Sport': 'Athletics',
  'Sex': 'F',
  'Age': 25.27165084646247,
  'Height': 169.35453566885215,
  'Weight': 60.18764066934142,
  'NOC': 'USA'},
 {'Sport': 'Athletics',
  'Sex': 'M',
  'Age': 25.635284190223615,
  'Height': 180.25965885118438,
  'Weight': 74.31559279127238,
  'NOC': 'USA'},
 {'Sport': 'Badminton',
  'Sex': 'F',
  'Age': 25.2,
  'Height': 168.45606060606062,
  'Weight': 61.586363636363636,
  'NOC': 'CHN'},
 {'Sport': 'Badminton',
  'Sex': 'M',
  'Age': 26.364672364672366,
  'Height': 179.63960113960113,
  'Weight': 74.36253561253561,
  'NOC': 'CHN'},
 {'Sport': 'Baseball',
  'Sex': 'M',
  'Age': 26.3096926713948,
  'Height': 182.59929078014184,
  'Weight': 85.71808510638297,
  'NOC': 