In [127]:
import pandas as pd
import folium
import numpy as np

In [None]:
summer_olympics_hosts = {
    1924: "FRA",  # France
    1928: "NED",  # Netherlands
    1932: "USA",  # United States
    1936: "GER",  # Germany
    1948: "GBR",  # United Kingdom
    1952: "FIN",  # Finland
    1956: "AUS",  # Australia
    1960: "ITA",  # Italy
    1964: "JPN",  # Japan
    1968: "MEX",  # Mexico
    1972: "FRG",  # West Germany (FRG = Federal Republic of Germany)
    1976: "CAN",  # Canada
    1980: "URS",  # Soviet Union
    1984: "USA",  # United States
    1988: "KOR",  # South Korea
    1992: "ESP",  # Spain
    1996: "USA",  # United States
    2000: "AUS",  # Australia
    2004: "GRE",  # Greece
    2008: "CHN",  # China
    2012: "GBR",  # United Kingdom
    2016: "BRA",  # Brazil
    2020: "JPN",  # Japan
    2024: "FRA",  # France
}

In [6]:
df_full = pd.read_excel('Olympics_dataset_Final_product.xlsx')

In [None]:
df = df_full[df_full['season'].isin(['Summer Olympics', 'Winter Olympics'])]
df_swimming = df[df['sport'] == 'Swimming']
df_swimming

In [None]:
medal_to_points = {
    'Gold': 3,
    'Silver': 2,
    'Bronze': 1,    
    None: 0,
}

df_swimming['medal_points'] = df_swimming['medal'].apply(lambda x: medal_to_points.get(x, 0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_swimming['medal_points'] = df_swimming['medal'].apply(lambda x: medal_to_points.get(x, 0))


In [None]:
def medalists_on_map(df, only_gold=False, name_map_suffix="map", after_year=None):
    if only_gold:
        df_medalists = df[df['medal'] == 'Gold']
    else:
        df_medalists = df_swimming[df_swimming['medal'].notna()]
    
    if after_year is not None:
        df_medalists = df_medalists[df_medalists['year'] > after_year]

    df_medalists = df_medalists.dropna(subset=['latitude', 'longitude'])
    m = folium.Map(location=[df_medalists['latitude'].mean(), df_medalists['longitude'].mean()], zoom_start=2)

    for _, row in df_medalists.iterrows():
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=5,
            popup=f"{row['full_name']} - {row['medal']}",
            color='gold' if row['medal'] == 'Gold' else 'silver' if row['medal'] == 'Silver' else 'brown',
            fill=True,
            fill_opacity=0.7
        ).add_to(m)

    m.save(f'medalists_map_{name_map_suffix}.html')



medalists_on_map(df_swimming, only_gold=False, name_map_suffix="only_gold_swimmers", after_year=None)

In [None]:
# Make sure your DataFrame df_swimming is loaded and available.
# For example:
# df_swimming = pd.read_csv('your_swimming_data.csv') # Or however you load it

# The column in df_swimming that contains the event descriptions
from jakub_script import extract_all_features


event_column_name = 'event' # Based on your df.info(), this seems to be the correct column

# Check if the column exists to prevent errors
if event_column_name not in df_analysis.columns:
    print(f"Error: Column '{event_column_name}' not found in df_swimming.")
    print(f"Please check the column name. Available columns are: {df_analysis.columns.tolist()}")
else:
    # Apply the feature extraction
    # It's good practice to work on a copy if you want to keep the original df_swimming unchanged
    df_swimming_with_features = extract_all_features(df_analysis.copy(), event_column_name)
    
    # Display the first few rows of the new DataFrame with extracted features
    print("DataFrame with new features (head):")
    print(df_swimming_with_features.head())
    
    # Display info to see the new columns
    print("\nInfo for the DataFrame with new features:")
    df_swimming_with_features.info()
    
    # You can also check value counts for some of the new columns
    print("\nValue counts for 'gender' (new column):")
    print(df_swimming_with_features['gender'].value_counts(dropna=False).head())

    print("\nValue counts for 'event_type' (new column):")
    print(df_swimming_with_features['event_type'].value_counts(dropna=False).head())
    
    print("\nValue counts for 'stroke' (new column):")
    print(df_swimming_with_features['stroke'].value_counts(dropna=False).head())

In [None]:
cols = ['id', 'gender', 'age', 'birth_country_full', 'year', 'team', 'event', 'medal_points', 'place', 'Population', 'Gini', ]


gdp_cols = [col for col in df_swimming.columns if 'GDP' in col]
df_analysis = df_swimming[cols + gdp_cols] 


df_analysis = df_analysis[df_analysis['year'] >= 1924]
df_analysis['event'] = df_analysis['event'].apply(lambda x: x.replace(', Women', "").replace(', Men', ""))
df_analysis['relay'] = df_analysis['event'].apply(lambda x: 1 if 'Relay' in x else 0)
df_analysis['Population'] = df_analysis['Population'].astype(float)

df_analysis['GDP per Capita'] = df_analysis['GDP per capita, PPP (constant 2021 international $)'].apply(lambda x: float(x) if x != 'no_data' else np.nan)

df_analysis['Gini'] = df_analysis['Gini'].apply(lambda x: float(x) if x != 'no_data' else np.nan)

#df_analysis['gender'] = df_analysis['gender'].apply(lambda x: 1 if x == 'Male' else 0)
#df_analysis['distance'] = df['event'].apply(lambda x: x.split(' metres ')[0])


#proportion of #participants to population
#distance of event - 
#style of event - 

In [None]:
aggregated_df = df_analysis.groupby(by=['team', 'year', 'gender']).agg(avg_age = ('age', 'mean'), 
                                                        gini = ('Gini', 'mean'),
                                                        points_sum = ('medal_points', 'sum'),
                                                        population = ('Population', 'mean'),
                                                        gdp_per_capita = ('GDP per Capita', 'mean'),
                                                        number_participants = ('id', 'nunique'),
                                                        relays = ('relay', 'sum'))


aggregated_df['host_indicator'] = [
    (country, year) in summer_olympics_hosts for country, year, gender in aggregated_df.index
]

aggregated_df = aggregated_df.dropna()
aggregated_df['proportion_participants_to_population'] = aggregated_df['number_participants'] / aggregated_df['population']

#nomralize
aggregated_df['population'] = (aggregated_df['population'] - aggregated_df['population'].mean()) / aggregated_df['population'].std()
aggregated_df['gdp_per_capita'] = (aggregated_df['gdp_per_capita'] - aggregated_df['gdp_per_capita'].mean()) / aggregated_df['gdp_per_capita'].std()



aggregated_df.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,avg_age,gini,points_sum,population,gdp_per_capita,number_participants,relays,host_indicator,proportion_participants_to_population
team,year,gender,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AFG,2020,Male,21.0,36.8,0,-0.026802,-0.834724,1,0,False,2.559575e-08
AFG,2024,Male,25.0,36.8,0,-0.001411,-0.876025,1,0,False,2.344804e-08
ALB,1992,Male,21.0,27.0,0,-0.280723,-0.799896,1,0,False,3.046458e-07
ALB,2004,Female,17.0,31.0,0,-0.282028,-0.578834,1,0,False,3.227205e-07
ALB,2004,Male,21.0,31.0,0,-0.282028,-0.578834,1,0,False,3.227205e-07
ALB,2008,Female,21.0,30.0,0,-0.282778,-0.456639,1,0,False,3.341207e-07
ALB,2008,Male,16.0,30.0,0,-0.282778,-0.456639,1,0,False,3.341207e-07
ALB,2012,Female,16.0,29.1,0,-0.283366,-0.387479,1,0,False,3.436421e-07
ALB,2012,Male,20.0,29.1,0,-0.283366,-0.387479,1,0,False,3.436421e-07
ALB,2016,Female,18.0,29.0,0,-0.283452,-0.333011,1,0,False,3.450814e-07


In [174]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Assume you have:
X = aggregated_df.drop(columns = ['points_sum'])
y = aggregated_df['points_sum']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Initialize the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Step 3: Train the model on the training set
rf_regressor.fit(X_train, y_train)

# Step 4: Predict on the test set
y_pred = rf_regressor.predict(X_test)

# Step 5: Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R^2 Score: {r2:.4f}")


Mean Squared Error: 27.3362
R^2 Score: 0.6286
