In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np

In [2]:
# Read the summary table from disk, decoding the file as ISO-8859-1.
df = pd.read_csv(
    filepath_or_buffer='summary.csv',
    encoding='ISO-8859-1',
)

In [3]:
# Rows carrying the 'ROC' country code get their 'Country' label set to 'Russia'.
# This mutates `df` in place.
df.loc[df['Country_Code'].eq('ROC'), 'Country'] = 'Russia'

# Show the affected rows to confirm the relabel took effect.
print(df.loc[df['Country_Code'].eq('ROC')])

     Year Country  Total_Medals  Gold_Medals  Silver_Medals  Bronze_Medals  \
539  2020  Russia            71           20             28             23   

     Total_Athletes  Events_Count  Host_or_Not Country_Code  population_total  \
539             318           204            0          ROC         145245148   

     population_growth  GNI_per_capita           GDP  GDP_growth  \
539          -0.143202           10660  1.493080e+12   -2.653655   

                      Region  GDP_per_capita  Population_Percentage  \
539  Europe and Central Asia     10108.32715               0.018664   

     GDP_Percentage    HDI  
539        0.017661  0.824  


In [4]:
# Work on an independent (deep) copy so later steps do not touch `df`.
df2 = df.copy(deep=True)

In [5]:
# Display the working copy; as the cell's last expression it is rendered as the cell output.
df2

Unnamed: 0,Year,Country,Total_Medals,Gold_Medals,Silver_Medals,Bronze_Medals,Total_Athletes,Events_Count,Host_or_Not,Country_Code,population_total,population_growth,GNI_per_capita,GDP,GDP_growth,Region,GDP_per_capita,Population_Percentage,GDP_Percentage,HDI
0,2008,Afghanistan,1,0,0,1,4,4,0,AFG,26427200,2.002330,370,1.010930e+10,3.924980,South Asia,382.534000,0.003906,0.000159,0.437
1,2012,Afghanistan,1,0,0,1,6,6,0,AFG,30466500,4.077630,640,1.990730e+10,12.752300,South Asia,653.417000,0.004288,0.000265,0.482
2,1992,Algeria,2,1,0,1,35,27,0,ALG,26748300,2.323750,1920,4.800310e+10,1.800000,Middle East and North Africa,1794.620000,0.004918,0.001912,0.587
3,1996,Algeria,3,2,0,1,45,29,0,ALG,28984600,1.763320,1530,4.694160e+10,4.100000,Middle East and North Africa,1619.530000,0.005015,0.001489,0.608
4,2000,Algeria,5,1,1,3,47,42,0,ALG,30774600,1.402290,1610,5.479040e+10,3.800000,Middle East and North Africa,1780.380000,0.005036,0.001637,0.644
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,2020,Ghana,1,0,0,1,14,6,0,GHA,32180401,2.066269,2230,7.004310e+10,0.513942,Sub-Saharan Africa,2176.576218,0.004135,0.000829,0.611
622,2020,Grenada,1,0,0,1,6,2,0,GRD,123663,0.762219,7890,1.043411e+09,-13.756622,Latin America and Caribbean,8437.536782,0.000016,0.000012,0.779
623,2020,Kuwait,1,0,0,1,10,4,0,KWT,4360444,-1.832821,32790,1.075130e+11,-5.274021,Middle East and North Africa,24656.433710,0.000560,0.001272,0.806
624,2020,Moldova,1,0,0,1,19,16,0,MDA,2635130,-1.098032,4360,1.153075e+10,-8.275978,Europe and Central Asia,4375.778893,0.000339,0.000136,0.769


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pandas as pd

# Columns pulled from each split as model inputs, the regression target, and
# the country identifier column.
features = ['Total_Athletes', 'Events_Count', 'population_total', 'GNI_per_capita', 'GDP', 'GDP_per_capita',
            'population_growth', 'GDP_growth', 'Region', 'Host_or_Not', 'HDI', 'Population_Percentage', 'GDP_Percentage']
target = 'Total_Medals'
country_col = 'Country_Code'

# Preprocessing: standardise the numeric columns and one-hot encode 'Region'.
# NOTE(review): ColumnTransformer discards columns that no transformer lists
# (default remainder='drop'), so 'Host_or_Not', 'HDI', 'Population_Percentage'
# and 'GDP_Percentage' are selected in `features` but never reach the
# regressor. Left unchanged so the reported metrics stay the same; confirm
# whether dropping them is intended.
numeric_features = ['Total_Athletes', 'Events_Count', 'population_total', 'GNI_per_capita', 'GDP', 'GDP_per_capita', 'population_growth', 'GDP_growth']
categorical_features = ['Region']

numeric_transformer = StandardScaler()
# handle_unknown='ignore': a Region value that does not occur in the training
# years is encoded as all zeros instead of raising when predicting on the
# later-year splits. Encoding of categories seen in training is unchanged.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split by 'Year': 1992-2012 (inclusive) for training, 2016 for validation,
# 2020 for testing.
train_df = df2[df2['Year'].between(1992, 2012)]
validation_df = df2[df2['Year'] == 2016]
test_df = df2[df2['Year'] == 2020]

X_train = train_df[features]
y_train = train_df[target]

X_val = validation_df[features]
y_val = validation_df[target]

X_test = test_df[features]
y_test = test_df[target]

# Model 1: linear regression on a standardised Total_Medals target.
# The target scaler is fitted on the training targets only and reused below to
# map predictions back to medal counts.
scaler_y = StandardScaler().fit(y_train.values.reshape(-1, 1))
y_train_normalized = scaler_y.transform(y_train.values.reshape(-1, 1))

# Define the model
model_normalized = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train the model
model_normalized.fit(X_train, y_train_normalized)

# Predict on validation set: undo the target scaling, then floor at zero
# because a medal count cannot be negative.
y_val_pred_normalized = model_normalized.predict(X_val)
y_val_pred_normalized = scaler_y.inverse_transform(y_val_pred_normalized)
y_val_pred_normalized = y_val_pred_normalized.clip(min=0)

# Predict on test set (same post-processing)
y_test_pred_normalized = model_normalized.predict(X_test)
y_test_pred_normalized = scaler_y.inverse_transform(y_test_pred_normalized)
y_test_pred_normalized = y_test_pred_normalized.clip(min=0)

# Model 2: linear regression on the raw Total_Medals target.
model_non_normalized = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train the model
model_non_normalized.fit(X_train, y_train)

# Predict on validation set
y_val_pred_non_normalized = model_non_normalized.predict(X_val)
y_val_pred_non_normalized = y_val_pred_non_normalized.clip(min=0)  # Ensure non-negative predictions

# Predict on test set
y_test_pred_non_normalized = model_non_normalized.predict(X_test)
y_test_pred_non_normalized = y_test_pred_non_normalized.clip(min=0)  # Ensure non-negative predictions

# Evaluate models: RMSE computed as sqrt(MSE).
# The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root explicitly gives
# the same number on every scikit-learn version.
# The normalized model's predictions are flattened to 1-D before scoring.
rmse_val_normalized = np.sqrt(mean_squared_error(y_val, y_val_pred_normalized.flatten()))
rmse_val_non_normalized = np.sqrt(mean_squared_error(y_val, y_val_pred_non_normalized))

rmse_test_normalized = np.sqrt(mean_squared_error(y_test, y_test_pred_normalized.flatten()))
rmse_test_non_normalized = np.sqrt(mean_squared_error(y_test, y_test_pred_non_normalized))

# Print evaluation metrics.
# The two models are expected to agree up to floating-point error: for
# ordinary least squares, standardising the target only rescales the fit.
print("Validation RMSE (Normalized Model):", rmse_val_normalized)
print("Validation RMSE (Non-Normalized Model):", rmse_val_non_normalized)

print("Test RMSE (Normalized Model):", rmse_test_normalized)
print("Test RMSE (Non-Normalized Model):", rmse_test_non_normalized)


Validation RMSE (Normalized Model): 8.128348963229447
Validation RMSE (Non-Normalized Model): 8.128348963229445
Test RMSE (Normalized Model): 7.87178593535285
Test RMSE (Non-Normalized Model): 7.8717859353528485


In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score



In [28]:
# Columns selected from each split as model inputs.
features = [
    'Total_Athletes',
    'Events_Count',
    'population_total',
    'GNI_per_capita',
    'GDP',
    'GDP_per_capita',
    'population_growth',
    'GDP_growth',
    'Region',
    'Host_or_Not',
    'HDI',
    'Population_Percentage',
    'GDP_Percentage',
]

# Column the models predict.
target = 'Total_Medals'

# Column holding the country identifier.
country_col = 'Country_Code'

In [29]:
# Column groups for preprocessing: numeric columns and categorical columns.
numeric_features = [
    'Total_Athletes',
    'Events_Count',
    'population_total',
    'GNI_per_capita',
    'GDP',
    'GDP_per_capita',
    'population_growth',
    'GDP_growth',
]
categorical_features = ['Region']

In [30]:


# Numeric columns are standardised; categorical columns are one-hot encoded.
numeric_transformer = StandardScaler()
# handle_unknown='ignore': a category that was not present when the encoder
# was fitted is encoded as all zeros instead of raising at transform time.
# Encoding of categories seen during fitting is unchanged.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Only the columns listed below are transformed and passed on; with the
# default remainder='drop', any other input column is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [31]:
# Split on 'Year': 1992-2012 (both ends inclusive) for training,
# 2016 for validation, 2020 for testing.
train_df = df2.loc[df2['Year'].between(1992, 2012)]
validation_df = df2.loc[df2['Year'].eq(2016)]
test_df = df2.loc[df2['Year'].eq(2020)]

In [32]:
# Feature matrix and target vector for each of the three splits.
X_train, y_train = train_df[features], train_df[target]
X_val, y_val = validation_df[features], validation_df[target]
X_test, y_test = test_df[features], test_df[target]

In [33]:
# Model 1: linear regression trained on a standardised Total_Medals target.
# The target scaler is fitted on the training targets only and is reused
# below to convert predictions back to medal counts.
scaler_y = StandardScaler()
y_train_normalized = scaler_y.fit_transform(y_train.values.reshape(-1, 1))

# Preprocessing followed by an ordinary least-squares regressor.
model_normalized = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])
model_normalized.fit(X_train, y_train_normalized)

# Validation predictions: undo the target scaling, then floor at zero so no
# prediction is negative.
y_val_pred_normalized = scaler_y.inverse_transform(
    model_normalized.predict(X_val)
).clip(min=0)

# Test predictions, post-processed the same way.
y_test_pred_normalized = scaler_y.inverse_transform(
    model_normalized.predict(X_test)
).clip(min=0)


In [34]:


# Model 2: linear regression fitted on the raw (unscaled) Total_Medals target,
# using the same `preprocessor` as Model 1.
model_non_normalized = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train the model
model_non_normalized.fit(X_train, y_train)

# Predict on validation set
y_val_pred_non_normalized = model_non_normalized.predict(X_val)
y_val_pred_non_normalized = y_val_pred_non_normalized.clip(min=0)  # Ensure non-negative predictions

# Predict on test set
y_test_pred_non_normalized = model_non_normalized.predict(X_test)
y_test_pred_non_normalized = y_test_pred_non_normalized.clip(min=0)  # Ensure non-negative predictions

# Evaluate both models: RMSE computed as sqrt(MSE).
# The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root explicitly gives
# the same number on every scikit-learn version.
# The normalized model's predictions are flattened to 1-D before scoring.
rmse_val_normalized = np.sqrt(mean_squared_error(y_val, y_val_pred_normalized.flatten()))
rmse_val_non_normalized = np.sqrt(mean_squared_error(y_val, y_val_pred_non_normalized))

rmse_test_normalized = np.sqrt(mean_squared_error(y_test, y_test_pred_normalized.flatten()))
rmse_test_non_normalized = np.sqrt(mean_squared_error(y_test, y_test_pred_non_normalized))


In [35]:
# MSE and R² for both models, grouped by split.
# The normalized model's predictions are flattened to 1-D before scoring.

# --- Validation split ---
mse_val_normalized = mean_squared_error(y_true=y_val, y_pred=y_val_pred_normalized.flatten())
r2_val_normalized = r2_score(y_true=y_val, y_pred=y_val_pred_normalized.flatten())
mse_val_non_normalized = mean_squared_error(y_true=y_val, y_pred=y_val_pred_non_normalized)
r2_val_non_normalized = r2_score(y_true=y_val, y_pred=y_val_pred_non_normalized)

# --- Test split ---
mse_test_normalized = mean_squared_error(y_true=y_test, y_pred=y_test_pred_normalized.flatten())
r2_test_normalized = r2_score(y_true=y_test, y_pred=y_test_pred_normalized.flatten())
mse_test_non_normalized = mean_squared_error(y_true=y_test, y_pred=y_test_pred_non_normalized)
r2_test_non_normalized = r2_score(y_true=y_test, y_pred=y_test_pred_non_normalized)


In [36]:
# Inputs for the per-country results tables, taken from the test split:
# the country codes and the observed medal totals, both as plain arrays.
countries_test, actual_values_test = test_df[country_col].values, y_test.values


In [37]:

# Per-country comparison of actual vs. predicted medal totals on the test split.
# NOTE(review): the "RF" in these variable and column names suggests a random
# forest, but the predictions built in this notebook come from
# LinearRegression pipelines. Consider renaming.

# Normalized-target model (its predictions are flattened to 1-D first).
results_df_rf_normalized = pd.DataFrame(dict(
    Country_Code=countries_test,
    Actual_Total_Medals=actual_values_test,
    Predicted_Total_Medals_RF_Normalized=y_test_pred_normalized.flatten(),
))

# Raw-target model.
results_df_rf_non_normalized = pd.DataFrame(dict(
    Country_Code=countries_test,
    Actual_Total_Medals=actual_values_test,
    Predicted_Total_Medals_RF_Non_Normalized=y_test_pred_non_normalized,
))

In [38]:
# Print each results table under its heading (heading and table on separate lines).
print("\nResults for Normalized Model:", results_df_rf_normalized, sep="\n")

print("\nResults for Non-Normalized Model:", results_df_rf_non_normalized, sep="\n")


Results for Normalized Model:
   Country_Code  Actual_Total_Medals  Predicted_Total_Medals_RF_Normalized
0           USA                  113                            127.733102
1           CHN                   88                            104.056969
2           JPN                   58                             73.418855
3           GBR                   65                             41.209125
4           ROC                   71                             34.890263
..          ...                  ...                                   ...
86          GHA                    1                              0.729361
87          GRD                    1                              0.000000
88          KWT                    1                              0.000000
89          MDA                    1                              0.000000
90          SYR                    1                              0.000000

[91 rows x 3 columns]

Results for Non-Normalized Model:
   Country_

In [39]:
# Print RMSE, MSE and R² for each (split, model) pair, one "label value" line each.
print("\n".join(
    f"{label} {value!s}"
    for label, value in [
        ("Validation RMSE (Normalized Model):", rmse_val_normalized),
        ("Validation MSE (Normalized Model):", mse_val_normalized),
        ("Validation R² Score (Normalized Model):", r2_val_normalized),
        ("Validation RMSE (Non-Normalized Model):", rmse_val_non_normalized),
        ("Validation MSE (Non-Normalized Model):", mse_val_non_normalized),
        ("Validation R² Score (Non-Normalized Model):", r2_val_non_normalized),
        ("Test RMSE (Normalized Model):", rmse_test_normalized),
        ("Test MSE (Normalized Model):", mse_test_normalized),
        ("Test R² Score (Normalized Model):", r2_test_normalized),
        ("Test RMSE (Non-Normalized Model):", rmse_test_non_normalized),
        ("Test MSE (Non-Normalized Model):", mse_test_non_normalized),
        ("Test R² Score (Non-Normalized Model):", r2_test_non_normalized),
    ]
))

Validation RMSE (Normalized Model): 8.128348963229447
Validation MSE (Normalized Model): 66.07005686803323
Validation R² Score (Normalized Model): 0.8065280800061035
Validation RMSE (Non-Normalized Model): 8.128348963229445
Validation MSE (Non-Normalized Model): 66.0700568680332
Validation R² Score (Non-Normalized Model): 0.8065280800061037
Test RMSE (Normalized Model): 7.87178593535285
Test MSE (Normalized Model): 61.965013812018945
Test R² Score (Normalized Model): 0.8309597104663555
Test RMSE (Non-Normalized Model): 7.8717859353528485
Test MSE (Non-Normalized Model): 61.965013812018924
Test R² Score (Non-Normalized Model): 0.8309597104663555


In [40]:
# Collect every metric into a two-column table: one (label, value) row per
# metric, ordered validation-then-test and normalized-then-non-normalized.
evaluation_metrics = pd.DataFrame(
    [
        ('Validation RMSE (Normalized Model)', rmse_val_normalized),
        ('Validation MSE (Normalized Model)', mse_val_normalized),
        ('Validation R² Score (Normalized Model)', r2_val_normalized),
        ('Validation RMSE (Non-Normalized Model)', rmse_val_non_normalized),
        ('Validation MSE (Non-Normalized Model)', mse_val_non_normalized),
        ('Validation R² Score (Non-Normalized Model)', r2_val_non_normalized),
        ('Test RMSE (Normalized Model)', rmse_test_normalized),
        ('Test MSE (Normalized Model)', mse_test_normalized),
        ('Test R² Score (Normalized Model)', r2_test_normalized),
        ('Test RMSE (Non-Normalized Model)', rmse_test_non_normalized),
        ('Test MSE (Non-Normalized Model)', mse_test_non_normalized),
        ('Test R² Score (Non-Normalized Model)', r2_test_non_normalized),
    ],
    columns=['Metric', 'Value'],
)

# Show the table under a heading.
print("\nEvaluation Metrics:", evaluation_metrics, sep="\n")


Evaluation Metrics:
                                        Metric      Value
0           Validation RMSE (Normalized Model)   8.128349
1            Validation MSE (Normalized Model)  66.070057
2       Validation R² Score (Normalized Model)   0.806528
3       Validation RMSE (Non-Normalized Model)   8.128349
4        Validation MSE (Non-Normalized Model)  66.070057
5   Validation R² Score (Non-Normalized Model)   0.806528
6                 Test RMSE (Normalized Model)   7.871786
7                  Test MSE (Normalized Model)  61.965014
8             Test R² Score (Normalized Model)   0.830960
9             Test RMSE (Non-Normalized Model)   7.871786
10             Test MSE (Non-Normalized Model)  61.965014
11        Test R² Score (Non-Normalized Model)   0.830960
