# Define the goal:

To predict the total number of medals a country might win based on GDP of the previous year, population and other factors.


# Load the Dataset

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/content/clean_olympic_data.csv')
df.head()

Unnamed: 0,gold,silver,bronze,total,gdp,gdp_year,population,country_Albania,country_Algeria,country_Argentina,...,country_Taiwan,country_Tajikistan,country_Thailand,country_Tunisia,country_Turkey,country_Uganda,country_Ukraine,country_United States,country_Uzbekistan,country_Zambia
0,40,44,42,126,81695.19,2023,334.9,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,40,27,24,91,12614.06,2023,1410.7,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20,12,13,45,33834.39,2023,124.5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,18,19,16,53,64711.77,2023,26.6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,16,26,22,64,44460.82,2023,68.2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Select the features and target variable

In [3]:
df.drop(columns=['gold', 'silver', 'bronze'], axis=1, inplace=True)
df.head()

Unnamed: 0,total,gdp,gdp_year,population,country_Albania,country_Algeria,country_Argentina,country_Armenia,country_Australia,country_Austria,...,country_Taiwan,country_Tajikistan,country_Thailand,country_Tunisia,country_Turkey,country_Uganda,country_Ukraine,country_United States,country_Uzbekistan,country_Zambia
0,126,81695.19,2023,334.9,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,91,12614.06,2023,1410.7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,45,33834.39,2023,124.5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,53,64711.77,2023,26.6,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,64,44460.82,2023,68.2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
print(df.shape)

(90, 94)


In [4]:
X = df.drop('total', axis=1) #features
y = df['total'] #target

## Split the data into train and test sets

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# Standardize the features

In [6]:
scaler = StandardScaler()

# fit and transform only the 'gdp' and 'population' columns
X_train[['gdp', 'population']] = scaler.fit_transform(X_train[['gdp', 'population']])

# transform the same columns in X_test
X_test[['gdp', 'population']] = scaler.transform(X_test[['gdp', 'population']])

# Model Selection and training

In [7]:
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error

In [9]:
# Define models for comparison
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'XGBoost Regressor': XGBRegressor()
}

# perform evaluation for each model and store the results in the lists
r2_scores=[]
mean_squared = []
mean_absolute = []
model_list =[]
for model_name,model in models.items():
    model.fit(X_train,y_train)
    ypred = model.predict(X_test)
    r2_scores.append(r2_score(y_test,ypred))
    mean_squared.append(mean_squared_error(y_test,ypred))
    mean_absolute.append(mean_absolute_error(y_test,ypred))
    model_list.append(model_name)

# create a dataframe for the evaluation metrics
eval_data = pd.DataFrame()
eval_data['Model'] = model_list
eval_data['R2 Score'] = r2_scores
eval_data ['Mean Squared Errors'] = mean_squared
eval_data['Mean Absolute Errors'] = mean_absolute
eval_data

Unnamed: 0,Model,R2 Score,Mean Squared Errors,Mean Absolute Errors
0,Linear Regression,-0.10932,1001.87046,18.745034
1,Decision Tree Regressor,0.378772,561.055556,10.833333
2,Random Forest Regressor,0.382649,557.553628,11.295
3,XGBoost Regressor,0.045642,861.917594,14.019599


Random Forest Regressor would be the better model to use in this case.

## Hyper-parameter tuning

In [10]:
from sklearn.model_selection import RandomizedSearchCV
param_grid = {
    'n_estimators': [100, 200, 300, 500, 1000],  # Number of trees
    'max_depth': [None, 10, 20, 30, 50],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum samples needed to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples in each leaf
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features considered for splitting
    'bootstrap': [True, False],  # Bootstrap sampling
}

rf_random_search = RandomizedSearchCV(
    estimator = models["Random Forest Regressor"],
    param_distributions=param_grid,
    n_iter=100,  # Number of random configurations to try
    cv=5,  # 5-fold cross-validation

    random_state=42,
    n_jobs=-1  # Use all available cores
)

# Fit the model
rf_random_search.fit(X_train, y_train)



145 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
97 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
skl

## Train the model

In [11]:
# get the best estimator and fit the model
best_rf = rf_random_search.best_estimator_
best_rf.fit(X_train,y_train)
y_pred = best_rf.predict(X_test)
print(y_pred)

[11.08   6.065  5.975 43.    26.925  5.32   6.285  3.495  8.715  5.92
  4.59   9.84   3.11   7.19   4.13  34.195  7.625  9.485]


In [12]:
print("R2 SCORE : ", (r2_score(y_test,ypred)))
print("MEAN ABSOLUTE ERROR : ",(mean_absolute_error(y_test,ypred)))
print("MEAN SQUARED ERROR : ",(mean_squared_error(y_test,ypred)))

R2 SCORE :  0.04564225673675537
MEAN ABSOLUTE ERROR :  14.0195988284217
MEAN SQUARED ERROR :  861.9175935752633


Not the best model, clearly, if you have some suggestions on how to better do this... i'd really appreciate the input!