<a href="https://colab.research.google.com/github/HriddhiDoley/predicting_olympic_medal/blob/main/predict_olympic_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction
This project explores the use of machine learning (ML) and deep learning (DL) techniques to predict the total number of Olympic medals a country will win. By analyzing various factors like GDP, population, and sports infrastructure, this study provides valuable insights into the key determinants of Olympic success. Leveraging models such as Random Forest and Neural Networks, the findings highlight the interplay between economic and sports-related features in influencing medal counts.


In [None]:
# Olympic Medals Prediction Project

## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Step 1: Load the Dataset

In [None]:
# Read the Olympics CSV from the working directory into `data`,
# then preview the first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer='gni-olympics.csv')
data.head(n=5)

Unnamed: 0,iso,ioc,name,continent,population,gdp,olympics_index,sports_index,olympicsIndex,sportsIndex,total,gold,silver,bronze
0,ARG,ARG,Argentina,South America,45376763,383066977654,19.597142,9.324537,19.597142,9.324537,3,0,1,2
1,ARM,ARM,Armenia,Asia,2963234,12645459214,19.681457,13.497324,19.681457,13.497324,4,0,2,2
2,AUS,AUS,Australia,Oceania,25687041,1330900925057,31.170099,11.073845,31.170099,11.073845,46,17,7,22
3,AUT,AUT,Austria,Europe,8917205,428965397959,12.212139,15.923033,12.212139,15.923033,7,1,1,5
4,AZE,AZE,Azerbaijan,Europe,10110116,42607176471,18.213838,13.103344,18.213838,13.103344,7,0,3,4


# Step 2: Understand the Data
1. Basic Analysis
2. Univariate
3. Bivariate / Multivariate analysis  

## 2.1 Basic Analysis
1. How big is the data?
2. What does the data look like?
3. What are the data types of the columns?
4. Are there any missing values?
5. How does the data look mathematically?
6. Are there any duplicate values?
7. How are the columns correlated with each other?

In [None]:
# 1. Dataset size as a (row count, column count) pair
print("Dataset Shape:", (len(data.index), len(data.columns)))

In [None]:
# 2. Heading followed by the first five rows, each on its own line
print("First 5 rows:", data.head(), sep="\n")

In [None]:
# 3. Heading followed by five randomly drawn rows (unseeded, so it varies per run)
print("Random Sample:", data.sample(n=5), sep="\n")

In [None]:
# 4. Dataset Info
# Prints pandas' concise summary of the frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# 5. Count of missing entries per column
print("Missing Values:", data.isna().sum(), sep="\n")

In [None]:
# 6. Descriptive statistics for the numeric columns
print("Statistical Summary:", data.describe(), sep="\n")

In [None]:
# 7. Number of rows that exactly repeat an earlier row
print("Number of Duplicated Rows:", data.duplicated(keep='first').sum())

In [None]:
# 8. Remove repeated rows in place, keeping the first occurrence of each
data.drop_duplicates(keep='first', inplace=True)

In [None]:
# 9. Correlation matrix
# The frame also holds text columns (iso, ioc, name, continent). Calling
# data.corr() on them raises on pandas >= 2.0, so restrict the correlation
# to the numeric columns explicitly. select_dtypes works on old and new pandas.
print("Correlation Matrix:")
plt.figure(figsize=(10, 8))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

## 2.2 EDA: Univariate Analysis

In [None]:
# Bar chart of how many rows (countries) belong to each continent
sns.countplot(data=data, x='continent')
plt.title(label='Countplot of Continents')
plt.show()


In [None]:
# Pie chart of the share of countries per continent.
# value_counts() orders continents by frequency, while unique() orders them by
# first appearance. Taking the labels from the counts' own index keeps every
# wedge paired with the right continent name.
sizes = data['continent'].value_counts()
labels = sizes.index
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
plt.title('Pie Chart of Continents')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# Frequency histogram of GDP using ten bins
plt.hist(x=data['gdp'], bins=10, alpha=0.7)
plt.xlabel('GDP')
plt.ylabel('Frequency')
plt.title('Histogram of GDP')
plt.show()

In [None]:
# Histogram of population (20 bins) with a kernel density estimate overlaid
sns.histplot(data['population'], bins=20, kde=True)
plt.title(label='Population Distribution')
plt.show()


In [None]:
# Vertical box-and-whisker plot of the sports index column
sns.boxplot(data=data, y='sports_index')
plt.title(label='Boxplot of Sports Index')
plt.show()

In [None]:
# Print a handful of summary statistics for the GDP column.
# Each (label, Series method name) pair is printed in the listed order.
print("GDP Statistics:")
for _label, _method in (
    ("Min:", "min"),
    ("Max:", "max"),
    ("Mean:", "mean"),
    ("Standard Deviation:", "std"),
    ("Variance:", "var"),
):
    print(_label, getattr(data['gdp'], _method)())

In [None]:
# Skewness of the three numeric features inspected above
print(
    "Skewness of Numerical Features:",
    data[['gdp', 'population', 'sports_index']].skew(),
    sep="\n",
)

## 2.3 EDA: Multivariate Analysis

In [None]:
# One point per row: GDP on the x-axis against total medals on the y-axis
sns.scatterplot(data=data, x='gdp', y='total')
plt.title(label='Scatterplot of GDP vs Total Medals')
plt.show()

In [None]:
# Bar per continent showing seaborn's default aggregate of `total`, without error bars.
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None`;
# switch once the minimum supported seaborn version is confirmed.
sns.barplot(data=data, x='continent', y='total', ci=None)
plt.title(label='Barplot of Total Medals by Continent')
plt.show()

In [None]:
# Distribution of GDP within each continent, one box per continent
sns.boxplot(data=data, x='continent', y='gdp')
plt.title(label='Boxplot of GDP by Continent')
plt.show()

In [None]:
# Histogram of total medals (20 bins) with a kernel density estimate overlaid
sns.histplot(data['total'], bins=20, kde=True)
plt.title(label='Distribution of Total Medals')
plt.show()

In [None]:
# Clustermap of correlations
# Correlate numeric columns only: the text columns (iso, ioc, name, continent)
# make a bare data.corr() raise on pandas >= 2.0.
sns.clustermap(data.select_dtypes(include='number').corr(), cmap='coolwarm', annot=True)
# clustermap builds its own multi-axes figure, so title the figure as a whole.
# plt.title would only label whichever sub-axes happens to be current.
plt.suptitle('Clustermap of Feature Correlations')
plt.show()

In [None]:
# Pairplot for key numerical features
sns.pairplot(data[['gdp', 'population', 'sports_index', 'total']])
# pairplot draws a grid of subplots; plt.title would label only the last one.
# Use a figure-level title, nudged upward so it clears the top row of panels.
plt.suptitle('Pairplot of Key Features', y=1.02)
plt.show()

In [None]:
# Line through total medals as a function of GDP
sns.lineplot(data=data, x='gdp', y='total')
plt.title(label='Lineplot of GDP vs Total Medals')
plt.show()


# Feature Engineering

In [None]:

# Normalize numerical features (GDP and population) and build the model inputs.
from sklearn.preprocessing import MinMaxScaler

scaled_columns = ['gdp', 'population']

# Find out which row positions will form the training partition BEFORE fitting
# the scaler. train_test_split shuffles by position, so the same test_size and
# random_state yield the same partition as the X/y split further down.
train_positions, _held_out_positions = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=42
)

# Fit the scaler on training rows only, so the min/max of the held-out rows
# never leaks into preprocessing. The fitted scaler is then applied to every
# row, which means test rows may fall slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(data.iloc[train_positions][scaled_columns])
data[scaled_columns] = scaler.transform(data[scaled_columns])

# Feature Engineering: interaction between the scaled GDP and population
data['gdp_population_interaction'] = data['gdp'] * data['population']

# Split dataset into features and target
target = 'total'
X = data[['gdp', 'population', 'sports_index', 'gdp_population_interaction']]
y = data[target]

# Train-Test Split (same partition as `train_positions` above)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Exploratory Data Analysis

# Correlation Matrix
# Restrict to numeric columns: the text columns (iso, ioc, name, continent)
# make a bare data.corr() raise on pandas >= 2.0.
plt.figure(figsize=(10, 8))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Scatter of the `gdp` column against total medals.
# At this point `gdp` holds the min-max scaled values produced earlier in this cell.
plt.scatter(x=data['gdp'], y=data['total'])
plt.title('GDP vs Total Medals')
plt.xlabel('GDP')
plt.ylabel('Total Medals')
plt.show()

## Machine Learning Models

# Fit three regressors on the training split and predict on the test split.
# scikit-learn's fit() returns the estimator itself, so construction and
# fitting are chained into one expression per model.

# Linear Regression
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Decision Tree (seeded for repeatable splits)
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Random Forest (100 trees, seeded)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluation Function
def evaluate_model(y_true, y_pred):
    """Score predictions against the ground truth.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A 3-tuple ``(mae, mse, r2)``: mean absolute error, mean squared
        error and the R^2 score, in that order.
    """
    scorers = (mean_absolute_error, mean_squared_error, r2_score)
    return tuple(score(y_true, y_pred) for score in scorers)

# Evaluate Models: score each model's test-set predictions, keyed by display name
results = {
    model_name: evaluate_model(y_test, predictions)
    for model_name, predictions in (
        ('Linear Regression', y_pred_lr),
        ('Decision Tree', y_pred_dt),
        ('Random Forest', y_pred_rf),
    )
}

# One row per model, one column per metric
results_df = pd.DataFrame(results, index=['MAE', 'MSE', 'R2']).transpose()
print(results_df)

## Deep Learning Model

# Seed every random source Keras may draw from (Python, NumPy, TensorFlow) so
# weight initialisation and batch shuffling repeat between runs, matching the
# random_state=42 used for the scikit-learn models above.
# NOTE(review): some GPU kernels can still be nondeterministic.
import random

random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Build Neural Network: two hidden ReLU layers and a single linear output
# unit for the regression target.
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)  # Output layer
])

# MSE is the training loss; MAE is tracked as an additional metric.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

# Train the model, holding out 20% of the training split for validation
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

# Evaluate the model on the test split
loss, mae = model.evaluate(X_test, y_test)
y_pred_nn = model.predict(X_test)

# Add Neural Network to Results
# predict() returns a 2-D (n_samples, 1) array; flatten it for the metrics.
nn_results = evaluate_model(y_test, y_pred_nn.flatten())
results_df.loc['Neural Network'] = nn_results
print(results_df)

# Plot Training History: per-epoch training and validation loss on one axes
for _history_key, _curve_label in (
    ('loss', 'Train Loss'),
    ('val_loss', 'Validation Loss'),
):
    plt.plot(history.history[_history_key], label=_curve_label)
plt.title('Neural Network Training History')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
