# Olympics Data Analysis and Medal Prediction

An exploratory analysis of historical Olympic medal data, together with a machine learning model built on the same data.

## Install Required Libraries

In [None]:
!pip install pandas numpy matplotlib seaborn plotly



##  Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

##  Upload Dataset
Download the dataset (`olympics_data.csv`) from [Google Drive](https://drive.google.com/file/d/1ugJY0esAtx4BJ0aYeMnjLFesZ6FShHnh/view?usp=sharing), then upload it using the cell below.

In [None]:
import io

from google.colab import files

# files.upload() opens a file picker and returns {original filename: file bytes}.
# Colab renames the on-disk copy when a file with that name already exists
# (e.g. "olympics_data (13).csv"), so reading the hard-coded path
# 'olympics_data.csv' can silently load a stale, earlier upload.
# Parsing the uploaded bytes directly always uses the file just selected.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; please select olympics_data.csv.")

# Only the first uploaded file is used; the notebook expects a single CSV.
uploaded_bytes = next(iter(uploaded.values()))

# The file is not UTF-8 (athlete names contain characters such as "Ö"),
# hence the explicit ISO-8859-1 encoding.
df = pd.read_csv(io.BytesIO(uploaded_bytes), encoding='ISO-8859-1')
df.head()

Saving olympics_data.csv to olympics_data (13).csv


Unnamed: 0,City,Year,Sport,Discipline,Event,Athlete,Gender,Country_Code,Country,Event_gender,Medal
0,Montreal,1976.0,Aquatics,Diving,3m springboard,"KÖHLER, Christa",Women,GDR,East Germany,W,Silver
1,Montreal,1976.0,Aquatics,Diving,3m springboard,"KOSENKOV, Aleksandr",Men,URS,Soviet Union,M,Bronze
2,Montreal,1976.0,Aquatics,Diving,3m springboard,"BOGGS, Philip George",Men,USA,United States,M,Gold
3,Montreal,1976.0,Aquatics,Diving,3m springboard,"CAGNOTTO, Giorgio Franco",Men,ITA,Italy,M,Silver
4,Montreal,1976.0,Aquatics,Diving,10m platform,"WILSON, Deborah Keplar",Women,USA,United States,W,Bronze


##  Initial Data Exploration

In [None]:
# Quick structural overview of the raw frame: sample rows, schema, summary stats.
# display() renders DataFrames as rich tables instead of wrapped plain text.
display(df.head())

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
df.info()

display(df.describe())

       City    Year     Sport Discipline           Event  \
0  Montreal  1976.0  Aquatics     Diving  3m springboard   
1  Montreal  1976.0  Aquatics     Diving  3m springboard   
2  Montreal  1976.0  Aquatics     Diving  3m springboard   
3  Montreal  1976.0  Aquatics     Diving  3m springboard   
4  Montreal  1976.0  Aquatics     Diving    10m platform   

                    Athlete Gender Country_Code        Country Event_gender  \
0           KÖHLER, Christa  Women          GDR   East Germany            W   
1       KOSENKOV, Aleksandr    Men          URS   Soviet Union            M   
2      BOGGS, Philip George    Men          USA  United States            M   
3  CAGNOTTO, Giorgio Franco    Men          ITA          Italy            M   
4    WILSON, Deborah Keplar  Women          USA  United States            W   

    Medal  
0  Silver  
1  Bronze  
2    Gold  
3  Silver  
4  Bronze  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15433 entries, 0 to 15432
Data columns (to

##  Data Cleaning

In [None]:
# Report the number of missing values per column before dropping anything.
# sep="\n" puts the label on its own line; the default separator (a space)
# would otherwise be inserted before the first column name and misalign it.
print("Missing Values:", df.isnull().sum(), sep="\n")

# Drop every row containing at least one missing value. The result goes into a
# new name so the raw `df` stays available and this cell can be re-run safely.
df_cleaned = df.dropna()

# info() prints its own report and returns None, so it is called directly
# rather than through print(), which would emit an extra "None" line.
df_cleaned.info()

Missing Values:
 City            117
Year            117
Sport           117
Discipline      117
Event           117
Athlete         117
Gender          117
Country_Code    117
Country         117
Event_gender    117
Medal           117
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 15316 entries, 0 to 15432
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   City          15316 non-null  object 
 1   Year          15316 non-null  float64
 2   Sport         15316 non-null  object 
 3   Discipline    15316 non-null  object 
 4   Event         15316 non-null  object 
 5   Athlete       15316 non-null  object 
 6   Gender        15316 non-null  object 
 7   Country_Code  15316 non-null  object 
 8   Country       15316 non-null  object 
 9   Event_gender  15316 non-null  object 
 10  Medal         15316 non-null  object 
dtypes: float64(1), object(10)
memory usage: 1.4+ MB
None


##  Top 10 Countries by Medal Count

In [None]:
# Medal rows per country, largest first.
# NOTE(review): this counts rows, so a team medal presumably counts once per
# athlete on the team — confirm that this is the intended definition.
medals_by_country = df_cleaned.groupby('Country')['Medal'].count().sort_values(ascending=False)

# Slice the top 10 once and reshape it into a tidy two-column frame so the
# chart can refer to columns by name. Axis titles, hover labels and the colour
# scale title then all come from the column names, with no `labels` mapping.
top_countries = medals_by_country.head(10).rename('Total Medals').reset_index()

fig = px.bar(
    top_countries,
    x='Country',
    y='Total Medals',
    color='Total Medals',
    title='Top 10 Countries by Medal Count',
)
fig.show()

## Total Medals Over the Years

In [None]:
# Number of medal rows recorded for each value of Year.
medals_over_years = (
    df_cleaned
    .groupby('Year')['Medal']
    .count()
)

# Line chart of the yearly totals. x and y are passed as plain arrays, so the
# `labels` mapping is keyed by the argument names 'x' and 'y'.
fig = px.line(
    x=medals_over_years.index,
    y=medals_over_years.values,
    markers=True,
    title='Medals Won Over the Years',
    labels={'x': 'Year', 'y': 'Total Medals'},
)
fig.show()

##  Gender Distribution in Events

In [None]:
# Number of rows in the cleaned data for each value of the Gender column.
gender_distribution = df_cleaned['Gender'].value_counts()

# Pie chart: one slice per Gender value, sized by its row count.
fig = px.pie(
    values=gender_distribution.values,
    names=gender_distribution.index,
    title='Gender Distribution in Olympic Events',
)
fig.show()

##  Top 10 Athletes by Medal Count

In [None]:
# Medal rows per athlete name, largest first.
# NOTE(review): athletes are identified by name only, so two different people
# sharing a name would be merged — verify against the data if this matters.
athlete_medal_count = df_cleaned.groupby('Athlete')['Medal'].count().sort_values(ascending=False)

# Slice the top 10 once and reshape it into a tidy frame with named columns,
# so the axes are titled "Total Medals" / "Athlete" instead of the generic
# "x" / "y" that result from passing unnamed arrays without a `labels` mapping.
top_athletes = athlete_medal_count.head(10).rename('Total Medals').reset_index()

fig = px.bar(
    top_athletes,
    x='Total Medals',
    y='Athlete',
    orientation='h',
    title='Top 10 Athletes by Medal Count',
)
# Horizontal bars are drawn bottom-up by default, which would place the top
# athlete at the bottom; ordering categories by ascending total puts the
# largest bar at the top of the chart.
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()