# Download and Import Dependacies

In [4]:
import pandas as pd

# Data Retrieval


In [44]:
from google.colab import drive
drive.mount('/drive')
%cd ..
%cd /drive/MyDrive/DIC_PROJECT
!ls

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).
/drive/My Drive
/drive/MyDrive/DIC_PROJECT
Olympic_Athlete_Biography.csv	   Olympic_Event_Results.csv  Olympic_Medal_Tally_History.csv
Olympic_Athlete_Event_Details.csv  Olympic_Games_Summary.csv  population_total_long.csv


In [45]:
Athlete_Events = pd.read_csv('Olympic_Athlete_Event_Details.csv')
Event_Results = pd.read_csv('Olympic_Event_Results.csv')
Athlete_Biography = pd.read_csv('Olympic_Athlete_Biography.csv')
Medal_Tally = pd.read_csv('Olympic_Medal_Tally_History.csv')
Games_Summary = pd.read_csv('Olympic_Games_Summary.csv')
Population_Total = pd.read_csv('population_total_long.csv')

In [46]:
print(Athlete_Events.shape)
print(Event_Results.shape)
print(Athlete_Biography.shape)
print(Medal_Tally.shape)
print(Games_Summary.shape)
print(Population_Total.shape)

(316834, 11)
(7394, 12)
(155861, 10)
(1807, 9)
(64, 11)
(12595, 3)


# Event Results cleaning


In [47]:
print(Event_Results.shape)
print(Event_Results.describe())
# print(Event_Results.head(10))

(7394, 12)
          result_id   edition_id
count  7.394000e+03  7394.000000
mean   1.535217e+06    27.647011
std    5.104084e+06    19.200363
min    1.000000e+00     1.000000
25%    3.002525e+04    13.000000
50%    7.077100e+04    22.000000
75%    3.310878e+05    46.000000
max    9.001677e+07    62.000000


1. Removing duplicates if any.

In [48]:
Event_Results.drop_duplicates(inplace=True)

2. Removing unwanted columns.


In [49]:
Event_Results.drop(['sport_url','result_date','result_location','result_format','result_detail','result_description'], axis=1, inplace=True)


In [50]:
print(Event_Results.shape)

(7394, 6)


3. Will convert all text fields to lower for consistency


In [51]:
Event_Results['event_title'] = Event_Results['event_title'].str.strip().str.lower()
Event_Results['edition'] = Event_Results['edition'].str.strip().str.lower()
Event_Results['sport'] = Event_Results['sport'].str.strip().str.lower()
Event_Results['result_participants'] = Event_Results['result_participants'].str.strip().str.lower()

In [52]:
print(Event_Results['result_participants'].head(10))

0    17 from 15 countries
1    31 from 14 countries
2    43 from 15 countries
3    30 from 14 countries
4    16 from 16 countries
5     15 from 3 countries
6    22 from 14 countries
7      7 from 4 countries
8    53 from 21 countries
9      3 from 2 countries
Name: result_participants, dtype: object


4. Creating new columns with total participants and total participating countries for each event

In [53]:
Event_Results[['participants', 'participant_countries']] = Event_Results['result_participants'].str.extract(r'(\d+)\sfrom\s(\d+)')

Event_Results['participants'] = Event_Results['participants'].astype(int)
Event_Results['participant_countries'] = Event_Results['participant_countries'].astype(int)

Event_Results.drop('result_participants', axis=1, inplace=True)

print(Event_Results)

      result_id                              event_title  \
0         30359  super-heavyweight (>105 kilograms), men   
1          1626                     giant slalom, women1   
2            76                             singles, men   
3           962                        1,500 metres, men   
4        258824            canadian singles, slalom, men   
...         ...                                      ...   
7389   19001250                          basketball, men   
7390      84835                         sabre, team, men   
7391      72031                    foil, individual, men   
7392     258676                  beach volleyball, women   
7393      48121                             doubles, men   

                   edition  edition_id             sport  participants  \
0     2004 summer olympics          26     weightlifting            17   
1     1998 winter olympics          46      snowboarding            31   
2     1976 winter olympics          40              luge 

In [54]:
print(Event_Results['participants'].head(10))
print(Event_Results['participant_countries'].head(10))

0    17
1    31
2    43
3    30
4    16
5    15
6    22
7     7
8    53
9     3
Name: participants, dtype: int64
0    15
1    14
2    15
3    14
4    16
5     3
6    14
7     4
8    21
9     2
Name: participant_countries, dtype: int64


In [55]:
print(Event_Results['event_title'].head(10))

0         super-heavyweight (>105 kilograms), men
1                            giant slalom, women1
2                                    singles, men
3                               1,500 metres, men
4                   canadian singles, slalom, men
5               singles, handicap (class ii), men
6                                   shot put, men
7                    800 metres wheelchair, women
8    30 kilometres (mass start, classical), women
9                  kayak singles, 800 metres, men
Name: event_title, dtype: object


5. Separating male and female events by adding a column for each men and women

In [56]:
Event_Results['men'] = Event_Results['event_title'].str.contains(r'\bmen\b', case=False).astype(int)
Event_Results['women'] = Event_Results['event_title'].str.contains('women', case=False).astype(int)

print(Event_Results)

      result_id                              event_title  \
0         30359  super-heavyweight (>105 kilograms), men   
1          1626                     giant slalom, women1   
2            76                             singles, men   
3           962                        1,500 metres, men   
4        258824            canadian singles, slalom, men   
...         ...                                      ...   
7389   19001250                          basketball, men   
7390      84835                         sabre, team, men   
7391      72031                    foil, individual, men   
7392     258676                  beach volleyball, women   
7393      48121                             doubles, men   

                   edition  edition_id             sport  participants  \
0     2004 summer olympics          26     weightlifting            17   
1     1998 winter olympics          46      snowboarding            31   
2     1976 winter olympics          40              luge 

6. Extract year and olympic type from the data

In [57]:
Event_Results[['year', 'olympic_type']] = Event_Results['edition'].str.extract(r'(\d{4})\s+(summer|winter)\s+olympics')

print(Event_Results['year'].head())
print(Event_Results['olympic_type'].head())

0    2004
1    1998
2    1976
3    1928
4    2008
Name: year, dtype: object
0    summer
1    winter
2    winter
3    winter
4    summer
Name: olympic_type, dtype: object


In [58]:
print(Event_Results.shape)

(7394, 11)


7. Drop rows where essential columns are null

In [59]:
Event_Results.dropna(subset=['event_title', 'sport', 'participants', 'participant_countries', 'men', 'women','year','olympic_type'], inplace=True)

In [60]:
print(Event_Results)

      result_id                              event_title  \
0         30359  super-heavyweight (>105 kilograms), men   
1          1626                     giant slalom, women1   
2            76                             singles, men   
3           962                        1,500 metres, men   
4        258824            canadian singles, slalom, men   
...         ...                                      ...   
7389   19001250                          basketball, men   
7390      84835                         sabre, team, men   
7391      72031                    foil, individual, men   
7392     258676                  beach volleyball, women   
7393      48121                             doubles, men   

                   edition  edition_id             sport  participants  \
0     2004 summer olympics          26     weightlifting            17   
1     1998 winter olympics          46      snowboarding            31   
2     1976 winter olympics          40              luge 

One hot encode olympic type

In [61]:
# One-hot encode the 'olympic_type' column without a prefix
olympic_dummies = pd.get_dummies(Event_Results['olympic_type'])

# Convert the dummy columns from boolean to integer (0/1)
olympic_dummies = olympic_dummies.astype(int)

# Concatenate the one-hot encoded columns back to the original DataFrame
Event_Results = pd.concat([Event_Results, olympic_dummies], axis=1)

print(Event_Results)

      result_id                              event_title  \
0         30359  super-heavyweight (>105 kilograms), men   
1          1626                     giant slalom, women1   
2            76                             singles, men   
3           962                        1,500 metres, men   
4        258824            canadian singles, slalom, men   
...         ...                                      ...   
7389   19001250                          basketball, men   
7390      84835                         sabre, team, men   
7391      72031                    foil, individual, men   
7392     258676                  beach volleyball, women   
7393      48121                             doubles, men   

                   edition  edition_id             sport  participants  \
0     2004 summer olympics          26     weightlifting            17   
1     1998 winter olympics          46      snowboarding            31   
2     1976 winter olympics          40              luge 

In [62]:
print(Event_Results.shape)
print(Event_Results.describe())
print(Event_Results.head(10))

(7375, 13)
          result_id   edition_id  participants  participant_countries  \
count  7.375000e+03  7375.000000   7375.000000            7375.000000   
mean   1.538923e+06    27.672136     40.640407              17.216814   
std    5.110132e+06    19.190632     35.551653              11.030743   
min    1.000000e+00     1.000000      1.000000               1.000000   
25%    2.998250e+04    13.000000     19.000000              10.000000   
50%    7.078800e+04    22.000000     32.000000              16.000000   
75%    3.315630e+05    46.000000     51.000000              23.000000   
max    9.001677e+07    62.000000    312.000000              90.000000   

               men        women       summer       winter  
count  7375.000000  7375.000000  7375.000000  7375.000000  
mean      0.635661     0.273220     0.833627     0.166373  
std       0.481277     0.445643     0.372440     0.372440  
min       0.000000     0.000000     0.000000     0.000000  
25%       0.000000     0.000000