# Tokyo 2020 Athletes Analysis

In [305]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math as math
import datetime as dt
from functools import reduce
from scipy import stats as st
import plotly.express as px
import scipy.stats as stats
import plotly.graph_objects as go
from plotly.graph_objects import Layout

import sys
import warnings
if not sys.warnoptions:
       warnings.simplefilter("ignore")

In [306]:
# upload files

athletes = pd.read_excel('Athletes.xlsx')
coaches = pd.read_excel('Coaches.xlsx')
entriesGender = pd.read_excel('EntriesGender.xlsx')
medals = pd.read_excel('Medals.xlsx')
teams = pd.read_excel('Teams.xlsx')

df_list = [athletes,coaches,entriesGender,medals,teams]

In [307]:
athletes.name = 'athletes'
coaches.name = 'coaches'
entriesGender.name = 'entriesGender'
medals.name = 'medals'
teams.name = 'teams'

In [308]:
display(athletes.sample(5))
display(entriesGender.sample(5))
display(medals.sample(5))
display(teams.sample(5))

Unnamed: 0,Name,NOC,Discipline
10840,YEE Sally,Fiji,Table Tennis
8096,RECBER Hakan,Turkey,Taekwondo
3651,HAGINO Kosuke,Japan,Swimming
8241,ROCHE Mark,Ireland,Rugby Sevens
5698,LOPAS Jack,New Zealand,Rowing


Unnamed: 0,Discipline,Female,Male,Total
33,Skateboarding,40,40,80
34,Sport Climbing,20,20,40
38,Taekwondo,65,65,130
22,Handball,168,168,336
37,Table Tennis,86,86,172


Unnamed: 0,Rank,Team/NOC,Gold,Silver,Bronze,Total,Rank by Total
27,27,Ireland,1,0,1,2,29
46,45,India,0,1,0,1,42
12,13,Hungary,2,1,2,5,15
10,11,Netherlands,2,7,4,13,8
28,27,Uzbekistan,1,0,1,2,29


Unnamed: 0,Name,Discipline,NOC,Event
248,Bansley/Brandie,Beach Volleyball,Canada,Women
233,Republic of Korea,Basketball,Republic of Korea,Women
615,Serbia,Swimming,Serbia,Men's 4 x 100m Freestyle Relay
504,Great Britain,Rugby Sevens,Great Britain,Men
650,France,Table Tennis,France,Men's Team


In [309]:
# check for missing values
for i in df_list:
    display(i.name)
    display(i.isnull().sum())

'athletes'

Name          0
NOC           0
Discipline    0
dtype: int64

'coaches'

Name            0
NOC             0
Discipline      0
Event         145
dtype: int64

'entriesGender'

Discipline    0
Female        0
Male          0
Total         0
dtype: int64

'medals'

Rank             0
Team/NOC         0
Gold             0
Silver           0
Bronze           0
Total            0
Rank by Total    0
dtype: int64

'teams'

Name          0
Discipline    0
NOC           0
Event         0
dtype: int64

## Task 1: check the number of athletes from each country, plot the top 20 countries with most atheletes.

In [310]:
NOC_athletes_cnt = athletes.groupby('NOC')['Name'].count().reset_index().rename(columns = {'Name':'cnt'}).sort_values('cnt', ascending = False)

NOC_athletes_cnt.head(20)

fig = px.bar(NOC_athletes_cnt.head(20), 
             x="NOC", y='cnt', hover_data=['NOC'])


fig.show()

## task 2: show relative compare to country size. 

Link: https://worldpopulationreview.com/

In [315]:
census_data = pd.read_csv('CensusData.csv')
display(census_data.head(5))

Unnamed: 0,cca2,name,pop2021,pop2020,pop2050,pop2030,pop2019,pop2015,pop2010,pop2000,pop1990,pop1980,pop1970,area,Density,GrowthRate,WorldPercentage,rank
0,CN,China,1444216.107,1439323.776,1402405.17,1464340.159,1433783.686,1406847.87,1368810.615,1290550.765,1176883.674,1000089.235,827601.394,9706961,148.7815,1.0034,0.1834,1
1,IN,India,1393409.038,1380004.385,1639176.033,1503642.322,1366417.754,1310152.403,1234281.17,1056575.549,873277.798,698952.844,555189.792,3287590,423.8391,1.0097,0.1769,2
2,US,United States,332915.073,331002.651,379419.102,349641.876,329064.917,320878.31,309011.475,281710.909,252120.309,229476.354,209513.341,9372610,35.52,1.0058,0.0423,3
3,ID,Indonesia,276361.783,273523.615,330904.664,299198.43,270625.568,258383.256,241834.215,211513.823,181413.402,147447.836,114793.178,1904569,145.1046,1.0104,0.0351,4
4,PK,Pakistan,225199.937,220892.34,338013.196,262958.794,216565.318,199426.964,179424.641,142343.578,107647.921,78054.343,58142.06,881912,255.3542,1.0195,0.0286,5


In [317]:
census_2021 = census_data[['cca2','name','pop2021']]
display(census_2021.head(5))

Unnamed: 0,cca2,name,pop2021
0,CN,China,1444216.107
1,IN,India,1393409.038
2,US,United States,332915.073
3,ID,Indonesia,276361.783
4,PK,Pakistan,225199.937


In [319]:
print('test_rep')

test_rep


## Task 3: check which countries show the highest full-represenatation for team sports

In [238]:
teams['Discipline'].nunique()

20

In [272]:
display(teams.head(10))
display(teams.Discipline.nunique())

Unnamed: 0,Name,Discipline,NOC,Event
0,Belgium,3x3 Basketball,Belgium,Men
1,China,3x3 Basketball,People's Republic of China,Men
2,China,3x3 Basketball,People's Republic of China,Women
3,France,3x3 Basketball,France,Women
4,Italy,3x3 Basketball,Italy,Women
5,Japan,3x3 Basketball,Japan,Men
6,Japan,3x3 Basketball,Japan,Women
7,Latvia,3x3 Basketball,Latvia,Men
8,Mongolia,3x3 Basketball,Mongolia,Women
9,Netherlands,3x3 Basketball,Netherlands,Men


20

In [241]:
# test which country send max 

discipline_max_events = teams.groupby('Discipline')['Event'].nunique().reset_index()
discipline_max_events = discipline_max_events.rename(columns = {'Event':'max_events'})
discipline_max_events.head(10)

Unnamed: 0,Discipline,max_events
0,3x3 Basketball,2
1,Archery,3
2,Artistic Gymnastics,2
3,Artistic Swimming,2
4,Athletics,5
5,Baseball/Softball,2
6,Basketball,2
7,Beach Volleyball,2
8,Cycling Track,6
9,Fencing,6


In [242]:
dicipline_NOC = teams.groupby(['Discipline', 'NOC'])['Event'].count().reset_index().rename(columns={'Event':'No_of_Represtative'})
dicipline_NOC = dicipline_NOC.merge(discipline_max_events[['Discipline','max_events']] , on = 'Discipline', how = 'left')

In [243]:
dicipline_NOC.head(10)
#
dicipline_NOC['Full_representation'] = np.where((dicipline_NOC['No_of_Represtative'] / dicipline_NOC['max_events']) == 1, 1, 0)

dicipline_NOC_full_rep = dicipline_NOC.drop(['No_of_Represtative','max_events'], axis = 'columns')

In [273]:
dicipline_NOC_full_rep.sort_values('Full_representation', ascending = False).head(10)

Unnamed: 0,Discipline,NOC,Full_representation
399,Water Polo,United States of America,1
322,Swimming,United States of America,1
327,Table Tennis,Chinese Taipei,1
102,Athletics,Italy,1
103,Athletics,Jamaica,1
259,Hockey,New Zealand,1
105,Athletics,Netherlands,1
261,Hockey,South Africa,1
323,Table Tennis,Australia,1
262,Hockey,Spain,1


In [274]:
percentages = dicipline_NOC_full_rep.groupby('NOC')['Full_representation'].count().reset_index()
percentages.sort_values('Full_representation', ascending = False).head(5)

Unnamed: 0,NOC,Full_representation
42,Japan,20
79,United States of America,18
57,People's Republic of China,17
13,Canada,17
2,Australia,15


In [246]:
#
percentages['Full_representation'] = (percentages['Full_representation'] / dicipline_NOC_full_rep['Discipline'].nunique())

#
precentages = percentages.sort_values('Full_representation', ascending = False)

precentages
# problem to solve - not all countries send represenative at all! Intresting.

Unnamed: 0,NOC,Full_representation
42,Japan,1.00
79,United States of America,0.90
57,People's Republic of China,0.85
13,Canada,0.85
2,Australia,0.75
...,...,...
41,Jamaica,0.05
46,Liechtenstein,0.05
48,Malaysia,0.05
52,Morocco,0.05


In [271]:
# create an interactive graph of top 20 countries with most full representative


# create an interactive graph of top 20 countries with relative full representative
fig = px.bar(percentages.sort_values('Full_representation',ascending = False).iloc[0:30], 
             x="NOC", y='Full_representation', hover_data=['Full_representation'])

# tickangle xaxis
fig.update_layout(xaxis_tickangle=-45)


fig.show()

In [None]:
## Task 2 

In [68]:
import requests
import json

In [75]:
r = requests.get(url = 'https://private-anon-6a7d3eb5a8-olympicsapi.apiary-mock.com/scrape/countries')
rtext = requests.get(url = 'https://private-anon-6a7d3eb5a8-olympicsapi.apiary-mock.com/scrape/countries').text

In [81]:
print(r.content)
print(r.text)

b'{\n    "0":{\n        "id":               Country_ID_as_int,\n        "name":             "Country Name",\n        "olympics-hosted":  [\n                                {\n                                    "id":       "Olympics_ID",\n                                    "year":     "Year"\n                                    "season":   "Season type (Summer/Winter)",   \n                                    "city":     "City Name"\n                                }\n                            ]\n    }\n}'


In [77]:
display(help(r))

Help on Response in module requests.models object:

class Response(builtins.object)
 |  The :class:`Response <Response>` object, which contains a
 |  server's response to an HTTP request.
 |  
 |  Methods defined here:
 |  
 |  __bool__(self)
 |      Returns True if :attr:`status_code` is less than 400.
 |      
 |      This attribute checks if the status code of the response is between
 |      400 and 600 to see if there was a client error or a server error. If
 |      the status code, is between 200 and 400, this will return True. This
 |      is **not** a check to see if the response code is ``200 OK``.
 |  
 |  __enter__(self)
 |  
 |  __exit__(self, *args)
 |  
 |  __getstate__(self)
 |  
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __iter__(self)
 |      Allows you to use a response as an iterator.
 |  
 |  __nonzero__(self)
 |      Returns True if :attr:`status_code` is less than 400.
 |      
 |      This attribute checks if

None

In [None]:
# check how many of team players of of the ball category

# check the total medals by country

# connect to a new data source of census data - find the highest relatibe number of medals 