# Tokyo 2020 Athletes Analysis

In [282]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math as math
import datetime as dt
from functools import reduce
from scipy import stats as st
import plotly.express as px
import scipy.stats as stats
import plotly.graph_objects as go
from plotly.graph_objects import Layout

import sys
import warnings
if not sys.warnoptions:
       warnings.simplefilter("ignore")

In [283]:
# upload files

athletes = pd.read_excel('Athletes.xlsx')
coaches = pd.read_excel('Coaches.xlsx')
entriesGender = pd.read_excel('EntriesGender.xlsx')
medals = pd.read_excel('Medals.xlsx')
teams = pd.read_excel('Teams.xlsx')

df_list = [athletes,coaches,entriesGender,medals,teams]

In [284]:
athletes.name = 'athletes'
coaches.name = 'coaches'
entriesGender.name = 'entriesGender'
medals.name = 'medals'
teams.name = 'teams'

In [285]:
display(athletes.sample(5))
display(entriesGender.sample(5))
display(medals.sample(5))
display(teams.sample(5))

Unnamed: 0,Name,NOC,Discipline
9363,STRAHL Martina,Switzerland,Athletics
6484,MIRON Javier,Spain,Athletics
4989,KOCH Marc,Germany,Athletics
784,BARYSEVICH Darya,Belarus,Athletics
11032,ZHENG Shuyin,People's Republic of China,Taekwondo


Unnamed: 0,Discipline,Female,Male,Total
45,Wrestling,96,193,289
10,Canoe Slalom,41,41,82
13,Cycling BMX Racing,24,24,48
31,Sailing,175,175,350
41,Triathlon,55,55,110


Unnamed: 0,Rank,Team/NOC,Gold,Silver,Bronze,Total,Rank by Total
35,30,Philippines,1,0,0,1,42
57,58,Argentina,0,0,1,1,42
50,45,Turkmenistan,0,1,0,1,42
30,30,Ecuador,1,0,0,1,42
46,45,India,0,1,0,1,42


Unnamed: 0,Name,Discipline,NOC,Event
432,Angola,Handball,Angola,Women
474,New Zealand,Hockey,New Zealand,Men
410,Chile,Football,Chile,Women
515,South Africa,Rugby Sevens,South Africa,Men
577,Hungary,Swimming,Hungary,Women's 4 x 200m Freestyle Relay


In [286]:
# check for missing values
for i in df_list:
    display(i.name)
    display(i.isnull().sum())

'athletes'

Name          0
NOC           0
Discipline    0
dtype: int64

'coaches'

Name            0
NOC             0
Discipline      0
Event         145
dtype: int64

'entriesGender'

Discipline    0
Female        0
Male          0
Total         0
dtype: int64

'medals'

Rank             0
Team/NOC         0
Gold             0
Silver           0
Bronze           0
Total            0
Rank by Total    0
dtype: int64

'teams'

Name          0
Discipline    0
NOC           0
Event         0
dtype: int64

## Task 1: check the number of athletes from each country, plot the top 20 countries with most atheletes.

In [287]:
NOC_athletes_cnt = athletes.groupby('NOC')['Name'].count().reset_index().rename(columns = {'Name':'cnt'}).sort_values('cnt', ascending = False)

NOC_athletes_cnt.head(20)

fig = px.bar(NOC_athletes_cnt.head(20), 
             x="NOC", y='cnt', hover_data=['NOC'])


fig.show()

## task 2: show relative compare to country size. 

Link: https://worldpopulationreview.com/

In [288]:
census_data = pd.read_csv('CensusData.csv')
census_data

Unnamed: 0,cca2,name,pop2021,pop2020,pop2050,pop2030,pop2019,pop2015,pop2010,pop2000,pop1990,pop1980,pop1970,area,Density,GrowthRate,WorldPercentage,rank
0,CN,China,1444216.107,1439323.776,1402405.170,1464340.159,1433783.686,1406847.870,1368810.615,1290550.765,1176883.674,1000089.235,827601.394,9706961,148.7815,1.0034,0.1834,1
1,IN,India,1393409.038,1380004.385,1639176.033,1503642.322,1366417.754,1310152.403,1234281.170,1056575.549,873277.798,698952.844,555189.792,3287590,423.8391,1.0097,0.1769,2
2,US,United States,332915.073,331002.651,379419.102,349641.876,329064.917,320878.310,309011.475,281710.909,252120.309,229476.354,209513.341,9372610,35.5200,1.0058,0.0423,3
3,ID,Indonesia,276361.783,273523.615,330904.664,299198.430,270625.568,258383.256,241834.215,211513.823,181413.402,147447.836,114793.178,1904569,145.1046,1.0104,0.0351,4
4,PK,Pakistan,225199.937,220892.340,338013.196,262958.794,216565.318,199426.964,179424.641,142343.578,107647.921,78054.343,58142.060,881912,255.3542,1.0195,0.0286,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,MS,Montserrat,4.977,4.992,4.153,4.763,4.989,4.967,4.899,4.929,10.615,11.607,11.534,102,48.7941,0.9970,0.0000,228
228,FK,Falkland Islands,3.533,3.480,3.243,3.436,3.377,2.834,2.901,2.892,1.982,1.854,1.992,12173,0.2902,1.0152,0.0000,229
229,NU,Niue,1.619,1.626,1.778,1.664,1.615,1.619,1.618,1.899,2.329,3.404,5.135,260,6.2269,0.9957,0.0000,230
230,TK,Tokelau,1.373,1.357,1.565,1.448,1.340,1.252,1.140,1.554,1.608,1.553,1.621,12,114.4167,1.0118,0.0000,231


In [289]:
# fix the numbers to millions 
display(census_data.info()) # pop data is numeric 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cca2             231 non-null    object 
 1   name             232 non-null    object 
 2   pop2021          232 non-null    float64
 3   pop2020          232 non-null    float64
 4   pop2050          232 non-null    float64
 5   pop2030          232 non-null    float64
 6   pop2019          232 non-null    float64
 7   pop2015          232 non-null    float64
 8   pop2010          232 non-null    float64
 9   pop2000          232 non-null    float64
 10  pop1990          232 non-null    float64
 11  pop1980          232 non-null    float64
 12  pop1970          232 non-null    float64
 13  area             232 non-null    int64  
 14  Density          232 non-null    float64
 15  GrowthRate       232 non-null    float64
 16  WorldPercentage  232 non-null    float64
 17  rank            

None

In [293]:
#
pop_cols = [col for col in census_data.columns if 'pop' in col]

#
#for i in pop_cols:
#    census_data[i] = census_data[i].astype(str)
#    census_data[i] = census_data[i].str.replace('\.','').astype(int)

    
for i in pop_cols:
    census_data[i] = census_data[i] * 1000
    census_data[i] = census_data[i].astype(int)


In [294]:
census_data

Unnamed: 0,cca2,name,pop2021,pop2020,pop2050,pop2030,pop2019,pop2015,pop2010,pop2000,pop1990,pop1980,pop1970,area,Density,GrowthRate,WorldPercentage,rank
0,CN,China,1444216107,1439323775,1402405170,1464340159,1433783686,1406847870,1368810615,1290550765,1176883674,1000089235,827601394,9706961,148.7815,1.0034,0.1834,1
1,IN,India,1393409038,1380004385,1639176033,1503642322,1366417754,1310152403,1234281170,1056575549,873277798,698952844,555189792,3287590,423.8391,1.0097,0.1769,2
2,US,United States,332915073,331002651,379419102,349641876,329064917,320878310,309011475,281710909,252120308,229476354,209513341,9372610,35.5200,1.0058,0.0423,3
3,ID,Indonesia,276361783,273523615,330904664,299198430,270625568,258383256,241834215,211513823,181413402,147447836,114793178,1904569,145.1046,1.0104,0.0351,4
4,PK,Pakistan,225199937,220892340,338013196,262958794,216565318,199426963,179424641,142343578,107647921,78054343,58142060,881912,255.3542,1.0195,0.0286,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,MS,Montserrat,4977,4992,4153,4763,4989,4967,4899,4928,10615,11607,11534,102,48.7941,0.9970,0.0000,228
228,FK,Falkland Islands,3533,3480,3243,3436,3377,2833,2901,2892,1982,1854,1992,12173,0.2902,1.0152,0.0000,229
229,NU,Niue,1619,1626,1778,1664,1615,1619,1618,1899,2329,3404,5135,260,6.2269,0.9957,0.0000,230
230,TK,Tokelau,1373,1357,1565,1448,1340,1252,1140,1554,1608,1553,1621,12,114.4167,1.0118,0.0000,231


In [199]:
census_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cca2             231 non-null    object 
 1   name             232 non-null    object 
 2   pop2021          232 non-null    float64
 3   pop2020          232 non-null    float64
 4   pop2050          232 non-null    float64
 5   pop2030          232 non-null    float64
 6   pop2019          232 non-null    float64
 7   pop2015          232 non-null    float64
 8   pop2010          232 non-null    float64
 9   pop2000          232 non-null    float64
 10  pop1990          232 non-null    float64
 11  pop1980          232 non-null    float64
 12  pop1970          232 non-null    float64
 13  area             232 non-null    int64  
 14  Density          232 non-null    float64
 15  GrowthRate       232 non-null    float64
 16  WorldPercentage  232 non-null    float64
 17  rank            

In [296]:
# slice the data for 2021
census_2021 = census_data[['name','pop2021']]

In [56]:
# list of suspected alias
usa_alias = ['United States', 'United States of America', 'US', 'U.S', 'USA', 'U.S.A'] # choose United States of America
russia_alias = ['Russia', 'ROC','Russian Federation'] # choose Russia
china_alias = ['People\'s Republic of China', 'China'] # choose China
uk_alias = ['Great Britain', 'United Kingdom', 'UK'] # Choose Great Britain
south_korea_alias = ['Republic of Korea', 'South Korea'] # Choose South Korea

In [52]:
### check all suspects 

# Korea, United States, china, U,russia 

display(census_2021[census_2021['name'].str.contains("Korea")])
display(census_2021[census_2021['name'].str.contains("United")])
display(census_2021[census_2021['name'].str.contains("America")])
display(census_2021[census_2021['name'].str.contains("England")])
display(census_2021[census_2021['name'].str.contains("Russ")])


Unnamed: 0,name,pop2021
27,South Korea,51305.186
53,North Korea,25887.041


Unnamed: 0,name,pop2021
2,United States,332915.073
20,United Kingdom,68207.116
92,United Arab Emirates,9991.089
197,United States Virgin Islands,104.226


Unnamed: 0,name,pop2021
208,American Samoa,55.1


Unnamed: 0,name,pop2021


Unnamed: 0,name,pop2021
8,Russia,145912.025


In [65]:
census_2021['name'].str.isin(@usa_alias) ### CHECK NOTEBOOK! 

SyntaxError: invalid syntax (<ipython-input-65-4a7e98e9e29c>, line 1)

In [32]:
# census_2021 = census_2021.where(census_2021['name'] == 'United States', ... 

#df_gdp_share.loc[df_gdp_share['share_of_world_gdp_%'] < 1, 'country_edit'] = 'rest_of_the_world'

census_2021.loc[census_2021['name'] == 'United States', 'name'] = 'Unites States of America'

In [33]:
census_2021

Unnamed: 0,name,pop2021
0,China,1444216.107
1,India,1393409.038
2,Unites States of America,332915.073
3,Indonesia,276361.783
4,Pakistan,225199.937
...,...,...
227,Montserrat,4.977
228,Falkland Islands,3.533
229,Niue,1.619
230,Tokelau,1.373


In [None]:
def countries_alias(df,column):
        for row in df.column:
            words = nltk.word_tokenize(row)
            for word in words:
                if word == 'United States' or word == 'United States of America' or word == 'US' or word == 'U.S':
                df.column.replace(row, "real_estate", inplace = True)
        
                if word == 'university' or word == 'college':
                df.column.replace(row, "education", inplace = True)
                
                if word == 'wedding':
                df.column.replace(row, "wedding", inplace = True)
                

In [14]:
#athletes_per_mil = pd.merge(NOC_athletes_cnt, census_2021, left_on=  ['userid', 'column1'],
#                   right_on= ['username', 'column1'], 
#                   how = 'left') 

## Task 3: check which countries show the highest full-represenatation for team sports

In [238]:
teams['Discipline'].nunique()

20

In [272]:
display(teams.head(10))
display(teams.Discipline.nunique())

Unnamed: 0,Name,Discipline,NOC,Event
0,Belgium,3x3 Basketball,Belgium,Men
1,China,3x3 Basketball,People's Republic of China,Men
2,China,3x3 Basketball,People's Republic of China,Women
3,France,3x3 Basketball,France,Women
4,Italy,3x3 Basketball,Italy,Women
5,Japan,3x3 Basketball,Japan,Men
6,Japan,3x3 Basketball,Japan,Women
7,Latvia,3x3 Basketball,Latvia,Men
8,Mongolia,3x3 Basketball,Mongolia,Women
9,Netherlands,3x3 Basketball,Netherlands,Men


20

In [241]:
# test which country send max 

discipline_max_events = teams.groupby('Discipline')['Event'].nunique().reset_index()
discipline_max_events = discipline_max_events.rename(columns = {'Event':'max_events'})
discipline_max_events.head(10)

Unnamed: 0,Discipline,max_events
0,3x3 Basketball,2
1,Archery,3
2,Artistic Gymnastics,2
3,Artistic Swimming,2
4,Athletics,5
5,Baseball/Softball,2
6,Basketball,2
7,Beach Volleyball,2
8,Cycling Track,6
9,Fencing,6


In [242]:
dicipline_NOC = teams.groupby(['Discipline', 'NOC'])['Event'].count().reset_index().rename(columns={'Event':'No_of_Represtative'})
dicipline_NOC = dicipline_NOC.merge(discipline_max_events[['Discipline','max_events']] , on = 'Discipline', how = 'left')

In [243]:
dicipline_NOC.head(10)
#
dicipline_NOC['Full_representation'] = np.where((dicipline_NOC['No_of_Represtative'] / dicipline_NOC['max_events']) == 1, 1, 0)

dicipline_NOC_full_rep = dicipline_NOC.drop(['No_of_Represtative','max_events'], axis = 'columns')

In [273]:
dicipline_NOC_full_rep.sort_values('Full_representation', ascending = False).head(10)

Unnamed: 0,Discipline,NOC,Full_representation
399,Water Polo,United States of America,1
322,Swimming,United States of America,1
327,Table Tennis,Chinese Taipei,1
102,Athletics,Italy,1
103,Athletics,Jamaica,1
259,Hockey,New Zealand,1
105,Athletics,Netherlands,1
261,Hockey,South Africa,1
323,Table Tennis,Australia,1
262,Hockey,Spain,1


In [274]:
percentages = dicipline_NOC_full_rep.groupby('NOC')['Full_representation'].count().reset_index()
percentages.sort_values('Full_representation', ascending = False).head(5)

Unnamed: 0,NOC,Full_representation
42,Japan,20
79,United States of America,18
57,People's Republic of China,17
13,Canada,17
2,Australia,15


In [246]:
#
percentages['Full_representation'] = (percentages['Full_representation'] / dicipline_NOC_full_rep['Discipline'].nunique())

#
precentages = percentages.sort_values('Full_representation', ascending = False)

precentages
# problem to solve - not all countries send represenative at all! Intresting.

Unnamed: 0,NOC,Full_representation
42,Japan,1.00
79,United States of America,0.90
57,People's Republic of China,0.85
13,Canada,0.85
2,Australia,0.75
...,...,...
41,Jamaica,0.05
46,Liechtenstein,0.05
48,Malaysia,0.05
52,Morocco,0.05


In [271]:
# create an interactive graph of top 20 countries with most full representative


# create an interactive graph of top 20 countries with relative full representative
fig = px.bar(percentages.sort_values('Full_representation',ascending = False).iloc[0:30], 
             x="NOC", y='Full_representation', hover_data=['Full_representation'])

# tickangle xaxis
fig.update_layout(xaxis_tickangle=-45)


fig.show()

In [None]:
## Task 2 

In [68]:
import requests
import json

In [75]:
r = requests.get(url = 'https://private-anon-6a7d3eb5a8-olympicsapi.apiary-mock.com/scrape/countries')
rtext = requests.get(url = 'https://private-anon-6a7d3eb5a8-olympicsapi.apiary-mock.com/scrape/countries').text

In [81]:
print(r.content)
print(r.text)

b'{\n    "0":{\n        "id":               Country_ID_as_int,\n        "name":             "Country Name",\n        "olympics-hosted":  [\n                                {\n                                    "id":       "Olympics_ID",\n                                    "year":     "Year"\n                                    "season":   "Season type (Summer/Winter)",   \n                                    "city":     "City Name"\n                                }\n                            ]\n    }\n}'


In [77]:
display(help(r))

Help on Response in module requests.models object:

class Response(builtins.object)
 |  The :class:`Response <Response>` object, which contains a
 |  server's response to an HTTP request.
 |  
 |  Methods defined here:
 |  
 |  __bool__(self)
 |      Returns True if :attr:`status_code` is less than 400.
 |      
 |      This attribute checks if the status code of the response is between
 |      400 and 600 to see if there was a client error or a server error. If
 |      the status code, is between 200 and 400, this will return True. This
 |      is **not** a check to see if the response code is ``200 OK``.
 |  
 |  __enter__(self)
 |  
 |  __exit__(self, *args)
 |  
 |  __getstate__(self)
 |  
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __iter__(self)
 |      Allows you to use a response as an iterator.
 |  
 |  __nonzero__(self)
 |      Returns True if :attr:`status_code` is less than 400.
 |      
 |      This attribute checks if

None

In [None]:
# check how many of team players of of the ball category

# check the total medals by country

# connect to a new data source of census data - find the highest relatibe number of medals 