In [16]:
import pandas as pd
from scipy.stats import spearmanr

In [13]:
cardio_alco = pd.read_csv(r'data/cardio_alco.csv', delimiter=';')
cardio_base = pd.read_csv(r'data/cardio_base.csv')
covid_data = pd.read_csv(r'data/covid_data.csv')

In [14]:
cardio_alco.head(10)

Unnamed: 0,id,alco
0,44,0
1,45,0
2,46,0
3,47,0
4,49,0
5,51,0
6,52,0
7,53,0
8,54,0
9,56,0


In [11]:
cardio_base.head(10)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,smoke
0,0,18393,2,168,62.0,110,80,1,0
1,1,20228,1,156,85.0,140,90,3,0
2,2,18857,1,165,64.0,130,70,3,0
3,3,17623,2,169,82.0,150,100,1,0
4,4,17474,1,156,56.0,100,60,1,0
5,8,21914,1,151,67.0,120,80,2,0
6,9,22113,1,157,93.0,130,80,3,0
7,12,22584,2,178,95.0,130,90,3,0
8,13,17668,1,158,71.0,110,70,1,0
9,14,19834,1,164,68.0,110,60,1,0


In [12]:
covid_data.head(10)

Unnamed: 0,location,date,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
0,Afghanistan,2019-12-31,0,0,38928341.0,2.581,1803.987,0.5
1,Afghanistan,2020-01-01,0,0,38928341.0,2.581,1803.987,0.5
2,Afghanistan,2020-01-02,0,0,38928341.0,2.581,1803.987,0.5
3,Afghanistan,2020-01-03,0,0,38928341.0,2.581,1803.987,0.5
4,Afghanistan,2020-01-04,0,0,38928341.0,2.581,1803.987,0.5
5,Afghanistan,2020-01-05,0,0,38928341.0,2.581,1803.987,0.5
6,Afghanistan,2020-01-06,0,0,38928341.0,2.581,1803.987,0.5
7,Afghanistan,2020-01-07,0,0,38928341.0,2.581,1803.987,0.5
8,Afghanistan,2020-01-08,0,0,38928341.0,2.581,1803.987,0.5
9,Afghanistan,2020-01-09,0,0,38928341.0,2.581,1803.987,0.5


# Calculating spearman correlation rank

In [18]:
spearman_corr = cardio_base.corr(method='spearman')

In [23]:
spearman_corr['age']['weight']

0.06155893830937044

In [26]:
spearman_corr['age']['ap_hi']

0.21908652588163743

In [27]:
spearman_corr['age']['ap_lo']

0.15669952606369209

In [28]:
spearman_corr['ap_hi']['ap_lo']

0.7354361051379777

In [30]:
spearman_corr['gender']['height']

0.5338045129524459

# Percentage of the population over 50 years cumnsuming alcohol

In [33]:
cardio_merged = pd.merge(cardio_base, cardio_alco, how='inner')

In [36]:
cardio_merged['age_years'] = cardio_merged['age']/365

In [51]:
100 * cardio_merged[(cardio_merged['age_years'] > 50) & (cardio_merged['alco'] == 1)].size/cardio_merged[cardio_merged['age_years'] > 50].size

4.954806694179305

### Calculate F1 score for statement that countries where more than 20% of the population is over 65 years old, have death rate over 50 per milliom inhabitants

In [53]:
from sklearn.metrics import f1_score

In [67]:
covid_data.sample(5)

Unnamed: 0,location,date,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
21077,Tunisia,2020-03-11,3,0,11818618.0,8.001,10849.297,2.3
7661,Gabon,2020-06-03,148,3,2225728.0,4.45,16562.413,6.3
1113,Australia,2020-02-18,0,0,25499881.0,15.504,44648.71,3.84
20331,Syria,2020-05-03,0,0,17500657.0,,,1.5
8714,Guatemala,2020-05-09,68,1,17915567.0,4.694,7423.808,0.6


get the latest population and aged_65_older_percent for each location

In [78]:
covid_data_latest = covid_data.sort_values('date', ascending=False).groupby('location', as_index=False).apply(lambda x: x.iloc[0])

In [81]:
covid_data_latest.sample(5)

Unnamed: 0,location,date,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
151,Peru,2020-06-10,4040,167,32971846.0,7.151,12236.706,1.6
179,Sri Lanka,2020-06-10,2,0,21413250.0,10.069,11669.077,3.6
154,Portugal,2020-06-10,421,7,10196707.0,21.502,27936.896,3.39
45,Costa Rica,2020-06-10,33,1,5094114.0,9.468,15524.995,1.13
80,Guatemala,2020-06-10,364,22,17915567.0,4.694,7423.808,0.6


In [87]:
covid_aged_65_over_20p = covid_data_latest[covid_data_latest['aged_65_older_percent'] > 20]

get the total number of deaths for the entire period for each location


In [79]:
covid_data_deaths = covid_data.groupby(by=['location'], as_index=False).sum()[['location', 'new_deaths']]

In [82]:
covid_data_deaths.sample(5)

Unnamed: 0,location,new_deaths
119,Malawi,4
89,Iceland,10
200,United States,112006
25,Bosnia and Herzegovina,159
59,El Salvador,60


merge datasets

In [89]:
covid_data_merged = pd.merge(covid_data_deaths, covid_aged_65_over_20p, on='location', how='inner')

In [95]:
covid_data_merged

Unnamed: 0,location,new_deaths_x,date,new_cases,new_deaths_y,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
0,Bulgaria,167,2020-06-10,79,3,6948445.0,20.801,18563.307,7.454
1,Finland,324,2020-06-10,24,1,5540718.0,21.228,40585.721,3.28
2,Germany,8729,2020-06-10,318,18,83783945.0,21.453,45229.245,8.0
3,Greece,183,2020-06-10,9,1,10423056.0,20.396,24574.382,4.21
4,Italy,34043,2020-06-10,283,79,60461828.0,23.021,35220.084,3.18
5,Japan,919,2020-06-10,41,3,126476458.0,27.049,39002.223,13.05
6,Portugal,1492,2020-06-10,421,7,10196707.0,21.502,27936.896,3.39


In [91]:
covid_data_merged[(covid_data_merged['new_deaths_x']/covid_data_merged['population']) > 50/1000000]

Unnamed: 0,location,new_deaths_x,date,new_cases,new_deaths_y,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
1,Finland,324,2020-06-10,24,1,5540718.0,21.228,40585.721,3.28
2,Germany,8729,2020-06-10,318,18,83783945.0,21.453,45229.245,8.0
4,Italy,34043,2020-06-10,283,79,60461828.0,23.021,35220.084,3.18
6,Portugal,1492,2020-06-10,421,7,10196707.0,21.502,27936.896,3.39


In [92]:
covid_data_merged[(covid_data_merged['new_deaths_x']/covid_data_merged['population']) < 50/1000000]

Unnamed: 0,location,new_deaths_x,date,new_cases,new_deaths_y,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
0,Bulgaria,167,2020-06-10,79,3,6948445.0,20.801,18563.307,7.454
3,Greece,183,2020-06-10,9,1,10423056.0,20.396,24574.382,4.21
5,Japan,919,2020-06-10,41,3,126476458.0,27.049,39002.223,13.05


F1 Score = 2 * (Precision * Recall) / (Precision + Recall)

Precision: Correct positive predictions relative to total positive predictions

Recall: Correct positive predictions relative to total actual positives

In [96]:
4/7

0.5714285714285714

# When did the difference in the total number of confirmed cases between Italy and Germany become more than 10000

In [129]:
covid_data.date = pd.to_datetime(covid_data.date)

In [142]:
covid_data_italy = covid_data[covid_data.location == 'Italy'].groupby(by=['date']).sum()
covid_data_italy = covid_data_italy.groupby(covid_data_italy.index.year).cumsum().reset_index()

In [143]:
covid_data_germany = covid_data[covid_data.location == 'Germany'].groupby(by=['date']).sum()
covid_data_germany = covid_data_germany.groupby(covid_data_germany.index.year).cumsum().reset_index()

In [148]:
covid_data_germany_italy = pd.merge(covid_data_italy, covid_data_germany, on='date', suffixes=['_italy', '_germany'])

In [157]:
covid_data_germany_italy['new_cases_diff'] = abs(covid_data_germany_italy['new_cases_italy']-covid_data_germany_italy['new_cases_germany'])

In [159]:
covid_data_germany_italy[covid_data_germany_italy['new_cases_diff'] > 10000].sort_values(by=['date'])[['date', 'new_cases_diff']].head(1)

Unnamed: 0,date,new_cases_diff
72,2020-03-12,10895


# Fit an exponential function for Italy's cases between 2020-02-28 and 2020-03-20

In [161]:
covid_data_italy.dtypes

date                          datetime64[ns]
new_cases                              int64
new_deaths                             int64
population                           float64
aged_65_older_percent                float64
gdp_per_capita                       float64
hospital_beds_per_thousand           float64
dtype: object

In [163]:
pd.to_datetime('28-02-2020')

Timestamp('2020-02-28 00:00:00')

In [162]:
datetime.strptime('28/02/2020', '%d %m %y')

ValueError: time data '28/02/2020' does not match format '%d %m %y'

In [167]:
covid_data_italy[(covid_data_italy['date'] >= pd.to_datetime('28-02-2020')) & (covid_data_italy['date'] <= pd.to_datetime('20-03-2020'))]

Unnamed: 0,date,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
59,2020-02-28,650,17,3567248000.0,1358.239,2077984.956,187.62
60,2020-02-29,888,21,3627710000.0,1381.26,2113205.04,190.8
61,2020-03-01,1128,29,3688172000.0,1404.281,2148425.124,193.98
62,2020-03-02,1689,35,3748633000.0,1427.302,2183645.208,197.16
63,2020-03-03,2036,52,3809095000.0,1450.323,2218865.292,200.34
64,2020-03-04,2502,80,3869557000.0,1473.344,2254085.376,203.52
65,2020-03-05,3089,107,3930019000.0,1496.365,2289305.46,206.7
66,2020-03-06,3858,148,3990481000.0,1519.386,2324525.544,209.88
67,2020-03-07,4636,197,4050942000.0,1542.407,2359745.628,213.06
68,2020-03-08,5883,233,4111404000.0,1565.428,2394965.712,216.24


# Percentage of people more than 2 standart deviations far from the average height

In [174]:
height_avg = cardio_base.height.mean()

In [175]:
height_std = cardio_base.height.std()

In [178]:
low = height_avg - (2 * height_std)
high = height_avg + (2 * height_std)

In [183]:
100*cardio_base[(cardio_base['height'] < low) | (cardio_base['height'] > high)].size/cardio_base.size

3.335714285714286

# Are men more likely to be a smoker?

In [186]:
cardio_base.groupby(by=['gender']).mean()

Unnamed: 0_level_0,id,age,height,weight,ap_hi,ap_lo,cholesterol,smoke
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,49898.345267,19510.124577,161.355612,72.565605,128.139249,94.522776,1.384735,0.017856
2,50110.246547,19392.097875,169.947895,77.257307,130.078872,100.551982,1.333633,0.21888


In [187]:
cardio_base_men = cardio_base[cardio_base['gender'] == 2]
cardio_base_women = cardio_base[cardio_base['gender'] == 1]

In [196]:
cardio_base_men.sum()

id             1.226198e+09
age            4.745246e+08
gender         4.894000e+04
height         4.158625e+06
weight         1.890486e+06
ap_hi          3.183030e+06
ap_lo          2.460507e+06
cholesterol    3.263400e+04
smoke          5.356000e+03
dtype: float64

In [197]:
cardio_base_women.sum()

id             2.271872e+09
age            8.882960e+08
gender         4.553000e+04
height         7.346521e+06
weight         3.303912e+06
ap_hi          5.834180e+06
ap_lo          4.303622e+06
cholesterol    6.304700e+04
smoke          8.130000e+02
dtype: float64

In [192]:
men_smoker_chance = cardio_base_men[cardio_base_men['smoke']==1].size/cardio_base_men.size
women_smoker_chance = cardio_base_women[cardio_base_women['smoke']==1].size/cardio_base_women.size

In [193]:
men_smoker_chance/women_smoker_chance

12.257833097333833

# How much heavier is the group with the highest average weight than the group with the lowest weight?

In [201]:
cardio_base['age_years'] = round(cardio_base['age']/365).astype(int)

In [208]:
age_years_mean = cardio_base.groupby('age_years').mean().sort_values(by='weight')

In [219]:
age_years_mean.tail(1).weight.item()/age_years_mean.head(1).weight.item()

1.1213754646840148

# Which country has the 3rd highest death rate?

In [226]:
covid_data_deaths = covid_data_deaths.rename(columns={'new_deaths': 'total_deaths'})

In [222]:
covid_data_latest.head()

Unnamed: 0,location,date,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
0,Afghanistan,2020-06-10,542,15,38928341.0,2.581,1803.987,0.5
1,Albania,2020-06-10,36,0,2877800.0,13.188,11803.431,2.89
2,Algeria,2020-06-10,117,9,43851043.0,6.211,13913.839,1.9
3,Andorra,2020-06-10,0,0,77265.0,,,
4,Angola,2020-06-10,4,0,32866268.0,2.405,5819.495,


In [228]:
covid_data_latest_deaths = pd.merge(covid_data_latest, covid_data_deaths, on='location', how='inner')

In [230]:
covid_data_latest_deaths['death_to_population'] = covid_data_latest_deaths.total_deaths/covid_data_latest_deaths.population

In [239]:
covid_data_latest_deaths.sort_values(by='death_to_population',ascending=False).head(3).loc[3, 'location']

'Andorra'

# Probability country had gdp over 10000 iif they have at least 5 hospital beds per 1000 inhabitants

In [243]:
covid_data_latest.head(3)

Unnamed: 0,location,date,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
0,Afghanistan,2020-06-10,542,15,38928341.0,2.581,1803.987,0.5
1,Albania,2020-06-10,36,0,2877800.0,13.188,11803.431,2.89
2,Algeria,2020-06-10,117,9,43851043.0,6.211,13913.839,1.9


In [248]:
covid_data_latest[(covid_data_latest['hospital_beds_per_thousand']>=5) & (covid_data_latest['gdp_per_capita']>10000)].size/covid_data_latest[(covid_data_latest['hospital_beds_per_thousand']>=5)].size

0.8214285714285714

In [251]:
covid_data[(covid_data['hospital_beds_per_thousand']>=5) & (covid_data['gdp_per_capita']>10000)].size/covid_data[(covid_data['hospital_beds_per_thousand']>=5)].size

0.8565304372677908

# How tall is the tallest 1% of people

In [256]:
100*(cardio_base[cardio_base['height']>=184].size/cardio_base.size)

1.0871428571428572

# Which is 95% confidence True?

In [261]:
cardio_base[cardio_base['smoke']==1].mean()

id             49629.119630
age            19090.836927
gender             1.868212
height           169.323877
weight            77.344351
ap_hi            128.360512
ap_lo             99.774518
cholesterol        1.389528
smoke              1.000000
age_years         52.296645
dtype: float64

In [262]:
cardio_base[cardio_base['smoke']==0].mean()

id             50005.598440
age            19505.400730
gender             1.299447
height           163.879416
weight            73.902352
ap_hi            128.861431
ap_lo             96.326550
cholesterol        1.364682
smoke              0.000000
age_years         53.439395
dtype: float64

In [264]:
cardio_base[cardio_base['gender']==2].mean()

id             50110.246547
age            19392.097875
gender             2.000000
height           169.947895
weight            77.257307
ap_hi            130.078872
ap_lo            100.551982
cholesterol        1.333633
smoke              0.218880
age_years         53.126277
dtype: float64

In [263]:
cardio_base[cardio_base['gender']==1].mean()

id             49898.345267
age            19510.124577
gender             1.000000
height           161.355612
weight            72.565605
ap_hi            128.139249
ap_lo             94.522776
cholesterol        1.384735
smoke              0.017856
age_years         53.452844
dtype: float64

In [266]:
cardio_base[cardio_base['age_years']>=50].mean()

id             50029.452342
age            20672.047511
gender             1.336910
height           164.016566
weight            74.570140
ap_hi            130.212113
ap_lo             98.834950
cholesterol        1.415602
smoke              0.080584
age_years         56.640631
dtype: float64

In [267]:
cardio_base[cardio_base['age_years']<50].mean()

id             49816.519868
age            16179.929596
gender             1.384180
height           165.295909
weight            73.209455
ap_hi            125.004480
ap_lo             90.604246
cholesterol        1.233666
smoke              0.108752
age_years         44.312710
dtype: float64