## Data Science Pipeline Tutorial

In [41]:
import pandas as pd 
import seaborn 
import matplotlib.pyplot as plt 
import numpy as np 
import statsmodels.api as sm 
import sklearn.ensemble
import sklearn.model_selection
import sklearn.metrics
import sklearn.linear_model 


In [42]:
# Data loading: read the COVID dataset from a CSV file in the current working
# directory into a DataFrame. Nothing is scraped or downloaded in this cell, so
# 'owid-covid-data.csv' must already exist locally.

data = pd.read_csv('owid-covid-data.csv')
# Preview the first five rows to check that the file parsed as expected.
data.head() 

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511


In [44]:

# Rows carrying one of these iso_code values are aggregates (the world, whole
# continents, the EU) rather than individual countries; they are removed so
# they do not dominate the population ranking below.
AGGREGATE_ISO_CODES = [
    'OWID_WRL',  # World
    'OWID_AFR',  # Africa
    'OWID_ASI',  # Asia
    'OWID_EUR',  # Europe
    'OWID_EUN',  # European Union
    'OWID_NAM',  # North America
    'OWID_OCE',  # Oceania
    'OWID_SAM',  # South America
]

# Date of the cross-section used below (the original notebook describes it as
# the latest date available in the dataset: 5/3/2021).
SNAPSHOT_DATE = '2021-05-03'

# Number of most-populous locations kept in the snapshot.
TOP_N_LOCATIONS = 50

# A single membership test replaces eight chained `!=` filters. Rows whose
# iso_code is missing are kept, exactly as the chained comparisons kept them.
data = data[~data.iso_code.isin(AGGREGATE_ISO_CODES)]

# Largest populations first (rows with a missing population sort to the end),
# so that head() on any per-date slice yields the most populous locations.
data = data.sort_values(by=['population'], ascending=False)

# Snapshot for the chosen date, limited to the TOP_N_LOCATIONS most populous
# rows. `.copy()` makes may_03 an independent frame, so assigning columns on it
# later does not raise pandas' SettingWithCopyWarning or touch `data`.
may_03 = data[data.date == SNAPSHOT_DATE].head(TOP_N_LOCATIONS).copy()
may_03


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
16713,CHN,Asia,China,2021-05-03,102549.0,19.0,21.429,4846.0,0.0,0.143,...,15308.712,0.7,261.899,9.74,1.9,48.4,,4.34,76.91,0.761
35734,IND,Asia,India,2021-05-03,20282833.0,357316.0,378092.429,222408.0,3449.0,3502.0,...,6426.674,21.2,282.28,10.39,1.9,20.6,59.55,0.53,69.66,0.645
81797,USA,North America,United States,2021-05-03,32472201.0,50560.0,49618.857,577566.0,483.0,670.429,...,54225.446,1.2,151.089,10.79,19.1,24.6,,2.77,78.86,0.926
36162,IDN,Asia,Indonesia,2021-05-03,1682004.0,4730.0,4980.857,45949.0,153.0,168.286,...,11188.744,5.7,342.864,6.32,2.8,76.1,64.204,1.04,71.72,0.718
59235,PAK,Asia,Pakistan,2021-05-03,837523.0,3377.0,4654.857,18310.0,161.0,140.143,...,5034.708,4.0,423.031,8.35,2.8,36.7,59.607,0.6,67.27,0.557
11434,BRA,South America,Brazil,2021-05-03,14779529.0,24619.0,58586.571,408622.0,983.0,2383.714,...,14103.452,3.4,177.961,8.11,10.1,17.9,,2.2,75.88,0.765
56466,NGA,Africa,Nigeria,2021-05-03,165199.0,46.0,63.286,2063.0,0.0,0.143,...,5338.454,,181.013,2.42,0.6,10.8,41.949,,54.69,0.539
7106,BGD,Asia,Bangladesh,2021-05-03,763682.0,1739.0,2150.571,11644.0,65.0,70.571,...,3523.984,14.8,298.003,8.38,1.0,44.7,34.808,0.8,72.59,0.632
64039,RUS,Europe,Russia,2021-05-03,4776844.0,8368.0,8503.286,109341.0,330.0,365.429,...,24765.954,0.1,431.297,6.18,23.4,58.3,,8.05,72.58,0.824
50338,MEX,North America,Mexico,2021-05-03,2349900.0,1027.0,2909.429,217345.0,112.0,318.857,...,17336.469,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779
