### In this Notebook will answer a few questions through analysis of the data files.
Let's load the data from the CSV file into a Pandas data frame.

In [72]:
import pandas as pd
import numpy as np
countries_df=pd.read_csv('countries.csv')
countries_df.describe()

Unnamed: 0,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita
count,210.0,207.0,164.0,183.0
mean,36942760.0,73.529855,3.012671,19485.912399
std,142509200.0,7.560056,2.464917,19820.802488
min,809.0,53.28,0.1,661.24
25%,1031042.0,69.26,1.3,4504.151
50%,6909866.0,75.09,2.3585,13111.214
75%,26158680.0,78.94,3.89575,28349.9835
max,1439324000.0,86.75,13.8,116935.6


#### How many countries does the dataframe contain?

In [8]:
num_countries,m = countries_df.shape
# or
num_countries=len(countries_df)
print('There are {} countries in the dataset'.format(num_countries))

There are 210 countries in the dataset


#### Retrieve a list of continents from the dataframe

In [11]:
continents = countries_df.continent.unique()
continents

array(['Asia', 'Europe', 'Africa', 'North America', 'South America',
       'Oceania'], dtype=object)

#### What is the total population of all the countries listed in this dataset?

In [12]:
total_population = countries_df.population.sum()
print('The total population is {}.'.format(int(total_population)))

The total population is 7757980095.


#### What is the overall life expectancy across in the world?

In [15]:
countries_df['weighted']=countries_df.population*countries_df.life_expectancy
overall=countries_df.weighted.sum()/total_population
countries_df.drop(columns=['weighted'], inplace=True)
print(f'The overall life expectancy across in the world is {overall}')

The overall life expectancy across in the world is 72.72165193409664


#### Create a dataframe containing 10 countries with the highest population.

In [17]:
most_populous_df = countries_df.sort_values('population',ascending=False).location.head()
most_populous_df

41             China
90             India
199    United States
91         Indonesia
145         Pakistan
Name: location, dtype: object

#### Add a new column in `countries_df` to record the overall GDP per country (product of population & per capita GDP).

In [20]:
countries_df['overall_gdp'] = countries_df.population*countries_df.gdp_per_capita
countries_df

Unnamed: 0,location,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita,gdp,overall_gdp
0,Afghanistan,Asia,38928341.0,64.83,0.50,1803.987,7.022622e+10,7.022622e+10
1,Albania,Europe,2877800.0,78.57,2.89,11803.431,3.396791e+10,3.396791e+10
2,Algeria,Africa,43851043.0,76.88,1.90,13913.839,6.101364e+11,6.101364e+11
3,Andorra,Europe,77265.0,83.73,,,,
4,Angola,Africa,32866268.0,61.15,,5819.495,1.912651e+11,1.912651e+11
...,...,...,...,...,...,...,...,...
205,Vietnam,Asia,97338583.0,75.40,2.60,6171.884,6.007624e+11,6.007624e+11
206,Western Sahara,Africa,597330.0,70.26,,,,
207,Yemen,Asia,29825968.0,66.12,0.70,1479.147,4.411699e+10,4.411699e+10
208,Zambia,Africa,18383956.0,63.89,2.00,3689.251,6.782303e+10,6.782303e+10


#### Create a dataframe containing 10 countries with the lowest GDP per capita, among the counties with population greater than 100 million.

In [30]:
pop_df=countries_df[countries_df.population>1e8]
pop_df.sort_values('gdp_per_capita', ascending=True).head(10)

Unnamed: 0,location,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita,gdp,overall_gdp
63,Ethiopia,Africa,114963600.0,66.6,0.3,1729.927,198878600000.0,198878600000.0
15,Bangladesh,Asia,164689400.0,72.59,0.8,3523.984,580362800000.0,580362800000.0
145,Pakistan,Asia,220892300.0,67.27,0.6,5034.708,1112128000000.0,1112128000000.0
141,Nigeria,Africa,206139600.0,54.69,,5338.454,1100467000000.0,1100467000000.0
90,India,Asia,1380004000.0,69.66,0.53,6426.674,8868838000000.0,8868838000000.0
151,Philippines,Asia,109581100.0,71.23,1.0,7599.188,832727300000.0,832727300000.0
58,Egypt,Africa,102334400.0,71.99,1.6,10550.206,1079649000000.0,1079649000000.0
91,Indonesia,Asia,273523600.0,71.72,1.04,11188.744,3060386000000.0,3060386000000.0
27,Brazil,South America,212559400.0,75.88,2.2,14103.452,2997821000000.0,2997821000000.0
41,China,Asia,1439324000.0,76.91,4.34,15308.712,22034190000000.0,22034190000000.0


#### Create a data frame that counts the number countries in each continent?

In [35]:
country_counts_df = countries_df.groupby('continent').location.count()
country_counts_df

continent
Africa           55
Asia             47
Europe           51
North America    36
Oceania           8
South America    13
Name: location, dtype: int64

#### Create a data frame showing the total population of each continent.

In [36]:
continent_populations_df = countries_df.groupby('continent').population.sum()
continent_populations_df

continent
Africa           1.339424e+09
Asia             4.607388e+09
Europe           7.485062e+08
North America    5.912425e+08
Oceania          4.095832e+07
South America    4.304611e+08
Name: population, dtype: float64

In [41]:
covid_data_df=pd.read_csv('covid-countries-data.csv')
covid_data_df

Unnamed: 0,location,total_cases,total_deaths,total_tests
0,Afghanistan,38243.0,1409.0,
1,Albania,9728.0,296.0,
2,Algeria,45158.0,1525.0,
3,Andorra,1199.0,53.0,
4,Angola,2729.0,109.0,
...,...,...,...,...
207,Western Sahara,766.0,1.0,
208,World,26059065.0,863535.0,
209,Yemen,1976.0,571.0,
210,Zambia,12415.0,292.0,


#### Count the number of countries for which the `total_tests` data is missing.

In [43]:
total_tests_missing = covid_data_df.total_tests.isna()Merge `countries_df` with `covid_data_df` on the `location` column..sum()
print("The data for total tests is missing for {} countries.".format(int(total_tests_missing)))

The data for total tests is missing for 122 countries.


#### Merge `countries_df` with `covid_data_df` on the `location` column.

In [44]:
combined_df = covid_data_df.merge(countries_df,on='location')
combined_df

Unnamed: 0,location,total_cases,total_deaths,total_tests,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita,gdp,overall_gdp
0,Afghanistan,38243.0,1409.0,,Asia,38928341.0,64.83,0.50,1803.987,7.022622e+10,7.022622e+10
1,Albania,9728.0,296.0,,Europe,2877800.0,78.57,2.89,11803.431,3.396791e+10,3.396791e+10
2,Algeria,45158.0,1525.0,,Africa,43851043.0,76.88,1.90,13913.839,6.101364e+11,6.101364e+11
3,Andorra,1199.0,53.0,,Europe,77265.0,83.73,,,,
4,Angola,2729.0,109.0,,Africa,32866268.0,61.15,,5819.495,1.912651e+11,1.912651e+11
...,...,...,...,...,...,...,...,...,...,...,...
205,Vietnam,1046.0,35.0,261004.0,Asia,97338583.0,75.40,2.60,6171.884,6.007624e+11,6.007624e+11
206,Western Sahara,766.0,1.0,,Africa,597330.0,70.26,,,,
207,Yemen,1976.0,571.0,,Asia,29825968.0,66.12,0.70,1479.147,4.411699e+10,4.411699e+10
208,Zambia,12415.0,292.0,,Africa,18383956.0,63.89,2.00,3689.251,6.782303e+10,6.782303e+10


#### Add columns `tests_per_million`, `cases_per_million` and `deaths_per_million` into `combined_df`.

In [45]:
combined_df['tests_per_million'] = combined_df['total_tests'] * 1e6 / combined_df['population']
combined_df['cases_per_million'] = combined_df['total_cases'] * 1e6 / combined_df['population']
combined_df['deaths_per_million'] = combined_df['total_deaths'] * 1e6 / combined_df['population']
combined_df

Unnamed: 0,location,total_cases,total_deaths,total_tests,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita,gdp,overall_gdp,tests_per_million,cases_per_million,deaths_per_million
0,Afghanistan,38243.0,1409.0,,Asia,38928341.0,64.83,0.50,1803.987,7.022622e+10,7.022622e+10,,982.394806,36.194710
1,Albania,9728.0,296.0,,Europe,2877800.0,78.57,2.89,11803.431,3.396791e+10,3.396791e+10,,3380.359997,102.856349
2,Algeria,45158.0,1525.0,,Africa,43851043.0,76.88,1.90,13913.839,6.101364e+11,6.101364e+11,,1029.804468,34.776824
3,Andorra,1199.0,53.0,,Europe,77265.0,83.73,,,,,,15518.022390,685.950948
4,Angola,2729.0,109.0,,Africa,32866268.0,61.15,,5819.495,1.912651e+11,1.912651e+11,,83.033462,3.316470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,Vietnam,1046.0,35.0,261004.0,Asia,97338583.0,75.40,2.60,6171.884,6.007624e+11,6.007624e+11,2681.403324,10.745996,0.359570
206,Western Sahara,766.0,1.0,,Africa,597330.0,70.26,,,,,,1282.373228,1.674116
207,Yemen,1976.0,571.0,,Asia,29825968.0,66.12,0.70,1479.147,4.411699e+10,4.411699e+10,,66.250993,19.144391
208,Zambia,12415.0,292.0,,Africa,18383956.0,63.89,2.00,3689.251,6.782303e+10,6.782303e+10,,675.317108,15.883415


#### Create a dataframe with 10 countires that have highest number of tests per million people.

In [68]:
highest_tests_df = combined_df.sort_values('tests_per_million',ascending=False).head(10)
highest_tests_df

Unnamed: 0,location,total_cases,total_deaths,total_tests,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita,gdp,overall_gdp,tests_per_million,cases_per_million,deaths_per_million
197,United Arab Emirates,71540.0,387.0,7177430.0,Asia,9890400.0,77.97,1.2,67293.483,665559500000.0,665559500000.0,725696.635121,7233.276713,39.128852
14,Bahrain,52440.0,190.0,1118837.0,Asia,1701583.0,77.29,2.0,43290.705,73662730000.0,73662730000.0,657527.137965,30818.36149,111.66073
115,Luxembourg,7928.0,124.0,385820.0,Europe,625976.0,82.25,4.51,94277.965,59015740000.0,59015740000.0,616349.508607,12665.022301,198.090662
122,Malta,1931.0,13.0,188539.0,Europe,441539.0,82.53,4.485,36513.323,16122060000.0,16122060000.0,427004.183096,4373.339614,29.442473
53,Denmark,17195.0,626.0,2447911.0,Europe,5792203.0,80.9,2.5,46682.515,270394600000.0,270394600000.0,422621.755488,2968.645954,108.076323
96,Israel,122539.0,969.0,2353984.0,Asia,8655541.0,82.97,2.99,33132.32,286778200000.0,286778200000.0,271962.665303,14157.289533,111.951408
89,Iceland,2121.0,10.0,88829.0,Europe,341250.0,82.99,2.91,46482.958,15862310000.0,15862310000.0,260304.761905,6215.384615,29.304029
157,Russia,1005000.0,17414.0,37176827.0,Europe,145934460.0,72.58,8.05,24765.954,3614206000000.0,3614206000000.0,254750.159763,6886.653091,119.327539
199,United States,6114406.0,185744.0,83898416.0,North America,331002647.0,78.86,2.77,54225.446,17948770000000.0,17948770000000.0,253467.507769,18472.377957,561.155633
10,Australia,25923.0,663.0,6255797.0,Oceania,25499881.0,83.44,3.84,44648.71,1138537000000.0,1138537000000.0,245326.517406,1016.592979,26.000121


#### Create a dataframe with 10 countires that have highest number of positive cases per million people.

In [71]:
highest_cases_df = combined_df.nlargest(10,'cases_per_million')
highest_cases_df

Unnamed: 0,location,total_cases,total_deaths,total_tests,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita,gdp,overall_gdp,tests_per_million,cases_per_million,deaths_per_million
155,Qatar,119206.0,199.0,634745.0,Asia,2881060.0,80.23,1.2,116935.6,336898500000.0,336898500000.0,220316.48074,41375.74365,69.0718
14,Bahrain,52440.0,190.0,1118837.0,Asia,1701583.0,77.29,2.0,43290.705,73662730000.0,73662730000.0,657527.137965,30818.36149,111.66073
147,Panama,94084.0,2030.0,336345.0,North America,4314768.0,78.51,2.3,22267.037,96077100000.0,96077100000.0,77952.04748,21805.112117,470.477208
40,Chile,414739.0,11344.0,2458762.0,South America,19116209.0,80.18,2.11,22767.037,435219400000.0,435219400000.0,128621.841287,21695.671982,593.4231
162,San Marino,735.0,42.0,,Europe,33938.0,84.97,3.8,56861.47,1929765000.0,1929765000.0,,21657.13949,1237.550828
9,Aruba,2211.0,12.0,,North America,106766.0,76.29,,35973.781,3840777000.0,3840777000.0,,20708.839893,112.395332
105,Kuwait,86478.0,535.0,621616.0,Asia,4270563.0,75.49,2.0,65530.537,279852300000.0,279852300000.0,145558.325682,20249.789079,125.276222
150,Peru,663437.0,29259.0,584232.0,South America,32971846.0,76.74,1.6,12236.706,403466800000.0,403466800000.0,17719.117092,20121.318048,887.393445
27,Brazil,3997865.0,123780.0,4797948.0,South America,212559409.0,75.88,2.2,14103.452,2997821000000.0,2997821000000.0,22572.268255,18808.224105,582.331314
199,United States,6114406.0,185744.0,83898416.0,North America,331002647.0,78.86,2.77,54225.446,17948770000000.0,17948770000000.0,253467.507769,18472.377957,561.155633


#### Create a dataframe with 10 countires that have highest number of deaths cases per million people?

In [67]:
highest_deaths_df = combined_df.sort_values('total_deaths',ascending=False).head(10)
print(highest_deaths_df)

           location  total_cases  total_deaths  total_tests      continent  \
199   United States    6114406.0      185744.0   83898416.0  North America   
27           Brazil    3997865.0      123780.0    4797948.0  South America   
90            India    3853406.0       67376.0   44337201.0           Asia   
125          Mexico     610957.0       65816.0    1271295.0  North America   
198  United Kingdom     338676.0       41514.0   13447568.0         Europe   
97            Italy     271515.0       35497.0    5214766.0         Europe   
68           France     293024.0       30686.0          NaN         Europe   
150            Peru     663437.0       29259.0     584232.0  South America   
177           Spain     479554.0       29194.0    6416533.0         Europe   
92             Iran     378752.0       21797.0    3256122.0           Asia   

       population  life_expectancy  hospital_beds_per_thousand  \
199  3.310026e+08            78.86                        2.77   
27   2.12

#### Count number of countries that feature in both the lists of "highest number of tests per million" and "highest number of cases per million".

In [58]:
np.intersect1d(highest_cases_df.location,highest_tests_df.location)

array(['Bahrain', 'United States'], dtype=object)