In [21]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the country-level dataset from the CSV file and display it
# (the rendered frame is abbreviated: only the first and last rows are shown)
country = pd.read_csv('country_complete.csv')
country

Unnamed: 0,Country,Continent,Years,Internet access,Emissions range,Fertility,Emissions,Internet
0,Afghanistan,Asia,3.8,Low,Low,4.33,0.254,16.8
1,Albania,Europe,10.0,Moderate,Low,1.71,1.590,65.4
2,Algeria,Africa,8.0,Low,Moderate,2.64,3.690,49.0
3,Angola,Africa,5.1,Low,Low,5.55,1.120,29.0
4,Argentina,Americas,9.9,High,Moderate,2.26,4.410,77.7
...,...,...,...,...,...,...,...,...
146,Uruguay,Americas,8.7,High,Moderate,1.97,2.010,80.7
147,Uzbekistan,Asia,11.5,Moderate,Moderate,2.23,2.810,55.2
148,Vietnam,Asia,8.2,Moderate,Moderate,1.95,2.160,69.8
149,Zambia,Africa,7.0,Low,Low,4.87,0.302,14.3


In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
country.describe()

Unnamed: 0,Years,Fertility,Emissions,Internet
count,151.0,151.0,151.0,151.0
mean,8.616556,2.63755,4.65639,58.23245
std,3.165426,1.230491,5.625596,27.190164
min,1.5,1.24,0.0467,4.1
25%,6.25,1.72,0.7945,34.45
50%,9.1,2.12,2.64,64.1
75%,11.3,3.325,6.425,80.9
max,14.2,5.88,38.0,99.7


In [23]:
# List the distinct access levels, then sort the rows by that column.
# The column still holds plain strings at this point, so the sort is
# alphabetical ('High' first, 'Very high' last) rather than low-to-high.
access_levels = country['Internet access'].unique()
print(access_levels)
country.sort_values('Internet access')

['Low' 'Moderate' 'High' 'Very high']


Unnamed: 0,Country,Continent,Years,Internet access,Emissions range,Fertility,Emissions,Internet
37,Cyprus,Asia,12.1,High,High,1.34,6.30,84.4
52,Germany,Europe,14.1,High,High,1.48,9.14,87.0
81,Lithuania,Europe,13.0,High,Moderate,1.67,4.84,79.7
30,Chile,Americas,10.3,High,Moderate,1.76,4.59,84.9
116,Russia,Europe,12.0,High,Very high,1.76,11.70,80.9
...,...,...,...,...,...,...,...,...
127,South Korea,Asia,12.1,Very high,Very high,1.33,12.90,96.0
114,Qatar,Asia,9.8,Very high,Very high,1.87,38.00,99.7
62,Iceland,Europe,12.4,Very high,Very high,1.91,10.80,99.0
104,Norway,Europe,14.2,Very high,High,1.83,8.31,96.5


In [24]:
# Independent copy of the frame taken before the 'Internet access' column is
# converted to a categorical dtype further down.
# NOTE(review): `country1` is not referenced again in the visible notebook —
# presumably kept as an untouched backup; confirm or remove.
country1 = country.copy()

### <span style = "color:darkred"> groupby() operations </span>

In [26]:
# Display the mean of each numeric indicator for each continent.
# The numeric columns are selected explicitly so that the text columns
# (Country, Internet access, Emissions range) stay out of the aggregation.
numeric_cols = ['Years', 'Fertility', 'Emissions', 'Internet']

country.groupby('Continent')[numeric_cols].mean()

Unnamed: 0_level_0,Years,Fertility,Emissions,Internet
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,5.408889,4.090889,1.127953,31.057778
Americas,9.026923,2.161154,3.309308,62.411538
Asia,8.8675,2.244,7.43135,63.8825
Europe,11.783333,1.621944,6.716944,81.738889
Oceania,11.025,2.46,6.81275,68.725


In [27]:
# Number of countries per continent.
# count() reports the non-null entries of every column; all columns agree
# within a continent, so each value is the number of countries there.

country.groupby('Continent').count()

Unnamed: 0_level_0,Country,Years,Internet access,Emissions range,Fertility,Emissions,Internet
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Africa,45,45,45,45,45,45,45
Americas,26,26,26,26,26,26,26
Asia,40,40,40,40,40,40,40
Europe,36,36,36,36,36,36,36
Oceania,4,4,4,4,4,4,4


In [28]:
# Mean of the numeric 'Internet' column within each 'Internet access' level.
# The levels are listed alphabetically (High, Low, Moderate, Very high).
country.groupby('Internet access')[['Internet']].mean()

Unnamed: 0_level_0,Internet
Internet access,Unnamed: 1_level_1
High,80.202128
Low,27.584211
Moderate,61.948387
Very high,95.68125


### Change the datatype of the "Internet access" column from **string** to **category**.

In [30]:
# Convert 'Internet access' from plain strings to a categorical dtype and
# print the dtype before and after the conversion.
# A categorical column is more memory efficient because each distinct label
# is stored only once.
access = country['Internet access']
print(access.dtype)
country['Internet access'] = access.astype('category')
print(country['Internet access'].dtype)

object
category


In [31]:
# Preview the first five rows: the displayed values look the same after the
# dtype conversion, only the column's storage type changed.
country.head()

Unnamed: 0,Country,Continent,Years,Internet access,Emissions range,Fertility,Emissions,Internet
0,Afghanistan,Asia,3.8,Low,Low,4.33,0.254,16.8
1,Albania,Europe,10.0,Moderate,Low,1.71,1.59,65.4
2,Algeria,Africa,8.0,Low,Moderate,2.64,3.69,49.0
3,Angola,Africa,5.1,Low,Low,5.55,1.12,29.0
4,Argentina,Americas,9.9,High,Moderate,2.26,4.41,77.7


### <span style = "color:darkred">pivot_table() </span>
Rows are grouped by `Continent` and columns are grouped by `Internet access`.

In [33]:
# Count the countries in each Continent x 'Internet access' cell.
# aggfunc='count' tallies the non-null 'Country' entries per cell.
# The access levels appear in alphabetical order here because the
# categories have not been given a custom order yet.
# observed=False is spelled out because 'Internet access' is categorical:
# it keeps combinations with no countries in the table (shown as 0) and
# avoids the FutureWarning pandas emits when the argument is left implicit.
# No rounding is applied: the counts are whole numbers.
country.pivot_table(
    values='Country',
    index='Continent',
    columns='Internet access',
    aggfunc='count',
    observed=False,
)


  country.pivot_table(


Internet access,High,Low,Moderate,Very high
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,1,36,8,0
Americas,8,7,10,1
Asia,10,13,9,8
Europe,26,0,3,7
Oceania,2,1,1,0


### Research question: 
**Does internet accessibility impact the average years of schooling differently across continents?**

Since this research question involves a comparison across continents and across internet access levels, we can create a pivot table that groups rows by `Continent` and columns by `Internet access`.

In [35]:
# Mean years of schooling for each continent and internet access category,
# rounded to two decimals. Combinations with no countries show up as NaN.
# observed=False is spelled out because 'Internet access' is categorical:
# it retains the current behaviour and avoids the FutureWarning pandas
# emits when the argument is left implicit.
country.pivot_table(
    values='Years',
    index='Continent',
    columns='Internet access',
    aggfunc='mean',
    observed=False,
).round(2)


  country.pivot_table(


Internet access,High,Low,Moderate,Very high
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,9.5,4.84,7.48,
Americas,9.74,7.33,9.22,13.3
Asia,10.53,6.35,9.57,10.1
Europe,11.59,,11.03,12.83
Oceania,12.7,7.9,10.8,


#### <span style = "color:red">Note that the "Internet access" categories are sorted alphabetically, not by meaning. </span>

We can use `cat.reorder_categories` to improve the output.

In [37]:
# cat.reorder_categories rearranges the category order
# (here: low to high instead of alphabetical)
country['Internet access'] = country['Internet access'].cat.reorder_categories(
    ['Low', 'Moderate', 'High', 'Very high']
)
# Display the mean years of schooling in a pivot table of continent and
# internet access; the columns now follow the custom category order.
# observed=False is spelled out because 'Internet access' is categorical:
# it retains the current behaviour and avoids the FutureWarning pandas
# emits when the argument is left implicit.
country.pivot_table(
    values='Years',
    index='Continent',
    columns='Internet access',
    aggfunc='mean',
    observed=False,
)

  country.pivot_table(


Internet access,Low,Moderate,High,Very high
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,4.836111,7.475,9.5,
Americas,7.328571,9.22,9.7375,13.3
Asia,6.346154,9.566667,10.53,10.1
Europe,,11.033333,11.588462,12.828571
Oceania,7.9,10.8,12.7,


### `cat.reorder_categories` is useful for rearranging the order of a column's categories

In [39]:
# cat.reorder_categories rearranges the category order
# (here: low to high instead of alphabetical).
# Re-applying the same order leaves the column unchanged, so this cell can
# be re-run safely.
country['Internet access'] = country['Internet access'].cat.reorder_categories(
    ['Low', 'Moderate', 'High', 'Very high']
)
# Mean of 'Internet' per access level; the groups follow the custom order.
# observed=False is spelled out because the grouping column is categorical:
# it retains the current behaviour and avoids the FutureWarning pandas
# emits when the argument is left implicit.
country.groupby('Internet access', observed=False)[['Internet']].mean()

  country.groupby('Internet access')[['Internet']].mean()


Unnamed: 0_level_0,Internet
Internet access,Unnamed: 1_level_1
Low,27.584211
Moderate,61.948387
High,80.202128
Very high,95.68125


### What Changes After Reordering?
Before reordering, categories are stored in the default order (usually alphabetical).

After reordering, categories follow the new custom order specified (['Low', 'Moderate', 'High', 'Very high']).
This affects operations like:
1) Sorting (sort_values) → Now sorts 'Internet access' in the new specified order instead of alphabetical.
2) Pivot Tables & Groupby → Groups will appear in the new order.
3) Plots (sns.barplot(), df.plot()) → Bars will follow the new category order.

In [41]:
# Preview the first five rows: reordering the categories does not change
# the values shown in the rows.
country.head()

Unnamed: 0,Country,Continent,Years,Internet access,Emissions range,Fertility,Emissions,Internet
0,Afghanistan,Asia,3.8,Low,Low,4.33,0.254,16.8
1,Albania,Europe,10.0,Moderate,Low,1.71,1.59,65.4
2,Algeria,Africa,8.0,Low,Moderate,2.64,3.69,49.0
3,Angola,Africa,5.1,Low,Low,5.55,1.12,29.0
4,Argentina,Americas,9.9,High,Moderate,2.26,4.41,77.7


In [42]:
# 1) Sorting: the column is now categorical with a custom order, so
# sort_values arranges the rows Low -> Moderate -> High -> Very high
# instead of alphabetically.
access_levels = country['Internet access'].unique()
print(access_levels)
country.sort_values('Internet access')

['Low', 'Moderate', 'High', 'Very high']
Categories (4, object): ['Low', 'Moderate', 'High', 'Very high']


Unnamed: 0,Country,Continent,Years,Internet access,Emissions range,Fertility,Emissions,Internet
0,Afghanistan,Asia,3.8,Low,Low,4.33,0.2540,16.8
84,Malawi,Africa,4.5,Low,Low,4.44,0.0762,8.0
83,Madagascar,Africa,6.1,Low,Low,4.08,0.1630,15.0
80,Liberia,Africa,4.7,Low,Low,4.45,0.3240,18.9
79,Lesotho,Africa,6.3,Low,Low,3.00,1.2600,40.8
...,...,...,...,...,...,...,...,...
82,Luxembourg,Europe,12.1,Very high,Very high,1.59,15.9000,97.1
26,Canada,Americas,13.3,Very high,Very high,1.56,15.3000,94.6
127,South Korea,Asia,12.1,Very high,Very high,1.33,12.9000,96.0
114,Qatar,Asia,9.8,Very high,Very high,1.87,38.0000,99.7


In [43]:
# 2) Pivot tables: columns follow the custom category order.
# cat.reorder_categories rearranges the category order
# (here: low to high instead of alphabetical); re-applying the same order
# leaves the column unchanged.
country['Internet access'] = country['Internet access'].cat.reorder_categories(
    ['Low', 'Moderate', 'High', 'Very high']
)
# Display the mean years of schooling in a pivot table of continent and
# internet access.
# observed=False is spelled out because 'Internet access' is categorical:
# it retains the current behaviour and avoids the FutureWarning pandas
# emits when the argument is left implicit.
country.pivot_table(
    values='Years',
    index='Continent',
    columns='Internet access',
    aggfunc='mean',
    observed=False,
)

  country.pivot_table(


Internet access,Low,Moderate,High,Very high
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,4.836111,7.475,9.5,
Americas,7.328571,9.22,9.7375,13.3
Asia,6.346154,9.566667,10.53,10.1
Europe,,11.033333,11.588462,12.828571
Oceania,7.9,10.8,12.7,


## <span style="color:orangered"> Practice </span>
- Create a pivot table that shows the mean fertility values for each combination of `Continent` and `Internet access` level. Set margins to show the row and column mean values.
- What insights can be obtained from the results?
 