In [1]:
import pandas as pd

In [14]:
data = pd.read_csv("../data/footprint.CSV", encoding='latin-1')
data.head()

Unnamed: 0,Country,Data Quality,SDGi,Life Exectancy,HDI,Per Capita GDP,Region,Income Group,Population (millions),Number of Earths required
0,Afghanistan,3A,52.5,62,0.48,,Middle East/Central Asia,LI,40.8,0.5
1,Albania,3A,71.6,76,0.8,"$14,889",Other Europe,UM,2.9,1.4
2,Algeria,3A,71.5,76,0.75,"$11,137",Africa,UM,45.4,1.5
3,Angola,3A,50.9,62,0.59,"$6,304",Africa,LM,35.0,0.6
4,Antigua and Barbuda,2B,,78,0.79,"$18,749",Central America/Caribbean,HI,0.1,1.9


In [15]:
data.columns

Index(['Country', 'Data Quality', 'SDGi', 'Life Exectancy', 'HDI',
       'Per Capita GDP', 'Region', 'Income Group', 'Population (millions)',
       'Number of Earths required'],
      dtype='object')

# Remove $

In [16]:
# Remove the dollar signs and convert the column to float
# Remove the dollar signs, commas, and spaces, and convert to numeric
data["Per Capita GDP"] = data["Per Capita GDP"].str.replace('$', '', regex=False) \
                         .str.replace(',', '', regex=False) \
                         .str.strip()

In [17]:
data[:2]

Unnamed: 0,Country,Data Quality,SDGi,Life Exectancy,HDI,Per Capita GDP,Region,Income Group,Population (millions),Number of Earths required
0,Afghanistan,3A,52.5,62,0.48,,Middle East/Central Asia,LI,40.8,0.5
1,Albania,3A,71.6,76,0.8,14889.0,Other Europe,UM,2.9,1.4


# Data types

In [18]:
data.dtypes

Country                       object
Data Quality                  object
SDGi                          object
Life Exectancy                object
HDI                           object
Per Capita GDP                object
Region                        object
Income Group                  object
Population (millions)         object
Number of Earths required    float64
dtype: object

In [19]:
data[["SDGi", "Life Exectancy", "HDI", "Per Capita GDP", "Population (millions)", "Number of Earths required" ]] = data[["SDGi", "Life Exectancy", "HDI", "Per Capita GDP", "Population (millions)", "Number of Earths required" ]].apply(pd.to_numeric, errors='coerce')

In [20]:
data[["Data Quality", "Region", "Income Group"]] = data[["Data Quality", "Region", "Income Group"]].astype('category')
data[["Country"]] = data[["Country"]].astype('str')

In [21]:
data.dtypes

Country                        object
Data Quality                 category
SDGi                          float64
Life Exectancy                float64
HDI                           float64
Per Capita GDP                float64
Region                       category
Income Group                 category
Population (millions)         float64
Number of Earths required     float64
dtype: object

# Save csv

In [22]:
data.to_csv("footprint_clean.csv", encoding='utf-8')

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   Country                    182 non-null    object  
 1   Data Quality               182 non-null    category
 2   SDGi                       158 non-null    float64 
 3   Life Exectancy             175 non-null    float64 
 4   HDI                        171 non-null    float64 
 5   Per Capita GDP             163 non-null    float64 
 6   Region                     182 non-null    category
 7   Income Group               178 non-null    category
 8   Population (millions)      180 non-null    float64 
 9   Number of Earths required  181 non-null    float64 
dtypes: category(3), float64(6), object(1)
memory usage: 11.5+ KB


# Regions ?

In [24]:
data.loc[(data['Region'] == 'Europe')]
data.Region.unique()

['Middle East/Central Asia', 'Other Europe', 'Africa', 'Central America/Caribbean', 'South America', 'Asia-Pacific', 'EU-27', 'North America']
Categories (8, object): ['Africa', 'Asia-Pacific', 'Central America/Caribbean', 'EU-27', 'Middle East/Central Asia', 'North America', 'Other Europe', 'South America']

In [25]:
data.groupby('Region', observed=False).size()

Region
Africa                       53
Asia-Pacific                 30
Central America/Caribbean    20
EU-27                        26
Middle East/Central Asia     23
North America                 4
Other Europe                 13
South America                13
dtype: int64

# Countries with missing data

Unnamed: 0,Country,Data Quality,SDGi,Life Exectancy,HDI,Per Capita GDP,Region,Income Group,Population (millions),Number of Earths required
89,"Korea, Democratic People's Republic of",3A,,73.0,,,Asia-Pacific,LI,26.0,0.8


In [53]:
data.loc[data.isnull().any(axis=1)]
data.loc[data.Country == "Korea, Democratic People's Republic of"]
list(data.loc[(data['HDI'].isnull())]['Country'])
data.loc[(data['HDI'].isnull())][['Country', 'Number of Earths required']].sort_values('Number of Earths required')

Unnamed: 0,Country,Number of Earths required
40,Côte d'Ivoire,0.6
150,Somalia,0.6
181,Zimbabwe,0.7
89,"Korea, Democratic People's Republic of",0.8
180,Zambia,0.8
60,French Guiana,1.0
105,Martinique,2.3
69,Guadeloupe,2.3
134,Réunion,2.3
61,French Polynesia,2.5


In [56]:
# How many are just sustainable
data.loc[(data['Number of Earths required'] <= 1)]["Population (millions)"].sum()

# Just sustainable people as %

# # of countries that are sustainable
data.loc[(data['Number of Earths required']<=1)].count()
data.loc[(data['Number of Earths required'] <= 1)]["Population (millions)"].sum() / data['Population (millions)'].sum()

np.float64(0.40393569183015626)

# Who's sustainable and developed

In [149]:
# suatainable and developed countries
data.loc[(data['HDI'] > 0.7) & (data['Number of Earths required'] <= 1)]

# In population  numbers
data.loc[(data['HDI'] > 0.7) & (data['Number of Earths required'] <= 1)]['Population (millions)'].sum()

# Suatainable and developed people as percentage
data.loc[(data['HDI'] > 0.7) & (data['Number of Earths required'] <= 1)]['Population (millions)'].sum() / data['Population (millions)'].sum()

np.float64(0.026363961956305457)

In [153]:
# Unsustainable world population %
data.loc[(data['Number of Earths required'] > 1)]["Population (millions)"].sum() / data['Population (millions)'].sum()

np.float64(0.5960444108400651)

In [57]:
# Sustainable countries sorted by HDI
data.loc[(data['Number of Earths required'] <= 1)].sort_values("HDI", ascending=False)[:5]

Unnamed: 0,Country,Data Quality,SDGi,Life Exectancy,HDI,Per Capita GDP,Region,Income Group,Population (millions),Number of Earths required
132,Republic of Moldova,2B,73.9,69.0,0.77,14320.0,Other Europe,LM,4.0,1.0
50,Egypt,2B,68.7,70.0,0.73,12786.0,Africa,LM,106.2,1.0
166,Tunisia,2A,70.7,74.0,0.73,10535.0,Africa,UM,12.0,0.9
86,Jordan,2A,69.4,74.0,0.72,10159.0,Middle East/Central Asia,UM,10.3,0.9
128,Philippines,3A,66.6,69.0,0.7,8768.0,Asia-Pacific,LM,112.5,0.8


In [159]:
data.loc[(data['Number of Earths required'] <= 1)].groupby('Region', observed=False).size()

Region
Africa                       39
Asia-Pacific                 10
Central America/Caribbean     3
EU-27                         0
Middle East/Central Asia      6
North America                 0
Other Europe                  1
South America                 1
dtype: int64

In [62]:
data.loc[(data['Number of Earths required'] <= 1)].groupby('Income Group', observed=False).size()

Income Group
HI     0
LI    34
LM    23
UM     2
dtype: int64

In [55]:
data.loc[(data['HDI'] > 0.7) & (data['Number of Earths required'] <= 1)]['Country']

50                   Egypt
86                  Jordan
132    Republic of Moldova
166                Tunisia
Name: Country, dtype: object