In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
url = "https://www.alessandrobramucci.com/gapminder.csv"

df = pd.read_csv(url)

df.to_csv("data/gapminder.csv", sep = ";", decimal = ".")

In [4]:
df.tail()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [5]:
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [7]:
df.isna().sum()

country      0
continent    0
year         0
lifeExp      0
pop          0
gdpPercap    0
dtype: int64

In [8]:
df.shape

(1704, 6)

In [14]:
#Liste der Länder
country_list = df['country'].unique()
country_list


array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
       'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium',
       'Benin', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Czech Republic',
       'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Ethiopia',
       'Finland', 'France', 'Gabon', 'Gambia', 'Germany', 'Ghana',
       'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Haiti',
       'Honduras', 'Hong Kong, China', 'Hungary', 'Iceland', 'India',
       'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy',
       'Jamaica', 'Japan', 'Jordan', 'Kenya', 'Korea, Dem. Rep.',
       'Korea, Rep.', 'Kuwait', 'Leba

In [15]:
len(country_list)

142

In [11]:
#Wie viele Länder
country_count = df['country'].nunique()
country_count

142

In [17]:
#Wie viele Jahre pro Land
years_count = df.groupby('country')['year'].count()
years_count

country
Afghanistan           12
Albania               12
Algeria               12
Angola                12
Argentina             12
                      ..
Vietnam               12
West Bank and Gaza    12
Yemen, Rep.           12
Zambia                12
Zimbabwe              12
Name: year, Length: 142, dtype: int64

In [18]:
sum(years_count == 12)


142

In [26]:
#Wie viele Länder pro Kontinent (nur 2007)
countries_bycont_2007 = df[df.year == 2007].groupby(['continent'])['country'].count()
countries_bycont_2007

continent
Africa      52
Americas    25
Asia        33
Europe      30
Oceania      2
Name: country, dtype: int64

In [27]:
#Germany filter
df_germany = df[df.country == "Germany"].select_dtypes(include='number')

df_germany.to_csv("data/germany_gapminder.csv", 
                  sep = ",", 
                  decimal = ".", 
                  index = False)

In [28]:
df_germany.to_csv('data/Germany_gapminder.csv', sep = ',', decimal = ',')

In [31]:
#Durchschnittswerte pro Land über die Zeit
df_mean = df.groupby('country')[['lifeExp','pop','gdpPercap']].mean()
df_mean

Unnamed: 0_level_0,lifeExp,pop,gdpPercap
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,37.478833,1.582372e+07,802.674598
Albania,68.432917,2.580249e+06,3255.366633
Algeria,59.030167,1.987541e+07,4426.025973
Angola,37.883500,7.309390e+06,3607.100529
Argentina,69.060417,2.860224e+07,8955.553783
...,...,...,...
Vietnam,57.479500,5.456857e+07,1017.712615
West Bank and Gaza,60.328667,1.848606e+06,3759.996781
"Yemen, Rep.",46.780417,1.084319e+07,1569.274672
Zambia,45.996333,6.353805e+06,1358.199409


In [36]:
df_mean = df.groupby('continent')[['lifeExp', 'pop', 'gdpPercap']].mean()
df_mean['Variable'] = "Mean"
df_mean


Unnamed: 0_level_0,lifeExp,pop,gdpPercap,Variable
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,48.86533,9916003.0,2193.754578,Mean
Americas,64.658737,24504790.0,7136.110356,Mean
Asia,60.064903,77038720.0,7902.150428,Mean
Europe,71.903686,17169760.0,14469.475533,Mean
Oceania,74.326208,8874672.0,18621.609223,Mean


In [37]:
df_median = df.groupby('continent')[['lifeExp', 'pop', 'gdpPercap']].median()
df_median['Variable'] = "Median"
df_median

Unnamed: 0_level_0,lifeExp,pop,gdpPercap,Variable
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,47.792,4579311.0,1192.138217,Median
Americas,67.048,6227510.0,5465.509853,Median
Asia,61.7915,14530830.5,2646.786844,Median
Europe,72.241,8551125.0,12081.749115,Median
Oceania,73.665,6403491.5,17983.303955,Median


In [38]:
df_min = df.groupby('continent')[['lifeExp', 'pop', 'gdpPercap']].min()
df_min['Variable'] = "Min"
df_min

Unnamed: 0_level_0,lifeExp,pop,gdpPercap,Variable
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,23.599,60011,241.165876,Min
Americas,37.579,662850,1201.637154,Min
Asia,28.801,120447,331.0,Min
Europe,43.585,147962,973.533195,Min
Oceania,69.12,1994794,10039.59564,Min


In [39]:
df_max = df.groupby('continent')[['lifeExp', 'pop', 'gdpPercap']].max()
df_max['Variable'] = "Max"
df_max

Unnamed: 0_level_0,lifeExp,pop,gdpPercap,Variable
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,76.442,135031164,21951.21176,Max
Americas,80.653,301139947,42951.65309,Max
Asia,82.603,1318683096,113523.1329,Max
Europe,81.757,82400996,49357.19017,Max
Oceania,81.235,20434176,34435.36744,Max


In [42]:
df_summary = pd.concat([df_mean, df_median, df_min, df_max], axis=0).sort_index().reset_index()
df_summary

Unnamed: 0,continent,lifeExp,pop,gdpPercap,Variable
0,Africa,48.86533,9916003.0,2193.754578,Mean
1,Africa,47.792,4579311.0,1192.138217,Median
2,Africa,23.599,60011.0,241.165876,Min
3,Africa,76.442,135031200.0,21951.21176,Max
4,Americas,64.658737,24504790.0,7136.110356,Mean
5,Americas,67.048,6227510.0,5465.509853,Median
6,Americas,37.579,662850.0,1201.637154,Min
7,Americas,80.653,301139900.0,42951.65309,Max
8,Asia,60.064903,77038720.0,7902.150428,Mean
9,Asia,82.603,1318683000.0,113523.1329,Max
