In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as ps 

In [2]:
url = "https://www.alessandrobramucci.com/gapminder.csv"

df = pd.read_csv(url)

df.to_csv("gapminder.csv", sep = ";", decimal = ".")

In [3]:
df.tail()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [4]:
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [5]:
df.isna().sum()

country      0
continent    0
year         0
lifeExp      0
pop          0
gdpPercap    0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [7]:
df.shape

(1704, 6)

In [8]:
#Liste der Länder 
country_list = df['country'].unique()
country_list

array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
       'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium',
       'Benin', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Czech Republic',
       'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Ethiopia',
       'Finland', 'France', 'Gabon', 'Gambia', 'Germany', 'Ghana',
       'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Haiti',
       'Honduras', 'Hong Kong, China', 'Hungary', 'Iceland', 'India',
       'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy',
       'Jamaica', 'Japan', 'Jordan', 'Kenya', 'Korea, Dem. Rep.',
       'Korea, Rep.', 'Kuwait', 'Leba

In [9]:
#Wie viele Länder?
country_count = df['country'].nunique()
country_count

142

In [10]:
years_count = df.groupby('country')['year'].count()
years_count

country
Afghanistan           12
Albania               12
Algeria               12
Angola                12
Argentina             12
                      ..
Vietnam               12
West Bank and Gaza    12
Yemen, Rep.           12
Zambia                12
Zimbabwe              12
Name: year, Length: 142, dtype: int64

In [11]:
sum(years_count == 12)

142

In [12]:
countries_bycont_count2007 = df[df.year == 2007].groupby(['continent', 'year'])['country'].count()
countries_bycont_count2007

continent  year
Africa     2007    52
Americas   2007    25
Asia       2007    33
Europe     2007    30
Oceania    2007     2
Name: country, dtype: int64

In [13]:
df_germany = df[df.country == "Germany"].select_dtypes(include='number')

df_germany.to_csv("germany_gapminder.csv", 
                  sep = ",", 
                  decimal = ".", 
                  index = False)

In [14]:
df.groupby('country')[['lifeExp', 'pop', 'gdpPercap']].mean()

Unnamed: 0_level_0,lifeExp,pop,gdpPercap
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,37.478833,1.582372e+07,802.674598
Albania,68.432917,2.580249e+06,3255.366633
Algeria,59.030167,1.987541e+07,4426.025973
Angola,37.883500,7.309390e+06,3607.100529
Argentina,69.060417,2.860224e+07,8955.553783
...,...,...,...
Vietnam,57.479500,5.456857e+07,1017.712615
West Bank and Gaza,60.328667,1.848606e+06,3759.996781
"Yemen, Rep.",46.780417,1.084319e+07,1569.274672
Zambia,45.996333,6.353805e+06,1358.199409


In [15]:
df_mean = df.groupby('continent')[['lifeExp', 'pop', 'gdpPercap']].mean()
df_mean['Variable'] = "Mean"
df_median = df.groupby('continent')[['lifeExp', 'pop', 'gdpPercap']].median()
df_median['Variable'] = "Median"
df_min = df.groupby('continent')[['lifeExp', 'pop', 'gdpPercap']].min()
df_min['Variable'] = "Min"
df_max = df.groupby('continent')[['lifeExp', 'pop', 'gdpPercap']].max()
df_max['Variable'] = "Max"

In [16]:
df_summary = pd.concat([df_mean, df_median, df_min, df_max], axis=0).sort_index().reset_index()


In [17]:
df_summary

Unnamed: 0,continent,lifeExp,pop,gdpPercap,Variable
0,Africa,48.86533,9916003.0,2193.754578,Mean
1,Africa,47.792,4579311.0,1192.138217,Median
2,Africa,23.599,60011.0,241.165876,Min
3,Africa,76.442,135031200.0,21951.21176,Max
4,Americas,64.658737,24504790.0,7136.110356,Mean
5,Americas,67.048,6227510.0,5465.509853,Median
6,Americas,37.579,662850.0,1201.637154,Min
7,Americas,80.653,301139900.0,42951.65309,Max
8,Asia,60.064903,77038720.0,7902.150428,Mean
9,Asia,82.603,1318683000.0,113523.1329,Max


In [18]:
df_p1 = pd.DataFrame({
    'Minimum': df.groupby('year')['lifeExp'].min(),
    'Maximum': df.groupby('year')['lifeExp'].max(),
    'Median': df.groupby('year')['lifeExp'].median()
}).reset_index()

In [19]:
df_p1

Unnamed: 0,year,Minimum,Maximum,Median
0,1952,28.801,72.67,45.1355
1,1957,30.332,73.47,48.3605
2,1962,31.997,73.68,50.881
3,1967,34.02,74.16,53.825
4,1972,35.4,74.72,56.53
5,1977,31.22,76.11,59.672
6,1982,38.445,77.11,62.4415
7,1987,39.906,78.67,65.834
8,1992,23.599,79.36,67.703
9,1997,36.087,80.69,69.394


In [20]:
df_p2 = df[df.country == "China"][['year', 'lifeExp']].rename(
    columns = {'lifeExp': 'China'})

In [21]:
df_p2

Unnamed: 0,year,China
288,1952,44.0
289,1957,50.54896
290,1962,44.50136
291,1967,58.38112
292,1972,63.11888
293,1977,63.96736
294,1982,65.525
295,1987,67.274
296,1992,68.69
297,1997,70.426


In [22]:
df_graph = df_p1.merge(df_p2, how = 'inner', on = 'year')

In [23]:
df_graph_long = pd.melt(
    df_graph,
    id_vars = 'year',
    var_name = 'variable',
    value_name = 'value'
)

In [24]:
px.line(df_graph_long,
        x = 'year', 
        y = 'value', 
        color = "variable")

NameError: name 'px' is not defined