# Problem set 3: Loading and structuring data from Denmark Statistics

[<img src="https://mybinder.org/badge_logo.svg">](https://mybinder.org/v2/gh/NumEconCopenhagen/exercises-2020/master?urlpath=lab/tree/PS3/problem_set_3.ipynb)

In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import pandas as pd
import ipywidgets as widgets

# Tasks

## Create a pandas DataFrame

Modify the code below such that *income* and *consumption* are variables in the *dt* DataFrame.

In [35]:
np.random.seed(1999)
 
N = 100
mydata = {}
mydata['id'] = range(N)
income = np.exp(np.random.normal(size=N))
consumption = np.sqrt(income)
mydata['income'] = income
mydata['consumption'] = consumption

dt = pd.DataFrame(mydata)
dt.head()

Unnamed: 0,id,income,consumption
0,0,0.727981,0.853218
1,1,1.997831,1.413447
2,2,0.276823,0.52614
3,3,1.481931,1.217346
4,4,1.235904,1.111712


**Answer:** see A01.py

## Create new variable

1) Add a new variable *ratio* which is the ratio of consumption to income.

In [36]:
dt['ratio'] = mydata['consumption'] / mydata['income']
dt.head()

Unnamed: 0,id,income,consumption,ratio
0,0,0.727981,0.853218,1.172033
1,1,1.997831,1.413447,0.70749
2,2,0.276823,0.52614,1.900636
3,3,1.481931,1.217346,0.821459
4,4,1.235904,1.111712,0.899513


**Answer:** See A02.py

## Summary statistics

Produce summary statistics using `.describe()`.

In [37]:
dt.describe()

Unnamed: 0,id,income,consumption,ratio
count,100.0,100.0,100.0,100.0
mean,49.5,1.415547,1.087844,1.117517
std,29.011492,1.322203,0.484238,0.525452
min,0.0,0.108402,0.329245,0.351134
25%,24.75,0.529323,0.727545,0.75231
50%,49.5,0.981178,0.990533,1.00958
75%,74.25,1.768211,1.329572,1.374491
max,99.0,8.110612,2.847914,3.037255


**Answer:** See A03.py

## Indexing

Select everybody with an income above 1.

In [38]:
I = dt['income'] > 1
dt1 = dt.loc[I]
dt1.head()

Unnamed: 0,id,income,consumption,ratio
1,1,1.997831,1.413447,0.70749
3,3,1.481931,1.217346,0.821459
4,4,1.235904,1.111712,0.899513
6,6,2.574032,1.604379,0.623294
7,7,2.475478,1.573365,0.63558


**Answer:** See A04.py

Select everybody with an income *above* 1 and a ratio *above* 0.7.

In [39]:
I = ( dt['income'] > 1) & ( dt['ratio'] > 0.7)
dt2 = dt.loc[I]
dt2.head()

Unnamed: 0,id,income,consumption,ratio
1,1,1.997831,1.413447,0.70749
3,3,1.481931,1.217346,0.821459
4,4,1.235904,1.111712,0.899513
11,11,2.031708,1.42538,0.701567
18,18,1.280235,1.131475,0.883802


**Answer:** See A05.py

Set consumption equal to 0.5 if income is less than 0.5.

In [40]:
I = dt['income'] < 0.5
dt.loc[I, ['consumption']] = 0.5
dt['consumption'].mean()
dt

Unnamed: 0,id,income,consumption,ratio
0,0,0.727981,0.853218,1.172033
1,1,1.997831,1.413447,0.707490
2,2,0.276823,0.500000,1.900636
3,3,1.481931,1.217346,0.821459
4,4,1.235904,1.111712,0.899513
...,...,...,...,...
95,95,0.201856,0.500000,2.225764
96,96,2.368034,1.538842,0.649839
97,97,2.389874,1.545922,0.646863
98,98,1.488635,1.220096,0.819607


**Answer:**  See A06.py

Set consumption equal to income if income is less than 0.5.

In [41]:
I = dt['income'] < 0.5
#dt.loc[I, ['consumption']] = dt.loc[I, ['income']].values
dt.loc[I, ['consumption']] = dt['income']
dt['consumption'].mean()
dt

Unnamed: 0,id,income,consumption,ratio
0,0,0.727981,0.853218,1.172033
1,1,1.997831,1.413447,0.707490
2,2,0.276823,0.276823,1.900636
3,3,1.481931,1.217346,0.821459
4,4,1.235904,1.111712,0.899513
...,...,...,...,...
95,95,0.201856,0.201856,2.225764
96,96,2.368034,1.538842,0.649839
97,97,2.389874,1.545922,0.646863
98,98,1.488635,1.220096,0.819607


**Answer:** See A07.py

## Dropping

Drop the *ratio* variable and all rows with an income above 1.5. After this, also drop the first 5 rows.

In [48]:
dt_now = dt.copy()

I = dt_now['income'] > 1.5
dt_now = dt_now.drop(dt_now[I].index)
print(f'before: {dt_now.shape[0]} observations, {dt_now.shape[1]} variables')
dt_now = dt_now.iloc[5:]
print(f'after: {dt_now.shape[0]} observations, {dt_now.shape[1]} variables')

before: 70 observations, 4 variables
after: 65 observations, 4 variables


**Answer:** see A08.py

## Renaming

Rename *consumption* to *cons* and *income* to *inc*.

In [68]:
dt = dt.rename(columns = {'consumption': 'cons', 'income': 'inc'})
dt.head()

Unnamed: 0,id,inc,cons,ratio,assets_1,assets_2,assets_4
0,0,0.727981,0.853218,1.172033,0.849716,0.849716,-0.125237
1,1,1.997831,1.413447,0.70749,1.701262,1.701262,0.584385
2,2,0.276823,0.276823,1.900636,1.0,1.0,0.0
3,3,1.481931,1.217346,0.821459,1.317502,1.317502,0.264585
4,4,1.235904,1.111712,0.899513,1.14903,1.14903,0.124192


**Answer:** see A09.py

## Functions

Correct the wrong lines such that `assets_1 = assets_2 = assets_3 = assets_4`.

In [81]:
def assets_row_by_row(x,R,Y):
    return R * (x['inc'] - x['cons']) + Y
    
def assets_all_at_once(income,consumption,R,Y):
    return R * (income - consumption) + Y

def assets_adj(assets,R,Y):
    assets = 1

R = 1.2 # return rate
Y = 1 # income

dt['assets_1'] = R*(dt['inc']-dt['cons'])+Y
dt['assets_2'] = dt.apply(assets_row_by_row,axis=1,args=(R,Y))
dt['assets_3'] = assets_all_at_once(dt['inc'].values,dt['cons'].values,R,Y)
dt['assets_4'] = dt['inc']-dt['cons']
assets_adj(dt['assets_4'],R,Y)
dt.head()    

Unnamed: 0,id,inc,cons,ratio,assets_1,assets_2,assets_4,assets_3
0,0,0.727981,0.853218,1.172033,0.849716,0.849716,-0.125237,0.849716
1,1,1.997831,1.413447,0.70749,1.701262,1.701262,0.584385,1.701262
2,2,0.276823,0.276823,1.900636,1.0,1.0,0.0,1.0
3,3,1.481931,1.217346,0.821459,1.317502,1.317502,0.264585,1.317502
4,4,1.235904,1.111712,0.899513,1.14903,1.14903,0.124192,1.14903


**Answer:** see A10.py

# Problem

Load the data set in *data/NAH1_pivoted.xlsx* and clean and structure it such that the `plot_timeseries(dataframe)` below can be run and produce an interactive figure. 

In [82]:
def _plot_timeseries(dataframe, variable, years):
    
    fig = plt.figure(dpi=100)
    ax = fig.add_subplot(1,1,1)
    
    dataframe.loc[:,['year']] = pd.to_numeric(dataframe['year'])
    I = (dataframe['year'] >= years[0]) & (dataframe['year'] <= years[1])
    
    x = dataframe.loc[I,'year']
    y = dataframe.loc[I,variable]
    ax.plot(x,y)
    
    ax.set_xticks(list(range(years[0], years[1] + 1, 5)))    
    
def plot_timeseries(dataframe):
    
    widgets.interact(_plot_timeseries, 
    dataframe = widgets.fixed(dataframe),
    variable = widgets.Dropdown(
        description='variable', 
        options=['Y','C','G','I','X','M'], 
        value='Y'),
    years=widgets.IntRangeSlider(
        description="years",
        min=1966,
        max=2018,
        value=[1980, 2018],
        continuous_update=False,
    )                 
); 

**Hint 1:** You can base your renaming on this dictionary:

In [83]:
rename_dict = {}
rename_dict['P.1 Output'] = 'Y'
rename_dict['P.3 Final consumption expenditure'] = 'C'
rename_dict['P.3 Government consumption expenditure'] = 'G'
rename_dict['P.5 Gross capital formation'] = 'I'
rename_dict['P.6 Export of goods and services'] = 'X'
rename_dict['P.7 Import of goods and services'] = 'M'

**Hint 2:** You code should have the following structure:

In [99]:
# a. load data set
nah1 = pd.read_excel('data/NAH1_pivoted.xlsx',skiprows=2)

# b. rename variables
rename_dict['Unnamed: 0'] = 'year'
nah1.rename(columns = rename_dict, inplace = True)

# c. remove rows where Y is nan
I = nah1['Y'].notna()
nah1 = nah1[I]
nah1
# d. correct year column data
I = nah1['year'].notna()
J = nah1['year'].isna()
nah1.loc[J,['year']] = nah1.loc[I,['year']].values

# e. only keep rows with '2010-prices, chained values'
I = nah1['Unnamed: 1'] == '2010-prices, chained values'
nah1 = nah1[I]

# f. only keep renamed variables
nah1 = nah1.loc[:, list(rename_dict.values())]

# g. interactive plot
plot_timeseries(nah1)

     year                   Unnamed: 1          Y P.11 Market output  \
0    1966               Current prices   144582.0                 ..   
1    1966  2010-prices, chained values  1089347.0                 ..   
2    1967               Current prices   155581.0                 ..   
3    1967  2010-prices, chained values  1124579.0                 ..   
4    1968               Current prices   167877.0                 ..   
..    ...                          ...        ...                ...   
101  2016  2010-prices, chained values  3454769.0                 ..   
102  2017               Current prices  3761163.0                 ..   
103  2017  2010-prices, chained values  3533677.0                 ..   
104  2018               Current prices  3853990.0                 ..   
105  2018  2010-prices, chained values  3570475.0                 ..   

    P.12 Output for own final use P.13 Non-market output  \
0                              ..                     ..   
1              

interactive(children=(Dropdown(description='variable', options=('Y', 'C', 'G', 'I', 'X', 'M'), value='Y'), Int…

**Answer:** see A11.py

In [None]:
plot_timeseries(nah1)

# Extra problems

## Extend interactive plot

Extend the interactive plot with a choice of *real* vs *nominal*.

In [102]:
def _plot_timeseries(dataframe_dict, df_type, variable, years):
    
    fig = plt.figure(dpi=100)
    ax = fig.add_subplot(1,1,1)
    
    dataframe = dataframe_dict[df_type]
    dataframe.loc[:,['year']] = pd.to_numeric(dataframe['year'])
    I = (dataframe['year'] >= years[0]) & (dataframe['year'] <= years[1])
    
    x = dataframe.loc[I,'year']
    y = dataframe.loc[I,variable]
    ax.plot(x,y)
    
    ax.set_xticks(list(range(years[0], years[1] + 1, 5)))    
    
def plot_timeseries(dataframe_dict):
    
    widgets.interact(_plot_timeseries, 
    dataframe_dict = widgets.fixed(dataframe_dict),
    df_type = widgets.Dropdown(
        description='real or nominal', 
        options=['Real', 'Nominal'], 
        value='Real'),
    variable = widgets.Dropdown(
        description='variable', 
        options=['Y','C','G','I','X','M'], 
        value='Y'),
    years=widgets.IntRangeSlider(
        description="years",
        min=1966,
        max=2018,
        value=[1980, 2018],
        continuous_update=False,
    )                 
); 

In [103]:
# a. load data set
nah1 = pd.read_excel('data/NAH1_pivoted.xlsx',skiprows=2)

# b. rename variables
rename_dict['Unnamed: 0'] = 'year'
nah1.rename(columns = rename_dict, inplace = True)

# c. remove rows where Y is nan
I = nah1['Y'].notna()
nah1 = nah1[I]
nah1
# d. correct year column data
I = nah1['year'].notna()
J = nah1['year'].isna()
nah1.loc[J,['year']] = nah1.loc[I,['year']].values
nah1

# e. only keep rows with '2010-prices, chained values'
I = nah1['Unnamed: 1'] == '2010-prices, chained values'
J = nah1['Unnamed: 1'] == 'Current prices'
nah_chained = nah1[I]
nah1_current = nah1[J]

# f. only keep renamed variables
nah_chained = nah_chained.loc[:, list(rename_dict.values())]
nah1_current = nah1_current.loc[:, list(rename_dict.values())]

# g. interactive plot
dataframe_dict = {"Real": nah_chained, "Nominal": nah1_current}

plot_timeseries(dataframe_dict)

interactive(children=(Dropdown(description='real or nominal', options=('Real', 'Nominal'), value='Real'), Drop…

Load data from an Excel or CSV file you have downloaded from e.g. [Statistikbanken.dk](https://www.statistikbanken.dk/). Clean, structure and present the data as you see fit.