In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Settings:
pd.set_option('display.width', 190)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('max_colwidth', 200)
pd.options.display.float_format = '{:.4f}'.format
plt.style.use('default')
np.set_printoptions(threshold = 30, edgeitems = 30, precision = 2, suppress = False)

In [3]:
df = pd.read_csv("../original_data/gdp_m.csv")
df = df.rename({"date": "Date", "gdp": "GDP"}, axis=1)
df.Date = pd.to_datetime(df.Date)
# df = df.set_index("Date")
df = df.drop(columns=['year'])
df.head()

Unnamed: 0,Date,GDP
0,1947-01-01,243.164
1,1947-04-01,245.968
2,1947-07-01,249.585
3,1947-10-01,259.745
4,1948-01-01,265.742


In [4]:
bci = pd.read_csv("../original_data/BCI-values.csv")
bci.Date = pd.to_datetime(bci.Date, dayfirst=True)

# log transformation
# bci.BCI = bci.BCI.apply(lambda x: np.log(x))
# bci.BCIg = bci.BCIg.apply(lambda x: np.log(x + abs(min(bci.BCIg)) + 0.1)) # Because there are values that are negative, we transform all values just a little bit positive to be able to make a logarithmic transformation.
# bci.BCIp = bci.BCIp.apply(lambda x: np.log(x + abs(min(bci.BCIp)) + 0.1))

In [5]:
df.Date = df.Date + pd.DateOffset(months=1, days=-1)
df = df[df['Date'] > bci.Date[0] + pd.DateOffset(months=-3)]
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Date,GDP
0,1967-01-31,844.17
1,1967-04-30,848.983
2,1967-07-31,865.233
3,1967-10-31,881.439
4,1968-01-31,909.387


In [6]:
lengths = []
for i in range(len(df) - 1):
    date1 = df['Date'][i]
    date2 = df['Date'][i + 1]
    length = len(bci[(bci.Date > date1) & (bci.Date <= date2)])
    if length != 0:
        lengths.append(length)
max(lengths)

14

In [7]:
for i in range(14):
    df[f'BCI_{i}'] = np.nan
    df[f'BCIp_{i}'] = np.nan
    df[f'BCIg_{i}'] = np.nan
    
for i in range(len(df) - 1):
    date1 = df['Date'][i]
    date2 = df['Date'][i + 1]
    temp = bci[(bci.Date > date1) & (bci.Date <= date2)]
    for idx in range(len(temp)):
        df = df.copy()
        df[f'BCI_{idx}'].iloc[i] = temp['BCI'].iloc[idx]
        df[f'BCIp_{idx}'].iloc[i] = temp['BCIp'].iloc[idx]
        df[f'BCIg_{idx}'].iloc[i] = temp['BCIg'].iloc[idx]

# TODO: better fillna
df = df.fillna(method="ffill", axis=1)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,Date,GDP,BCI_0,BCIp_0,BCIg_0,BCI_1,BCIp_1,BCIg_1,BCI_2,BCIp_2,BCIg_2,BCI_3,BCIp_3,BCIg_3,BCI_4,BCIp_4,BCIg_4,BCI_5,BCIp_5,BCIg_5,BCI_6,BCIp_6,BCIg_6,BCI_7,BCIp_7,BCIg_7,BCI_8,BCIp_8,BCIg_8,BCI_9,BCIp_9,BCIg_9,BCI_10,BCIp_10,BCIg_10,BCI_11,BCIp_11,BCIg_11,BCI_12,BCIp_12,BCIg_12,BCI_13,BCIp_13,BCIg_13
0,1967-01-31,844.17,100.0,100.0,0.0,100.0,99.5,0.0,99.6,93.1,0.0,99.8,96.4,0.0,99.9,98.7,0.0,99.7,94.6,0.0,99.9,98.4,0.0,99.9,97.6,0.0,99.2,85.8,0.0,98.5,75.1,0.0,98.9,82.2,0.0,99.8,96.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1967-04-30,848.983,100.4,100.0,0.0,100.5,100.0,0.0,100.1,93.4,0.0,99.8,88.3,0.0,100.2,95.6,0.0,100.6,100.0,0.0,101.4,100.0,0.0,101.8,100.0,0.0,102.0,100.0,0.0,102.3,100.0,0.0,103.1,100.0,0.0,103.2,100.0,0.0,103.2,99.8,0.0,0.0,0.0,0.0
2,1967-07-31,865.233,103.4,100.0,0.0,103.4,99.9,0.0,103.4,100.0,0.0,103.6,100.0,0.0,103.5,98.0,0.0,104.0,100.0,0.0,104.3,100.0,0.0,104.9,100.0,0.0,104.9,100.0,0.0,104.7,96.7,0.0,104.9,99.7,0.0,104.5,93.0,0.0,104.5,93.1,0.0,0.0,0.0,0.0
3,1967-10-31,881.439,104.6,94.3,0.0,104.2,87.9,0.0,104.1,87.5,0.0,104.7,97.2,0.0,106.1,100.0,0.0,106.7,100.0,0.0,107.3,100.0,0.0,107.4,100.0,0.0,106.6,87.4,0.0,106.2,81.2,0.0,106.3,82.3,0.0,106.4,83.7,0.0,105.6,71.7,0.0,0.0,0.0,0.0
4,1968-01-31,909.387,105.0,62.7,12.3,104.9,60.5,11.3,104.2,49.2,10.1,104.2,49.3,9.2,104.3,51.9,8.7,103.6,40.9,7.9,104.1,48.7,7.7,104.1,48.5,7.5,104.7,57.4,7.5,105.9,76.7,8.5,107.2,96.9,9.8,108.5,100.0,11.7,108.3,97.3,13.1,13.1,13.1,13.1


In [8]:
# difference transformation
# df = df.diff()
# df = df.drop(index='1967-02-09')

In [9]:
# Histograms of features
features = []
for i in range(14):
    features.extend(['BCI_{i}', 'BCIp_{i}', 'BCIg{i}'])

def plot():
    for feature in features:
        df.hist(column = feature, bins = 50)
        plt.xlabel(feature,fontsize=15)
        plt.ylabel("Frequency",fontsize=15)
        plt.show()
        df.plot(y=feature)

# plot()

In [10]:
df.to_csv("../merged_data/BCI_GDP.csv")