In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Settings:
pd.set_option('display.width', 190)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('max_colwidth', 200)
pd.options.display.float_format = '{:.4f}'.format
plt.style.use('default')
np.set_printoptions(threshold = 30, edgeitems = 30, precision = 2, suppress = False)

In [3]:
df = pd.read_csv("../original_data/gdp_m.csv")
df = df.rename({"date": "Date", "gdp": "GDP"}, axis=1)
df.Date = pd.to_datetime(df.Date)
# df = df.set_index("Date")
df = df.drop(columns=['year'])
df.head()

Unnamed: 0,Date,GDP
0,1947-01-01,243.164
1,1947-04-01,245.968
2,1947-07-01,249.585
3,1947-10-01,259.745
4,1948-01-01,265.742


In [4]:
bci = pd.read_csv("../original_data/BCI-values.csv")
bci.Date = pd.to_datetime(bci.Date, dayfirst=True)

# log transformation
bci.BCI = bci.BCI.apply(lambda x: np.log(x))
bci.BCIg = bci.BCIg.apply(lambda x: np.log(x + abs(min(bci.BCIg)) + 0.1)) # Because there are values that are negative, we transform all values just a little bit positive to be able to make a logarithmic transformation.
bci.BCIp = bci.BCIp.apply(lambda x: np.log(x + abs(min(bci.BCIp)) + 0.1))

In [5]:
df.Date = df.Date + pd.DateOffset(months=1, days=-1)
df = df[df['Date'] > bci.Date[0] + pd.DateOffset(months=-3)]
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Date,GDP
0,1967-01-31,844.17
1,1967-04-30,848.983
2,1967-07-31,865.233
3,1967-10-31,881.439
4,1968-01-31,909.387


In [6]:
lengths = []
for i in range(len(df) - 1):
    date1 = df['Date'][i]
    date2 = df['Date'][i + 1]
    length = len(bci[(bci.Date > date1) & (bci.Date <= date2)])
    if length != 0:
        lengths.append(length)
max(lengths)

14

In [7]:
for i in range(14):
    df[f'BCI_{i}'] = np.nan
    df[f'BCIp_{i}'] = np.nan
    df[f'BCIg_{i}'] = np.nan
    
for i in range(len(df) - 1):
    date1 = df['Date'][i]
    date2 = df['Date'][i + 1]
    temp = bci[(bci.Date > date1) & (bci.Date <= date2)]
    for idx in range(len(temp)):
        df = df.copy()
        df[f'BCI_{idx}'].iloc[i] = temp['BCI'].iloc[idx]
        df[f'BCIp_{idx}'].iloc[i] = temp['BCIp'].iloc[idx]
        df[f'BCIg_{idx}'].iloc[i] = temp['BCIg'].iloc[idx]

# TODO: better fillna
df = df.fillna(method="ffill", axis=1)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,Date,GDP,BCI_0,BCIp_0,BCIg_0,BCI_1,BCIp_1,BCIg_1,BCI_2,BCIp_2,BCIg_2,BCI_3,BCIp_3,BCIg_3,BCI_4,BCIp_4,BCIg_4,BCI_5,BCIp_5,BCIg_5,BCI_6,BCIp_6,BCIg_6,BCI_7,BCIp_7,BCIg_7,BCI_8,BCIp_8,BCIg_8,BCI_9,BCIp_9,BCIg_9,BCI_10,BCIp_10,BCIg_10,BCI_11,BCIp_11,BCIg_11,BCI_12,BCIp_12,BCIg_12,BCI_13,BCIp_13,BCIg_13
0,1967-01-31,844.17,4.6052,6.587,3.4751,4.6052,6.5863,3.4751,4.6012,6.5774,3.4751,4.6032,6.582,3.4751,4.6042,6.5852,3.4751,4.6022,6.5795,3.4751,4.6042,6.5848,3.4751,4.6042,6.5837,3.4751,4.5971,6.5672,3.4751,4.5901,6.5521,3.4751,4.5941,6.5622,3.4751,4.6032,6.5823,3.4751,3.4751,3.4751,3.4751,3.4751,3.4751,3.4751
1,1967-04-30,848.983,4.6092,6.587,3.4751,4.6102,6.587,3.4751,4.6062,6.5779,3.4751,4.6032,6.5707,3.4751,4.6072,6.5809,3.4751,4.6112,6.587,3.4751,4.6191,6.587,3.4751,4.623,6.587,3.4751,4.625,6.587,3.4751,4.6279,6.587,3.4751,4.6357,6.587,3.4751,4.6367,6.587,3.4751,4.6367,6.5867,3.4751,3.4751,3.4751,3.4751
2,1967-07-31,865.233,4.6386,6.587,3.4751,4.6386,6.5869,3.4751,4.6386,6.587,3.4751,4.6405,6.587,3.4751,4.6396,6.5842,3.4751,4.6444,6.587,3.4751,4.6473,6.587,3.4751,4.653,6.587,3.4751,4.653,6.587,3.4751,4.6511,6.5824,3.4751,4.653,6.5866,3.4751,4.6492,6.5773,3.4751,4.6492,6.5774,3.4751,3.4751,3.4751,3.4751
3,1967-10-31,881.439,4.6501,6.5791,3.4751,4.6463,6.5702,3.4751,4.6454,6.5696,3.4751,4.6511,6.5831,3.4751,4.6644,6.587,3.4751,4.67,6.587,3.4751,4.6756,6.587,3.4751,4.6766,6.587,3.4751,4.6691,6.5695,3.4751,4.6653,6.5607,3.4751,4.6663,6.5623,3.4751,4.6672,6.5643,3.4751,4.6597,6.5472,3.4751,3.4751,3.4751,3.4751
4,1968-01-31,909.387,4.654,6.5342,3.7977,4.653,6.531,3.7751,4.6463,6.5144,3.7471,4.6463,6.5146,3.7257,4.6473,6.5184,3.7136,4.6405,6.502,3.6939,4.6454,6.5137,3.6889,4.6454,6.5134,3.6839,4.6511,6.5265,3.6839,4.6625,6.5544,3.7087,4.6747,6.5827,3.74,4.6868,6.587,3.7842,4.6849,6.5833,3.8155,3.8155,3.8155,3.8155


In [8]:
# difference transformation
# df = df.diff()
# df = df.drop(index='1967-02-09')

In [9]:
# Histograms of features
features = []
for i in range(14):
    features.extend(['BCI_{i}', 'BCIp_{i}', 'BCIg{i}'])

def plot():
    for feature in features:
        df.hist(column = feature, bins = 50)
        plt.xlabel(feature,fontsize=15)
        plt.ylabel("Frequency",fontsize=15)
        plt.show()
        df.plot(y=feature)

# plot()

In [10]:
df.to_csv("../merged_data/BCI_GDP.csv")