In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime,timedelta
import os
import pickle
import codecs
import re
from tqdm import tqdm_notebook as tqdm
import plotly.express as px


#For tokenizing sentences
import nltk
nltk.download('punkt')


from sklearn.decomposition import PCA

# Cte
BASE_PATH = "data/"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\js_ma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# CanBank dataset 

In [3]:
canbank_df = pd.read_csv(f"{BASE_PATH}\dataset\CanBank\monetary_policy_report.csv")
canbank_df['date'] = pd.to_datetime(canbank_df    ['date'], format='%Y-%m-%d')

canbank_df.head()

Unnamed: 0.1,Unnamed: 0,date,text
0,0,2021-01-20,Monetary Policy\r\nReport\r\n\r\nJanuary 2021\...
1,1,2016-10-19,MONETARY\r\nPOLICY\r\nREPORT\r\nOctober 2016\r...
2,2,2019-01-09,Monetary Policy \r\nReport\r\n\r\nJanuary 2019...
3,3,2020-10-28,Monetary Policy\r\nReport\r\n\r\nOctober 2020\...
4,4,2016-07-13,MONETARY\r\nPOLICY\r\nREPORT\r\nJuly 2016\r\n\...


# Economic Index Ca

In [4]:
def stretch_to_daybyday(df):
    df_tmp = df.copy(deep=True)

    row_iterator = df.iterrows()
    _, last = next(row_iterator)  # take first item from row_iterator
    for i, row in row_iterator:
        try:
            date_diff = (row.date - last.date).days
                # duplace row for the difference
            tmp =  pd.DataFrame([last])
            tmp =  pd.concat([tmp]* (date_diff-1), ignore_index=True)

            # update the date or each to make it increase
            for j, tmp_row in tmp.iterrows():
                tmp.loc[j, 'date']= tmp_row.date + timedelta(days=j+1)
                

            df_tmp = pd.concat([df_tmp,  tmp], ignore_index=True)

            last = row
        except:
            last = row


    df = df_tmp.sort_values(by='date', ignore_index=True )

    return df

In [5]:
CA_interest_rate_df = pd.read_csv(f"{BASE_PATH}\dataset\EconomicIndexCa\CA.-interest_rate.csv", on_bad_lines='skip')

CA_interest_rate_df['date'] = pd.to_datetime(CA_interest_rate_df    ['date'], format='%Y-%m-%d')

CA_interest_rate_df = CA_interest_rate_df.loc[CA_interest_rate_df['date'] >= datetime(2016, 1,20)]

CA_interest_rate_df = CA_interest_rate_df.sort_values(by='date')

CA_interest_rate_df.head()

Unnamed: 0.1,Unnamed: 0,date,V39078
1057,1057,2016-01-20,0.75
1058,1058,2016-01-21,0.75
1059,1059,2016-01-22,0.75
1060,1060,2016-01-25,0.75
1061,1061,2016-01-26,0.75


In [6]:
CA_interest_rate_df = stretch_to_daybyday(CA_interest_rate_df)

CA_interest_rate_df = CA_interest_rate_df.loc[CA_interest_rate_df['date'] >= '2017-01-01']


CA_interest_rate_df.head()

Unnamed: 0.1,Unnamed: 0,date,V39078
347,1304,2017-01-01,0.75
348,1305,2017-01-02,0.75
349,1306,2017-01-03,0.75
350,1307,2017-01-04,0.75
351,1308,2017-01-05,0.75


In [7]:
bcpiWeekly_df = pd.read_csv(f"{BASE_PATH}\dataset\EconomicIndexCa\BCPI_WEEKLY.csv",  skiprows=17)

bcpiWeekly_df['date'] = pd.to_datetime(bcpiWeekly_df['date'], format='%Y-%m-%d')

bcpiWeekly_df = bcpiWeekly_df.loc[bcpiWeekly_df['date'] >= '2016-1-20']

bcpiWeekly_df = bcpiWeekly_df.sort_values(by='date')


bcpiWeekly_df.head()

Unnamed: 0,date,W.BCPI,W.BCNE,W.ENER,W.MTLS,W.FOPR,W.AGRI,W.FISH
2297,2016-01-20,278.38,286.22,477.31,455.07,305.42,211.3,1229.53
2298,2016-01-27,288.88,285.78,518.26,459.76,295.64,212.25,1229.53
2299,2016-02-03,293.49,290.4,526.74,465.53,301.22,215.25,1315.53
2300,2016-02-10,284.34,292.92,486.36,472.34,305.74,214.08,1372.86
2301,2016-02-17,283.31,293.79,480.79,477.93,304.09,213.72,1372.86


In [8]:
bcpiWeekly_df = stretch_to_daybyday(bcpiWeekly_df)

bcpiWeekly_df = bcpiWeekly_df.loc[bcpiWeekly_df['date'] >= '2017-01-01']


bcpiWeekly_df.head()

Unnamed: 0,date,W.BCPI,W.BCNE,W.ENER,W.MTLS,W.FOPR,W.AGRI,W.FISH
347,2017-01-01,390.61,301.6,940.66,488.18,351.48,206.73,1239.72
348,2017-01-02,390.61,301.6,940.66,488.18,351.48,206.73,1239.72
349,2017-01-03,390.61,301.6,940.66,488.18,351.48,206.73,1239.72
350,2017-01-04,393.77,303.21,951.91,488.66,356.42,207.02,1293.84
351,2017-01-05,393.77,303.21,951.91,488.66,356.42,207.02,1293.84


In [9]:
COCWeights_df = pd.read_csv(f"{BASE_PATH}\dataset\EconomicIndexCa\Crude_Oil_Component_Weights.csv",  skiprows=13)[:-3]

COCWeights_df['date'] = pd.to_datetime(COCWeights_df['date'], format='%Y-%m-%d')

COCWeights_df = COCWeights_df.loc[COCWeights_df['date'] >= datetime(2016, 1,20)]

COCWeights_df.head()

Unnamed: 0,date,WGTS.AGRI,WGTS.BRENT,WGTS.COAL,WGTS.FISH,WGTS.FOPR,WGTS.MTLS,WGTS.NATURALGAS,WGTS.WCC,WGTS.WTI
45,2017-01-01,16.8238177488,2.6517676527,2.9369892871,1.590292638,10.8781813785,20.3316770145,6.064761703,17.1102155445,21.6122970329
46,2018-01-01,14.9335575908,2.3116602478,2.7416930455,1.3680346696,11.0796011495,20.0983899319,4.665824952,16.8821373885,25.9191010244
47,2019-01-01,15.2025083963,1.9012439808,2.4885951732,1.4277199905,8.6495949304,21.508156508,4.4604352465,19.9244286755,24.4373170989
48,2020-01-01,17.4566394794,1.5346126626,1.9577129734,1.0703766389,11.101836699,25.5073865836,5.5209373803,15.1310013257,20.7194962571
49,2021-01-01,13.2793074605,1.3630096525,2.7860052178,1.0647745061,11.2815105875,20.214660044,4.848311095,21.5173560028,23.6450654339


In [10]:
COCWeights_df = stretch_to_daybyday(COCWeights_df)

COCWeights_df.head()

Unnamed: 0,date,WGTS.AGRI,WGTS.BRENT,WGTS.COAL,WGTS.FISH,WGTS.FOPR,WGTS.MTLS,WGTS.NATURALGAS,WGTS.WCC,WGTS.WTI
0,2017-01-01,16.8238177488,2.6517676527,2.9369892871,1.590292638,10.8781813785,20.3316770145,6.064761703,17.1102155445,21.6122970329
1,2017-01-02,16.8238177488,2.6517676527,2.9369892871,1.590292638,10.8781813785,20.3316770145,6.064761703,17.1102155445,21.6122970329
2,2017-01-03,16.8238177488,2.6517676527,2.9369892871,1.590292638,10.8781813785,20.3316770145,6.064761703,17.1102155445,21.6122970329
3,2017-01-04,16.8238177488,2.6517676527,2.9369892871,1.590292638,10.8781813785,20.3316770145,6.064761703,17.1102155445,21.6122970329
4,2017-01-05,16.8238177488,2.6517676527,2.9369892871,1.590292638,10.8781813785,20.3316770145,6.064761703,17.1102155445,21.6122970329


## Concat dataframe

In [11]:
indices_df = pd.concat([ CA_interest_rate_df.set_index('date'), bcpiWeekly_df.set_index('date'),COCWeights_df.set_index('date')], axis=1, join='inner')
indices_df.head()

Unnamed: 0_level_0,Unnamed: 0,V39078,W.BCPI,W.BCNE,W.ENER,W.MTLS,W.FOPR,W.AGRI,W.FISH,WGTS.AGRI,WGTS.BRENT,WGTS.COAL,WGTS.FISH,WGTS.FOPR,WGTS.MTLS,WGTS.NATURALGAS,WGTS.WCC,WGTS.WTI
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-01-01,1304,0.75,390.61,301.6,940.66,488.18,351.48,206.73,1239.72,16.8238177488,2.6517676527,2.9369892871,1.590292638,10.8781813785,20.3316770145,6.064761703,17.1102155445,21.6122970329
2017-01-02,1305,0.75,390.61,301.6,940.66,488.18,351.48,206.73,1239.72,16.8238177488,2.6517676527,2.9369892871,1.590292638,10.8781813785,20.3316770145,6.064761703,17.1102155445,21.6122970329
2017-01-03,1306,0.75,390.61,301.6,940.66,488.18,351.48,206.73,1239.72,16.8238177488,2.6517676527,2.9369892871,1.590292638,10.8781813785,20.3316770145,6.064761703,17.1102155445,21.6122970329
2017-01-04,1307,0.75,393.77,303.21,951.91,488.66,356.42,207.02,1293.84,16.8238177488,2.6517676527,2.9369892871,1.590292638,10.8781813785,20.3316770145,6.064761703,17.1102155445,21.6122970329
2017-01-05,1308,0.75,393.77,303.21,951.91,488.66,356.42,207.02,1293.84,16.8238177488,2.6517676527,2.9369892871,1.590292638,10.8781813785,20.3316770145,6.064761703,17.1102155445,21.6122970329


In [12]:
indices_df.to_numpy()

array([[1304, 0.75, 390.61, ..., '6.0647617030', '17.1102155445',
        '21.6122970329'],
       [1305, 0.75, 390.61, ..., '6.0647617030', '17.1102155445',
        '21.6122970329'],
       [1306, 0.75, 390.61, ..., '6.0647617030', '17.1102155445',
        '21.6122970329'],
       ...,
       [2608, 0.5, 652.18, ..., '4.8483110950', '21.5173560028',
        '23.6450654339'],
       [2609, 0.5, 652.18, ..., '4.8483110950', '21.5173560028',
        '23.6450654339'],
       [2609, 0.5, 652.18, ..., '6.7587836141', '24.7020281208',
        '26.0219420483']], dtype=object)

# PCA

In [13]:
pca = PCA(n_components='mle')
indices_df['pca_vec'] = pca.fit_transform(indices_df.to_numpy()).tolist()
indices_df.pca_vec

date
2017-01-01    [-661.4044001535486, -194.31488693869585, -37....
2017-01-02    [-660.5515778131081, -193.85149436714698, -37....
2017-01-03    [-659.6987554726717, -193.38810179559056, -37....
2017-01-04    [-638.3136829542099, -224.3691571850476, -69.3...
2017-01-05    [-637.4608606137723, -223.90576461349437, -69....
                                    ...                        
2021-12-28    [926.7373595677682, -240.72737881339606, -16.3...
2021-12-29    [961.3904857062244, -308.89584063574955, 19.11...
2021-12-30    [962.2433080466618, -308.4324480641961, 19.085...
2021-12-31    [963.0961303870998, -307.96905549264284, 19.05...
2022-01-01    [963.1083144338832, -308.0160273339823, 19.080...
Freq: D, Name: pca_vec, Length: 1827, dtype: object

In [14]:
indices_df.to_csv("data/result/indices_PCA.csv", index=False)

In [None]:

# features = indices_df.columns.tolist()

# pca = PCA()
# components = pca.fit_transform(indices_df.to_numpy())
# labels = {
#     str(i): f"PC {i+1} ({var:.1f}%)"
#     for i, var in enumerate(pca.explained_variance_ratio_ * 100)
# }

# fig = px.scatter_matrix(
#     components,
#     labels=labels,
#     dimensions=range(4),
#     # color=indices_df["species"]
# )
# fig.update_traces(diagonal_visible=False)
# fig.show()