In [1]:
#import/instalations
!pip install yfinance
import yfinance as yf
import pandas as pd
import requests

Collecting yfinance
  Downloading yfinance-0.2.43-py2.py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.6/84.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting requests>=2.31
  Downloading requests-2.31.0-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting multitasking>=0.0.7
  Downloading multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Collecting peewee>=3.16.2
  Downloading peewee-3.17.6.tar.gz (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: peewee
  Building wheel for peewee (pyproject.toml) ... [?25ldone
[?25h

# Forex data

In [2]:
#FUNCTIONS

#Download forex data for 1 pair (by default the last 5 days in 1 minute intervals)
def download_data(cur_1, cur_2, t_period='5d', t_interval='1m'):
    """Download price history for one currency pair through yfinance.

    Parameters
    ----------
    cur_1, cur_2 : str
        Currency codes; the ticker requested is ``f'{cur_1}{cur_2}=X'``.
    t_period : str, default '5d'
        Forwarded to ``Ticker.history(period=...)``.
    t_interval : str, default '1m'
        Forwarded to ``Ticker.history(interval=...)``.

    Returns
    -------
    pandas.DataFrame
        Indexed by integer Unix seconds (index name ``'timestamp'``).
        Open/High/Low/Close are renamed to ``<cur_1><cur_2>_OPEN`` etc.;
        columns that contain only zeros are removed, any other column keeps
        its original name.
    """
    #DOWNLOAD DATA
    data = yf.Ticker(f'{cur_1}{cur_2}=X')
    f_data = data.history(period=t_period, interval=t_interval)
    #REMOVE COLUMNS THAT CONTAIN ONLY 0
    mask = f_data.ne(0).any(axis=0)

    #RENAME COLUMNS
    pair = f'{cur_1}{cur_2}'
    f_data = f_data.loc[:, mask].rename(columns={'Open': f'{pair}_OPEN', 'High': f'{pair}_HIGH',
                                                 'Low': f'{pair}_LOW', 'Close': f'{pair}_CLOSE'})

    # Build the epoch index straight from the datetime index instead of
    # renaming a 'Datetime' column: yfinance labels the index 'Datetime' for
    # intraday bars but 'Date' for daily (and longer) bars, and an empty
    # result may carry neither name. Relying on the label made daily
    # downloads fail with KeyError: 'timestamp'.
    f_data.index = pd.Index([int(ts.timestamp()) for ts in f_data.index],
                            dtype='int64', name='timestamp')
    return f_data

#Download new forex data and merge it with old one
def update_forex_data(old_data_path, output_path='forex_data.feather'):
    """Refresh the stored forex table with freshly downloaded quotes.

    Parameters
    ----------
    old_data_path : str or path-like
        Feather file holding the previously saved table; it must contain a
        ``'timestamp'`` column.
    output_path : str or path-like, default 'forex_data.feather'
        Where the merged table is written (feather format).

    Returns
    -------
    None
        The merged table is only written to ``output_path``.
    """
    #LOAD OLD DATA GMT +1
    old_data = pd.read_feather(old_data_path).set_index('timestamp')

    #CURRENCIES PAIRS WHICH WE WANT DOWNLOAD
    main_cur = ['PLN', 'EUR']
    additional_cur = ['CZK', 'HUF', 'USD', 'CHF', 'GBP', 'JPY']

    #DOWNLOAD OUR MAIN CURRENCY PAIR
    forex_data = download_data('EUR', 'PLN')

    #DOWNLOAD OUR ADDITIONAL CURRENCY PAIRS AND JOIN THEM TO MAIN TABLE
    # (left join: only timestamps present in the EUR/PLN frame are kept)
    for main in main_cur:
        for add in additional_cur:
            forex_data = forex_data.join(download_data(main, add))

    #JOIN OLD AND NEW DATA(FROM THIS WEEK)
    forex_data = pd.concat([forex_data, old_data])

    #remove duplicates and sort values
    # De-duplicate BEFORE sorting: the new rows come first in the concat, so
    # keep='first' always prefers the fresh download. Sorting first (as the
    # previous version did) used the default non-stable sort, which left the
    # surviving duplicate arbitrary.
    forex_data = forex_data[~forex_data.index.duplicated(keep='first')]
    forex_data = forex_data.sort_index().rename_axis('timestamp').reset_index()

    #SAVE TO feather
    forex_data.to_feather(output_path)
    
#Download daily forex data - DEPRECATED use "download_data"
def daily_forex_data(pair="EUR/PLN", interval="1day", size="5000", api_key=None):
    """Fetch a time series from the Twelve Data endpoint on RapidAPI.

    Parameters
    ----------
    pair : str, default "EUR/PLN"
        Sent as the ``symbol`` query parameter.
    interval : str, default "1day"
        Sent as the ``interval`` query parameter.
    size : str, default "5000"
        Sent as the ``outputsize`` query parameter.
    api_key : str, optional
        RapidAPI key. When omitted, the ``RAPIDAPI_KEY`` environment
        variable is used.

    Returns
    -------
    pandas.DataFrame
        Built from the ``'values'`` entry of the JSON response.

    Raises
    ------
    RuntimeError
        If no API key is available.
    requests.HTTPError
        If the service answers with an HTTP error status.
    KeyError
        If the JSON response has no ``'values'`` entry.
    """
    # Local import so the function carries its own dependency.
    import os

    # The key used to be hard-coded here; credentials must not live in the
    # notebook, so it now comes from the caller or the environment.
    api_key = api_key or os.environ.get("RAPIDAPI_KEY")
    if not api_key:
        raise RuntimeError(
            "No RapidAPI key available: pass api_key=... or set the RAPIDAPI_KEY environment variable"
        )

    url = "https://twelve-data1.p.rapidapi.com/time_series"
    querystring = {"symbol":pair,"interval":interval,"outputsize":size,"format":"json"}

    headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": "twelve-data1.p.rapidapi.com"
    }

    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, params=querystring, timeout=30)
    response.raise_for_status()
    payload = response.json()
    if 'values' not in payload:
        # Same exception type as before, but with the service's answer attached.
        raise KeyError(f"'values' missing from response: {payload}")
    return pd.DataFrame(payload['values'])

In [3]:
#Update forex data
# Feather file with the previously saved table; passed to update_forex_data
# as the "old" data to merge with.
forex_path = '/kaggle/input/forex-data-gatherer/forex_data.feather'
# Downloads the current quotes, merges them with the file above and writes
# the result to 'forex_data.feather' in the working directory.
update_forex_data(forex_path)

In [None]:
#Daily forex data
# EUR/PLN history requested with period '5000d' and interval '1d'.
daily_forex = download_data('EUR', 'PLN', '5000d', '1d')

# Economic calendar data

In [None]:
#import
import pandas as pd
import numpy as np
import requests

In [None]:
#Build consecutive 30-day (from, to) windows for the economic calendar
#(the calendar accepts at most a 30-day span per request)
initial_date = "2010-01-01"
end_date_init = pd.Timestamp.today().normalize()

dates_pairs = []
start_date = pd.to_datetime(initial_date)
# Each window starts where the previous one ended; the last window is the
# first one that reaches past today.
while not start_date > end_date_init:
    end_date = start_date + pd.Timedelta(days=30)
    window = (start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'))
    dates_pairs.append(window)
    start_date = end_date

In [None]:
#Create and save economic calendar with all possible fetched data
url = 'https://economic-calendar.tradingview.com/events'
#Columns that are not needed downstream
calendar_drop = ['id', 'period', 'source', 'currency', 'ticker', 'unit', 'scale']

df_list = []
failed_pairs = []  # (window, error) for every window that could not be processed
for pair in dates_pairs:
    try:
        payload = {
            'from': pair[0],
            'to': pair[1]
        }
        # Timeout so a stalled connection cannot hang the whole run.
        response = requests.get(url, params=payload, timeout=30)
        response.raise_for_status()
        data = response.json()
        calendar_df = pd.DataFrame(data['result'])
        if calendar_df.empty:
            # Nothing published in this window - not an error.
            continue

        #Drop unimportant columns and rows
        # errors='ignore': a column missing from one response should not
        # throw away the whole window.
        calendar_df.drop(calendar_drop, axis=1, inplace=True, errors='ignore')
        calendar_df.dropna(subset=['actual'], inplace=True)

        #Sort and convert time to the Europe/London zone
        # NOTE(review): this was labelled "GMT +1", but Europe/London is UTC+0
        # in winter and UTC+1 only during summer time - confirm the intended zone.
        calendar_df['date'] = pd.to_datetime(calendar_df['date'], dayfirst=True).dt.tz_convert('Europe/London')
        calendar_df['timestamp'] = calendar_df['date'].apply(lambda x: x.timestamp()).astype(int)
        calendar_df.set_index('timestamp', inplace=True)
        calendar_df.sort_index(inplace=True)
        df_list.append(calendar_df)
    except Exception as e:
        # Still best-effort (one bad window must not stop the run), but the
        # failure is recorded instead of being silently discarded.
        failed_pairs.append((pair, repr(e)))

if failed_pairs:
    print(f"{len(failed_pairs)} of {len(dates_pairs)} calendar windows failed; first failure: {failed_pairs[0]}")
if not df_list:
    raise ValueError("No economic calendar data could be fetched - nothing to save")

economic_data = pd.concat(df_list)
economic_data.reset_index(inplace=True)
economic_data.to_feather("economic_data.feather")

# EDA

In [None]:
# Load the saved forex table from the Kaggle input dataset.
forex_data = pd.read_feather('/kaggle/input/forex-data-gatherer/forex_data.feather')

In [None]:
#Economic calendar data
economic_df = pd.read_feather('/kaggle/input/economic-calendar-data/economic_data.feather')

In [None]:
# Keep only the Polish events and renumber the rows 0..n-1
# (the old row labels are discarded, not kept as a column).
economic_poland = (
    economic_df.loc[economic_df['country'] == 'PL']
    .copy()
    .reset_index(drop=True)
)

In [None]:
# fix some data(sometimes records from previous months are saved next month as duplicates)
# Start again from the unfiltered frame; the rows keep economic_df's original index labels.
economic_poland = economic_df[economic_df['country']=='PL'].copy()

# Rows whose (date, title) combination occurs more than once.
mask = economic_poland.duplicated(subset=['date', 'title'], keep=False)
for idx, [index, row] in enumerate(economic_poland.loc[mask].iterrows()):
    try:
        # Next row among the duplicated ones. `.loc[mask]` is re-evaluated here, so it
        # reflects dates already rewritten by earlier iterations. For the final row
        # idx+1 is out of range; that IndexError is swallowed by the except below.
        pair = economic_poland.loc[mask].iloc[idx+1]
        if all(row[['date', 'title']] == pair[['date', 'title']]):
            if row['actual'] == pair['previous']:
                # `row` carries the value the other record reports as "previous",
                # so `row` is treated as the earlier release: move its date back by
                # the median number of days between records of this title.
                new_date = pd.to_datetime(row['date'])
                time_diff = pd.to_datetime(economic_poland[economic_poland['title'] == row['title']]['date']).diff().dt.days.median()
                new_date = new_date - pd.Timedelta(days=time_diff)
                # The corrected date is stored as a 'YYYY-MM-DD' string.
                economic_poland.loc[[index], ['date']] = new_date.strftime('%Y-%m-%d')
            else:
                # Otherwise the following record is the one that gets moved back.
                new_date = pd.to_datetime(pair['date'])
                time_diff = pd.to_datetime(economic_poland[economic_poland['title'] == pair['title']]['date']).diff().dt.days.median()
                new_date = new_date - pd.Timedelta(days=time_diff)
                economic_poland.loc[[pair.name], ['date']] = new_date.strftime('%Y-%m-%d')
    except Exception as e:
        # NOTE(review): every exception is skipped silently, not only the expected
        # out-of-range lookup on the last row - consider narrowing this.
        continue
        #print(e)
        #print(e)

In [None]:
#Create new df with a continuous range of dates and all indicator values for every day

# One column per indicator title holding its 'actual' value; rows keep economic_poland's index.
df_pivot = economic_poland.pivot(columns='title', values='actual')

#merge tables to update dates
# Join the pivoted values back to the source rows (same index), then drop the
# descriptive columns so that the title columns and the remaining source columns are left.
merged = df_pivot.merge(economic_poland, left_index=True, right_index=True)
merged.drop(['title', 'country', 'indicator', 'comment','actual', 'previous', 'forecast', 'importance'], axis=1, inplace=True)
# Moves the old row labels into a column named 'index' (dropped again further down).
merged.reset_index(inplace=True)

#create continuous dates from oldest to newest
idx = pd.date_range(merged.date.min(), merged.date.max())
idx = idx.strftime('%Y-%m-%d')

#create new dataframe with full set of date range
# All-NaN scaffold: one row per calendar day, same columns as `merged` except 'date'.
new_df = pd.DataFrame(index=idx, columns=merged.columns)
new_df.drop('date', axis=1, inplace=True)

# change date column to str so it can be merged with the string index above
merged['date'] = merged['date'].apply(lambda x: x.strftime('%Y-%m-%d'))

#merge both dataframes table with values in full set of ranges
# Left merge keeps every calendar day. Overlapping column names coming from the
# scaffold get the "_x" suffix; being entirely NaN they are removed by the dropna below.
fullset = pd.merge(new_df, merged, how='left', right_on='date', left_index=True, suffixes=("_x", None))
fullset.dropna(axis=1, how='all', inplace=True)

#fill all nans with values from previous rows(newest)
fullset = fullset.ffill()

#drop duplicates
# Several events can share a day; keep the last row for each date.
fullset.drop_duplicates(subset=['date'], inplace=True, keep='last')
fullset.set_index('date', inplace=True)
fullset.drop('index', inplace=True, axis=1)

#fill rest of nan values(oldest data) with oldest 'previous' value from main df - check if it is not better to leave nan
# NOTE(review): the loop visits every column of fullset, not only indicator titles;
# a non-title column whose first value is NaN would make `.iloc[0]` fail - confirm.
for col in fullset.columns:
    if pd.isna(fullset[col].iloc[0]):
        value = economic_poland.loc[economic_poland['title'] == col].iloc[0]['previous']
        fullset[col] = fullset[col].fillna(value)
        
#check if last row in our dataframe is correct
# Single-row frame with the last 'actual' value recorded for every title.
test_df = pd.DataFrame()
for title in economic_poland['title'].unique():
    test_df[title] = [economic_poland[economic_poland["title"]==title].iloc[-1]["actual"]]
    
# Compare the last day of fullset (without the 'timestamp' column) against those values.
test_true = fullset.drop('timestamp', axis=1).iloc[-1] == test_df
print(test_true.iloc[0].unique()) #it should only contain "True" values

In [None]:
# Display the date-indexed, forward-filled indicator table.
fullset

In [None]:
# Consistency check: every 'actual' value recorded in economic_poland should
# show up in fullset somewhere between its own date and the day before the next
# record of the same title. Mismatches are collected in `error` as
# [from_date, to_date, expected_value, value_found_on_from_date, title].
error = []
# Last day covered by fullset; used as the end of the window for the final
# record of each title, which has no following record.
last_day = fullset.index.max()
for title in economic_poland.title.unique():
    all_rows = economic_poland[economic_poland.title == title]
    for position, (_, row) in enumerate(all_rows.iterrows()):
        # Normalise to the 'YYYY-MM-DD' strings used by fullset's index
        # (the column can hold timestamps as well as already-formatted strings).
        now = pd.to_datetime(row['date']).strftime('%Y-%m-%d')
        if position + 1 < len(all_rows):
            next_date = pd.to_datetime(all_rows.iloc[position + 1]['date'])
            until = (next_date - pd.Timedelta(days=1)).strftime('%Y-%m-%d')
        else:
            # Previously a bare `except: pass` left `until` stale (or undefined)
            # for the last record of a title.
            until = last_day
        if now > until:
            now, until = until, now
        actual = row['actual']
        # The column lookup used an undefined name `indicator` (NameError on a
        # fresh kernel); fullset's columns are the titles, so `title` is used.
        if all(actual != fullset.loc[now:until][title]):
            error.append([now, until, actual, fullset.loc[now][title], title])

In [None]:
#Same left merge onto the full date range as for fullset, but without
#forward-filling or de-duplicating: only all-NaN columns are removed and
#'date' becomes the index
fullset2 = (
    pd.merge(new_df, merged, how='left', right_on='date', left_index=True, suffixes=("_x", None))
    .dropna(axis=1, how='all')
    .set_index('date')
)

In [None]:
# Inspect one indicator in the forward-filled table for March 2016.
fullset['Inflation Rate YoY Final']['2016-03-01':'2016-03-30']

In [None]:
# Show every row whose (date, title) combination still occurs more than once.
mask = economic_poland.duplicated(subset=['date', 'title'], keep=False)
economic_poland.loc[mask]

In [None]:
# Rows whose 'date' equals '2016-03-15'.
economic_poland[economic_poland['date'] == '2016-03-15']

In [None]:
# First 20 non-missing values of the indicator in the un-filled table.
fullset2['Inflation Rate YoY Final'].dropna()[0:20]

In [None]:
# Same March 2016 window as above, but from the un-filled table.
fullset2['Inflation Rate YoY Final']['2016-03-01':'2016-03-30']

In [None]:
#Forex data
# Load the Excel export of the forex quotes.
forex_df = pd.read_excel('/kaggle/input/forex-data-gatherer/FOREX_DATA.xlsx')

# Parse day-first date strings, then order newest first.
forex_df['Datetime'] = pd.to_datetime(forex_df['Datetime'], dayfirst=True)
# This line used to sort an undefined name `dataframe`, which raises NameError
# on a fresh kernel; the frame loaded above is the one to sort.
forex_df = forex_df.sort_values(by='Datetime', ascending=False)
# Use the formatted string (day-month-year, time, UTC offset) as the index.
forex_df['Datetime'] = forex_df["Datetime"].dt.strftime('%d-%m-%Y %H:%M:%S %z')
forex_df.set_index('Datetime', inplace=True)

In [None]:
# Attach the calendar columns to the forex rows, then drop duplicated rows and
# keep only the first row for each index label.
# `dataframe` was never defined in this notebook (NameError on a fresh kernel);
# the forex frame is named `forex_df`.
# NOTE(review): the join matches on index labels - confirm forex_df and
# calendar_df use the same index format, otherwise no rows will line up.
join_df = forex_df.join(calendar_df).drop_duplicates()
join_df = join_df[~join_df.index.duplicated(keep='first')]

In [None]:
# Look up the joined row labelled '28-03-2023 07:00:00 +0100'.
join_df.loc['28-03-2023 07:00:00 +0100']