In [1]:
# Instalations
!python -m pip install --upgrade pip
!pip install yfinance==0.2.59
!pip install curl-cffi
#!pip install yfinance

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting yfinance==0.2.59
  Downloading yfinance-0.2.59-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting protobuf<6,>=5.29.0 (from yfinance==0.2.59)
  Downloading protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading yfinance-0.2.59-py2.py3-none-any.whl (117 kB)
Downloading protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
Installing collected packages: protobuf, yfinance
[2K  Attempting uninstall: protobuf
[2K    Found existing installation: protobuf 3.20.3
[2K    Uninstalling p

In [2]:
# Imports
import yfinance as yf
import pandas as pd
import numpy as np
import requests
import warnings


from tqdm.auto import tqdm
from curl_cffi import requests
from kaggle_secrets import UserSecretsClient

# load secret key
user_secrets = UserSecretsClient()
forex_api = user_secrets.get_secret("Forex data API")

# disable warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Forex data

In [3]:
class ForexDataHandler:
    def __init__(self, main_currencies=['PLN', 'EUR'], additional_currencies=['CZK', 'HUF', 'USD', 'CHF', 'GBP', 'JPY']):
        self.main_currencies = main_currencies
        self.additional_currencies = additional_currencies
        self.session = requests.Session(impersonate="chrome", timeout=5)

    def download_data(self, cur_1, cur_2, t_period='5d', t_interval='1m'):
        symbol = f'{cur_1}{cur_2}=X'
        data = yf.Ticker(symbol, session=self.session)
        f_data = data.history(period=t_period, interval=t_interval)

        mask = f_data.ne(0).any(axis=0)
        f_data = f_data.loc[:, mask]

        f_data.reset_index(inplace=True)
        f_data.rename(columns={
            'Datetime': 'timestamp',
            'Open': f'{cur_1}{cur_2}_OPEN',
            'High': f'{cur_1}{cur_2}_HIGH',
            'Low': f'{cur_1}{cur_2}_LOW',
            'Close': f'{cur_1}{cur_2}_CLOSE'
        }, inplace=True)

        f_data['timestamp'] = f_data['timestamp'].apply(lambda x: x.timestamp()).astype(int)
        f_data.set_index('timestamp', inplace=True)
        return f_data

    def update_forex_data(self, old_data_path, save_path='forex_data.feather'):
        old_data = pd.read_feather(old_data_path)
        old_data.set_index('timestamp', inplace=True)

        forex_data = self.download_data('EUR', 'PLN')

        for main in self.main_currencies:
            for add in self.additional_currencies:
                if main == add:
                    continue
                try:
                    temp_data = self.download_data(main, add)
                    forex_data = forex_data.join(temp_data)
                except Exception as e:
                    print(f'Error: {e}, cur1: {main}, cur2:{add}')

        forex_data = pd.concat([forex_data, old_data])

        forex_data.reset_index(inplace=True)
        forex_data.drop_duplicates(subset=['timestamp'], inplace=True)
        forex_data.sort_index(inplace=True)
        
        forex_data.to_feather(save_path)

In [4]:
forex_path = '/kaggle/input/forex-data-downloader/forex_data.feather'

handler = ForexDataHandler()
handler.update_forex_data(forex_path)

Error: 'timestamp', cur1: PLN, cur2:HUF


# Economic calendar

In [None]:
class EconomicDataHandler:
    def __init__(
        self,
        start_date: str = "2020-01-01",
        end_date: pd.Timestamp = pd.Timestamp.today().normalize(),
        chunk_days: int = 30,
        save_path: str = None
    ):
        self.start_date = pd.to_datetime(start_date)
        self.end_date = end_date
        self.chunk_delta = pd.Timedelta(days=chunk_days)
        self.save_path = save_path
        self.date_ranges = self._generate_date_ranges()
        self.clean_chunks = []

    def _generate_date_ranges(self):
        # Split the full interval into successive (start, end) pairs
        ranges = []
        curr = self.start_date
        
        while curr <= self.end_date:
            end = min(curr + self.chunk_delta, self.end_date)
            ranges.append((curr, end))
            curr = curr + self.chunk_delta

        return ranges

    def _fetch_range(self, start: pd.Timestamp, to: pd.Timestamp) -> pd.DataFrame:
        # Download raw events JSON and convert to DataFrame.
        url = "https://economic-calendar.tradingview.com/events"
        params = {
            "from": start.date().isoformat(), 
            "to": to.date().isoformat()
            }
        headers = {"Origin": "https://www.tradingview.com"}

        resp = requests.get(url, headers=headers, params=params)
        resp.raise_for_status()
        data = resp.json().get("result", [])

        return pd.DataFrame(data)

    def _clean_df(self, df: pd.DataFrame) -> pd.DataFrame:

        # Fill missing values in referenceDate and create new "Timestamp" column
        df["referenceDate"] = df["referenceDate"].fillna(df["date"])
        df['referenceDate'] = pd.to_datetime(df["referenceDate"], format='mixed', yearfirst=True)
        df['timestamp'] = df['referenceDate'].apply(lambda x: x.timestamp()).astype(int)

        # Drop unimportant columns, duplicates and rows without crucial data
        calendar_drop = ['id', 'period', 'source', 'ticker', 'scale', 'category',
                            'actualRaw', 'previousRaw', 'forecastRaw', 'source_url'
                        ]
        df.drop(calendar_drop, axis=1, inplace=True)
        df.dropna(subset=['actual'], inplace=True)
        df.drop_duplicates(subset=['title', 'date', 'indicator', 'country', 'referenceDate', 'actual'], inplace=True, keep='last')

        return df

    def download(self) -> pd.DataFrame:
        # Main entry: fetch all data, clean and save if requested
        for start, to in tqdm(self.date_ranges, desc="Downloading chunks"):
            try:
                raw = self._fetch_range(start, to)
                clean = self._clean_df(raw)
                self.clean_chunks.append(clean)
            except Exception as e:
                print(f"Error fetching {start.date()}→{to.date()}: {e}")

        data = pd.concat(self.clean_chunks, ignore_index=True)
        data.reset_index(inplace=True, drop=True)

        if self.save_path:
            data.to_feather(self.save_path)
        return data

In [None]:
Economic_handler = EconomicDataHandler()
economic_data = Economic_handler.download()

# EDA

In [None]:
forex_data = pd.read_feather('/kaggle/input/forex-data-gatherer/forex_data.feather')

In [None]:
#Economic calendar d.aggregateta
economic_df = pd.read_feather('/kaggle/input/economic-calendar-data/economic_data.feather')

In [None]:
economic_poland = economic_df[economic_df['country']=='PL'].copy()
economic_poland.reset_index(inplace=True)
economic_poland.drop('index', inplace=True, axis=1)

In [None]:
# fix some data(sometimes records from previous months are saved next month as duplicates)
economic_poland = economic_df[economic_df['country']=='PL'].copy()

mask = economic_poland.duplicated(subset=['date', 'title'], keep=False)
for idx, [index, row] in enumerate(economic_poland.loc[mask].iterrows()):
    try:
        pair = economic_poland.loc[mask].iloc[idx+1]
        if all(row[['date', 'title']] == pair[['date', 'title']]):
            if row['actual'] == pair['previous']:
                new_date = pd.to_datetime(row['date'])
                time_diff = pd.to_datetime(economic_poland[economic_poland['title'] == row['title']]['date']).diff().dt.days.median()
                new_date = new_date - pd.Timedelta(days=time_diff)
                economic_poland.loc[[index], ['date']] = new_date.strftime('%Y-%m-%d')
            else:
                new_date = pd.to_datetime(pair['date'])
                time_diff = pd.to_datetime(economic_poland[economic_poland['title'] == pair['title']]['date']).diff().dt.days.median()
                new_date = new_date - pd.Timedelta(days=time_diff)
                economic_poland.loc[[pair.name], ['date']] = new_date.strftime('%Y-%m-%d')
    except Exception as e:
        continue
        #print(e)

In [None]:
#Create new df with continues range of dates and all indicator values in any day

df_pivot = economic_poland.pivot(columns='title', values='actual')

#merge tables to update dates
merged = df_pivot.merge(economic_poland, left_index=True, right_index=True)
merged.drop(['title', 'country', 'indicator', 'comment','actual', 'previous', 'forecast', 'importance'], axis=1, inplace=True)
merged.reset_index(inplace=True)

#create continues dates from oldest to newest 
idx = pd.date_range(merged.date.min(), merged.date.max())
idx = idx.strftime('%Y-%m-%d')

#create new dataframe with full set of date range
new_df = pd.DataFrame(index=idx, columns=merged.columns)
new_df.drop('date', axis=1, inplace=True)

# change date column to str to be merge
merged['date'] = merged['date'].apply(lambda x: x.strftime('%Y-%m-%d'))

#merge both dataframes table with values in full set of ranges
fullset = pd.merge(new_df, merged, how='left', right_on='date', left_index=True, suffixes=("_x", None))
fullset.dropna(axis=1, how='all', inplace=True)

#fill all nans with values from previous rows(newest)
fullset = fullset.ffill()

#drop duplicates
fullset.drop_duplicates(subset=['date'], inplace=True, keep='last')
fullset.set_index('date', inplace=True)
fullset.drop('index', inplace=True, axis=1)

#fill rest of nan values(oldest data) with oldest 'previous' value from main df - check if it is not bettter to leave nan
for col in fullset.columns:
    if pd.isna(fullset[col].iloc[0]):
        value = economic_poland.loc[economic_poland['title'] == col].iloc[0]['previous']
        fullset[col] = fullset[col].fillna(value)
        
#check if last row in our dataframe is correct
test_df = pd.DataFrame()
for title in economic_poland['title'].unique():
    test_df[title] = [economic_poland[economic_poland["title"]==title].iloc[-1]["actual"]]
    
test_true = fullset.drop('timestamp', axis=1).iloc[-1] == test_df
print(test_true.iloc[0].unique()) #it should only contain "True" values

In [None]:
fullset

In [None]:
error = []
for title in economic_poland.title.unique():
    all_rows = economic_poland[economic_poland.title == title]
    for index, [idx, row] in enumerate(all_rows.iterrows()):
        try:
            now = row['date']
            until = pd.to_datetime(all_rows.iloc[index+1]['date'])
            until = until - pd.Timedelta(days=1)
            until = until.strftime('%Y-%m-%d')
        except:
            pass
        if now > until:
            temp1 = now
            now = until
            until = temp1
        actual = row['actual']
        if all(actual != fullset.loc[now:until][indicator]):
            error.append([now, until, actual, fullset.loc[now][indicator], indicator])

In [None]:
#merge both dataframes table with values in full set of ranges
fullset2 = pd.merge(new_df, merged, how='left', right_on='date', left_index=True, suffixes=("_x", None))
fullset2.dropna(axis=1, how='all', inplace=True)
fullset2.set_index('date', inplace=True)

In [None]:
fullset['Inflation Rate YoY Final']['2016-03-01':'2016-03-30']

In [None]:
mask = economic_poland.duplicated(subset=['date', 'title'], keep=False)
economic_poland.loc[mask]

In [None]:
economic_poland[economic_poland['date'] == '2016-03-15']

In [None]:
fullset2['Inflation Rate YoY Final'].dropna()[0:20]

In [None]:
fullset2['Inflation Rate YoY Final']['2016-03-01':'2016-03-30']

In [None]:
#Forex data
forex_df = pd.read_excel('/kaggle/input/forex-data-gatherer/FOREX_DATA.xlsx')

forex_df['Datetime'] = pd.to_datetime(forex_df['Datetime'], dayfirst=True)
forex_df = dataframe.sort_values(by='Datetime', ascending=False)
forex_df['Datetime'] = forex_df["Datetime"].dt.strftime('%d-%m-%Y %H:%M:%S %z')
forex_df.set_index('Datetime', inplace=True)

In [None]:
join_df = dataframe.join(calendar_df).drop_duplicates()
join_df = join_df[~join_df.index.duplicated(keep='first')]

In [None]:
join_df.loc['28-03-2023 07:00:00 +0100']