# Data Sourcing

## Imports

In [39]:
from datetime import datetime
import os
import pandas as pd
import requests
from io import StringIO
import time

import plotly.graph_objects as go

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait

## Web Scraping with Selenium

To conduct pattern analysis, the first thing to acquire is graphs that show patterns.

In order to do this, we will use Selenium to scrape the Atmatix website (www.atmatix.pl)

### Functions

### Run scraping

### Create a DataFrame from scraped data

### Save DF

## DataFrame 

From this cell on, all work is complete in the Notebook

### Load Dataframe

In [2]:
df = pd.read_csv('data/patterns.csv')

### Inspecting Dataframe

In [3]:
display(f'The Database has {df.shape[0]} rows and {df.shape[1]} columns.')
display('==================================================================================================================')
display(f'The Database has the following columns: {list(df.columns)}')
display('==================================================================================================================')
display(f"The database is missing {df['Company'].isna().sum()} Company names. That is {((df['Company'].isna().sum() / df.shape[0]) * 100):.2f}%")
display('==================================================================================================================')
display(f"The database is missing {df['Pattern'].isna().sum()} Patterns. That is {((df['Pattern'].isna().sum() / df.shape[0]) * 100):.2f}%")
display('==================================================================================================================')
display(f"The database is missing {df['Width'].isna().sum()} Widths. That is {((df['Width'].isna().sum() / df.shape[0]) * 100):.2f}%")
display('==================================================================================================================')
display(f"The database is missing {df['Breakout Date'].isna().sum()} Breakout dates. That is {((df['Breakout Date'].isna().sum() / df.shape[0]) * 100):.2f}%")
display('==================================================================================================================')
display(f"The database is missing {df['End Date'].isna().sum()} End dates. That is {((df['End Date'].isna().sum() / df.shape[0]) * 100):.2f}%")

'The Database has 3770 rows and 5 columns.'



"The Database has the following columns: ['Company', 'Pattern', 'Width', 'End Date', 'Breakout Date']"



'The database is missing 0 Company names. That is 0.00%'



'The database is missing 0 Patterns. That is 0.00%'



'The database is missing 0 Widths. That is 0.00%'



'The database is missing 2046 Breakout dates. That is 54.27%'



'The database is missing 0 End dates. That is 0.00%'

### Adding a Ticker column

In [4]:
ticker_with_parentheses = df['Company'].str.extract(r'(?<=\()(\w+)(?=\))')
ticker_without_parentheses = df['Company'].str.extract(r'^(\w+)$')

df['Ticker'] = ticker_with_parentheses.fillna(ticker_without_parentheses)
df.head(3)

Unnamed: 0,Company,Pattern,Width,End Date,Breakout Date,Ticker
0,SANOK (SNK),H&S top,58,8/15/2023,,SNK
1,GETIN (GTN),Vertical run up,13,8/3/2023,,GTN
2,PEKAO (PEO),H&S top,47,8/30/2023,,PEO


### Add API Start Date and API End Date column

In [5]:
df['End Date'] = pd.to_datetime(df['End Date'])

df['API End Date'] = df.apply(lambda row: row['End Date'] + pd.DateOffset(days=row['Width'] * 3), axis=1)
df['API Start Date'] = df.apply(lambda row: row['End Date'] - pd.DateOffset(days=row['Width'] * 4), axis=1)


df.head(3)

Unnamed: 0,Company,Pattern,Width,End Date,Breakout Date,Ticker,API End Date,API Start Date
0,SANOK (SNK),H&S top,58,2023-08-15,,SNK,2024-02-05,2022-12-26
1,GETIN (GTN),Vertical run up,13,2023-08-03,,GTN,2023-09-11,2023-06-12
2,PEKAO (PEO),H&S top,47,2023-08-30,,PEO,2024-01-18,2023-02-23


In [6]:
df['API End Date'] = pd.to_datetime(df['API End Date'])
df['API Start Date'] = pd.to_datetime(df['API Start Date'])

## Using EOD HD API to retrieve stock information

## Inspecting New Data

### Add 'Have data' column

In [11]:
for index, row in df.iterrows():
    ticker = row.Ticker
    id_num = index
    pattern = '_'.join(word.lower() for word in row.Pattern.split())
    folder_name = f'data/patterns/{pattern}'
    
    f_name = f'{index}_{ticker}_{pattern}.csv'
    f_path = os.path.join(folder_name, f_name)
    
    if os.path.exists(f_path):
        df.at[index, 'Have data'] = True
    else:
        df.at[index, 'Have data'] = False
        
df.head(3)

Unnamed: 0,Company,Pattern,Width,End Date,Breakout Date,Ticker,API End Date,API Start Date,Have data
0,SANOK (SNK),H&S top,58,2023-08-15,,SNK,2024-02-05,2022-12-26,True
1,GETIN (GTN),Vertical run up,13,2023-08-03,,GTN,2023-09-11,2023-06-12,True
2,PEKAO (PEO),H&S top,47,2023-08-30,,PEO,2024-01-18,2023-02-23,True


### Missing data

In [12]:
missing_data = df[df['Have data'] == False]
display(f'There are {missing_data.shape[0]} rows of missing data')
display(f'There are {df.shape[0] - missing_data.shape[0]} completed DataFrames.')
display(missing_data.head())
display(missing_data['Ticker'].value_counts())

'There are 631 rows of missing data'

'There are 3139 completed DataFrames.'

Unnamed: 0,Company,Pattern,Width,End Date,Breakout Date,Ticker,API End Date,API Start Date,Have data
12,WIG-CHEMIA,Double bottom,60,2023-08-28,,,2024-02-24,2022-12-31,False
24,WIG-NRCHOM,Double top,11,2023-08-02,,,2023-09-04,2023-06-19,False
29,WIG-ENERG,H&S top,10,2023-08-30,,,2023-09-29,2023-07-21,False
44,WIG-POLAND,Flag,25,2023-08-11,,,2023-10-25,2023-05-03,False
48,WIG-NRCHOM,Pennant,38,2023-08-22,,,2023-12-14,2023-03-23,False


Ticker
WIGDIV        42
MWIG40        39
WIG20         35
FW20          30
WIG20SHORT    29
Name: count, dtype: int64

In [35]:
eod_api = '64ef583e9a7137.71172021'
eod_base = 'https://eodhistoricaldata.com/api/eod/'
exchange = 'INDX'

def time_to_string(date):
    return date.strftime('%Y-%m-%d')


for index, row in df.iterrows():
    if row['Have data'] == False:
        
        if pd.isna(row['Ticker']):
            symbol = row['Company']
        else:
            symbol = row['Ticker']
            
        from_date = time_to_string(row['API Start Date'])
        to_date = time_to_string(row['API End Date'])

        url = f'{eod_base}{symbol}.{exchange}?'

        params = {
            'api_token': eod_api,
            'period': 'd',
            'from': from_date,
            'to': to_date
        }

        response = requests.get(url, params=params)

        if response.status_code == 200:
            csv_content = response.text

            sub_df = pd.read_csv(StringIO(csv_content))

            pattern = row['Pattern'].replace(" ", "_").lower()
            dataframe_name = f"{index}_{symbol}_{pattern}.csv"


            sub_df.to_csv(dataframe_name, index=False)
            print(f"Saved DataFrame '{dataframe_name}'")
        else:
            print(f"API request for index {index} was not successful.")

Saved DataFrame '12_WIG-CHEMIA_double_bottom.csv'
Saved DataFrame '24_WIG-NRCHOM_double_top.csv'
Saved DataFrame '29_WIG-ENERG_h&s_top.csv'
Saved DataFrame '44_WIG-POLAND_flag.csv'
Saved DataFrame '48_WIG-NRCHOM_pennant.csv'
Saved DataFrame '50_WIG-POLAND_pennant.csv'
Saved DataFrame '57_WIG-POLAND_desc._broad._wedge.csv'
Saved DataFrame '58_WIG-POLAND_falling_wedge.csv'
Saved DataFrame '61_WIG-NRCHOM_triangle.csv'
Saved DataFrame '81_MWIG40_uptrend.csv'
Saved DataFrame '135_WIG-CHEMIA_falling_wedge.csv'
API request for index 144 was not successful.
Saved DataFrame '146_WIG-POLAND_descending_channel.csv'
API request for index 156 was not successful.
API request for index 157 was not successful.
Saved DataFrame '164_WIG20_3_rising_valleys.csv'
API request for index 180 was not successful.
Saved DataFrame '183_WIG-POLAND_flag.csv'
API request for index 189 was not successful.
API request for index 191 was not successful.
API request for index 203 was not successful.
API request for index

API request for index 1070 was not successful.
API request for index 1073 was not successful.
Saved DataFrame '1076_WIGDIV_double_top.csv'
Saved DataFrame '1078_MWIG40_double_top.csv'
API request for index 1082 was not successful.
API request for index 1103 was not successful.
API request for index 1104 was not successful.
API request for index 1105 was not successful.
API request for index 1111 was not successful.
API request for index 1132 was not successful.
API request for index 1140 was not successful.
Saved DataFrame '1150_WIG-NRCHOM_double_top.csv'
Saved DataFrame '1164_WIG-NRCHOM_rising_wedge.csv'
API request for index 1168 was not successful.
API request for index 1174 was not successful.
Saved DataFrame '1177_MWIG40_double_top.csv'
API request for index 1178 was not successful.
API request for index 1181 was not successful.
API request for index 1182 was not successful.
API request for index 1183 was not successful.
API request for index 1186 was not successful.
API request f

Saved DataFrame '2113_WIG-NRCHOM_rising_wedge.csv'
Saved DataFrame '2115_WIGDIV_asc._broad._wedge.csv'
Saved DataFrame '2124_WIG-NRCHOM_asc._broad._wedge.csv'
API request for index 2127 was not successful.
Saved DataFrame '2132_WIG-NRCHOM_asc._broad._wedge.csv'
Saved DataFrame '2133_WIGDIV_double_top.csv'
Saved DataFrame '2135_WIG-NRCHOM_ascending_channel.csv'
API request for index 2137 was not successful.
API request for index 2138 was not successful.
API request for index 2146 was not successful.
API request for index 2151 was not successful.
API request for index 2153 was not successful.
API request for index 2154 was not successful.
Saved DataFrame '2155_WIG-NRCHOM_double_top.csv'
API request for index 2157 was not successful.
API request for index 2158 was not successful.
Saved DataFrame '2166_WIG-NRCHOM_ascending_channel.csv'
Saved DataFrame '2179_WIG-ENERG_downtrend.csv'
API request for index 2187 was not successful.
Saved DataFrame '2232_WIG-NRCHOM_3_rising_valleys.csv'
Saved D

API request for index 3208 was not successful.
API request for index 3209 was not successful.
Saved DataFrame '3214_WIGDIV_double_top.csv'
Saved DataFrame '3223_WIG-POLAND_pennant.csv'
API request for index 3228 was not successful.
API request for index 3232 was not successful.
Saved DataFrame '3234_WIG-POLAND_double_top.csv'
Saved DataFrame '3236_WIG-POLAND_double_top.csv'
Saved DataFrame '3237_WIG20SHORT_double_bottom.csv'
Saved DataFrame '3238_WIG-POLAND_asc._broad._wedge.csv'
Saved DataFrame '3240_WIG-POLAND_rising_wedge.csv'
API request for index 3247 was not successful.
API request for index 3251 was not successful.
API request for index 3252 was not successful.
API request for index 3254 was not successful.
API request for index 3256 was not successful.
Saved DataFrame '3257_WIG-ENERG_falling_wedge.csv'
Saved DataFrame '3258_WIG20_vertical_run_up.csv'
API request for index 3260 was not successful.
API request for index 3262 was not successful.
Saved DataFrame '3263_WIG20_rising_

In [40]:
for index, row in df.iterrows():
    ticker = row.Ticker
    id_num = index
    pattern = '_'.join(word.lower() for word in row.Pattern.split())
    folder_name = f'data/patterns/{pattern}'
    
    f_name = f'{index}_{ticker}_{pattern}.csv'
    f_path = os.path.join(folder_name, f_name)
    
    if os.path.exists(f_path):
        df.at[index, 'Have data'] = True
    else:
        df.at[index, 'Have data'] = False
        
df.head(3)

Unnamed: 0,Company,Pattern,Width,End Date,Breakout Date,Ticker,API End Date,API Start Date,Have data
0,SANOK (SNK),H&S top,58,2023-08-15,,SNK,2024-02-05,2022-12-26,True
1,GETIN (GTN),Vertical run up,13,2023-08-03,,GTN,2023-09-11,2023-06-12,True
2,PEKAO (PEO),H&S top,47,2023-08-30,,PEO,2024-01-18,2023-02-23,True


In [41]:
missing_data = df[df['Have data'] == False]
display(f'There are {missing_data.shape[0]} rows of missing data')
display(f'There are {df.shape[0] - missing_data.shape[0]} completed DataFrames.')
display(missing_data.head())
display(missing_data['Ticker'].value_counts())

'There are 486 rows of missing data'

'There are 3284 completed DataFrames.'

Unnamed: 0,Company,Pattern,Width,End Date,Breakout Date,Ticker,API End Date,API Start Date,Have data
12,WIG-CHEMIA,Double bottom,60,2023-08-28,,,2024-02-24,2022-12-31,False
24,WIG-NRCHOM,Double top,11,2023-08-02,,,2023-09-04,2023-06-19,False
29,WIG-ENERG,H&S top,10,2023-08-30,,,2023-09-29,2023-07-21,False
44,WIG-POLAND,Flag,25,2023-08-11,,,2023-10-25,2023-05-03,False
48,WIG-NRCHOM,Pennant,38,2023-08-22,,,2023-12-14,2023-03-23,False


Ticker
FW20    30
Name: count, dtype: int64

## Visualising Patterns

In [46]:
h_and_s_top = pd.read_csv('data/patterns/h&s_top/0_SNK_h&s_top.csv')

fig = go.Figure(data=[go.Candlestick(x=h_and_s_top['Date'],
                open=h_and_s_top['Open'],
                high=h_and_s_top['High'],
                low=h_and_s_top['Low'],
                close=h_and_s_top['Close'])])

fig.show()