# Data Preparation

##### Imports

In [16]:
import os
import pandas as pd
import numpy as np

# Import patterns DataFrame

In [17]:
# Load the pattern table from disk; the preview below shows one row per
# pattern occurrence with its ticker, pattern name and start/end dates.
pattern_df = pd.read_csv('data/patterns.csv')
# Show the first rows as a quick sanity check of the columns.
pattern_df.head()

Unnamed: 0,Ticker,Company,Pattern,Start Date,End Date,Width,API Start Date,API End Date,Breakout Date,Have Data
0,SNK,SANOK (SNK),h&s_top,2023-06-18,2023-08-15,58,2022-12-26,2024-02-05,,True
1,GTN,GETIN (GTN),vertical_run_up,2023-07-21,2023-08-03,13,2023-06-12,2023-09-11,,True
2,PEO,PEKAO (PEO),h&s_top,2023-07-14,2023-08-30,47,2023-02-23,2024-01-18,,True
3,SKH,SKARBIEC (SKH),rising_wedge,2023-07-01,2023-08-29,59,2023-01-05,2024-02-22,,True
4,SPL,SANPL (SPL),h&s_top,2023-07-05,2023-08-30,56,2023-01-18,2024-02-14,,True


In [18]:
# How many rows does each pattern label have?
pattern_df['Pattern'].value_counts()

Pattern
rising_wedge           651
double_top             496
double_bottom          392
uptrend                357
falling_wedge          317
triangle               205
vertical_run_up        203
downtrend              172
descending_triangle    120
ascending_triangle     113
h&s_top                108
support                 97
resistance              96
flat_base               95
ascending_channel       80
inverted_triangle       74
pennant                 63
h&s_bottom              50
descending_channel      49
flag                    28
horizontal_channel       4
Name: count, dtype: int64

# Prepare DataFrames

In [19]:
# Check how 'Start Date' was read from the CSV: a plain `str` here means the
# value has to be parsed with pd.to_datetime before any date arithmetic.
type(pattern_df['Start Date'].iat[0])

str

In [118]:
def extract_info(filename):
    """Fetch the pattern metadata that belongs to a pattern CSV file.

    The text before the first underscore of ``filename`` is parsed as an
    integer and used as a positional row number into the module-level
    ``pattern_df``.

    Parameters
    ----------
    filename : str
        File name whose prefix (up to the first ``_``) is an integer.

    Returns
    -------
    tuple or None
        ``(start_date, end_date, pattern)`` read from the ``'Start Date'``,
        ``'End Date'`` and ``'Pattern'`` fields of the selected row, or
        ``None`` when the selected row is empty.
    """
    prefix, *_ = filename.split('_')
    record = pattern_df.iloc[int(prefix)]

    # Guard clause: nothing to report for an empty row.
    if record.empty:
        return None

    return record['Start Date'], record['End Date'], record['Pattern']


def start_date(df):
    """Return the index label of the row on which the pattern starts.

    The pattern start is taken from the first entry of ``df['Start Date']``.
    If no row of ``df['Date']`` carries that date, the date is moved back one
    calendar day at a time until a row matches.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``'Date'`` column and a non-empty ``'Start Date'`` column.
        Both are passed through ``pd.to_datetime``, so datetimes and date
        strings are accepted.

    Returns
    -------
    label
        The label from ``df.index`` of the first row whose ``'Date'`` equals
        the (possibly moved-back) start date.

    Raises
    ------
    ValueError
        If the start date is missing (NaT), or if stepping back runs past
        the earliest value in ``'Date'`` without finding a match. Without
        these checks the search would never terminate normally.
    """
    requested = pd.to_datetime(df['Start Date'].iloc[0])
    if pd.isna(requested):
        # NaT minus a day is still NaT and never equals any date.
        raise ValueError("'Start Date' is missing (NaT); cannot locate the pattern start")

    # Convert once instead of re-comparing against a formatted string per step.
    dates = pd.to_datetime(df['Date'])
    earliest = dates.min()

    target = requested
    matches = dates == target
    while not matches.any():
        # Once the target is before the earliest row, no further step back
        # can ever match.
        if pd.isna(earliest) or target < earliest:
            raise ValueError(
                f"no row in 'Date' matches a day on or before the start date {requested}"
            )
        target -= pd.Timedelta(days=1)
        matches = dates == target

    return df.index[matches.to_numpy()][0]

def end_date(df):
    """Return the index label of the row on which the pattern ends.

    The pattern end is taken from the first entry of ``df['End Date']``.
    If no row of ``df['Date']`` carries that date, the date is moved forward
    one calendar day at a time until a row matches.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``'Date'`` column and a non-empty ``'End Date'`` column.
        Both are passed through ``pd.to_datetime``, so datetimes and date
        strings are accepted.

    Returns
    -------
    label
        The label from ``df.index`` of the first row whose ``'Date'`` equals
        the (possibly moved-forward) end date.

    Raises
    ------
    ValueError
        If the end date is missing (NaT), or if stepping forward runs past
        the latest value in ``'Date'`` without finding a match. Without
        these checks the search would never terminate normally.
    """
    requested = pd.to_datetime(df['End Date'].iloc[0])
    if pd.isna(requested):
        # NaT plus a day is still NaT and never equals any date.
        raise ValueError("'End Date' is missing (NaT); cannot locate the pattern end")

    # Convert once instead of re-comparing against a formatted string per step.
    dates = pd.to_datetime(df['Date'])
    latest = dates.max()

    target = requested
    matches = dates == target
    while not matches.any():
        # Once the target is after the latest row, no further step forward
        # can ever match.
        if pd.isna(latest) or target > latest:
            raise ValueError(
                f"no row in 'Date' matches a day on or after the end date {requested}"
            )
        target += pd.Timedelta(days=1)
        matches = dates == target

    return df.index[matches.to_numpy()][0]
    
    
def process_csv(filename):
    """Annotate one pattern CSV with its pattern window and label, then save it.

    Steps, in order:

    1. Look up the start date, end date and pattern name with
       ``extract_info(filename)``.
    2. Read ``data/patterns/<pattern>/<filename>``.
    3. Drop ``'Adjusted_close'`` and ``'Volume'``, but only when both columns
       are present.
    4. Add ``'Start Date'``, ``'End Date'`` and ``'Pattern'`` columns and
       parse ``'Date'``, ``'Start Date'`` and ``'End Date'`` as datetimes.
    5. Replace the pattern name by its numeric code; names that are not in
       the code table become NaN.
    6. Replace ``'Start Date'`` / ``'End Date'`` by the row index labels
       returned by ``start_date(df)`` / ``end_date(df)``.
    7. Write the frame back to the same path (overwriting the file that was
       read) and print a confirmation.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside its pattern directory.

    Returns
    -------
    pandas.DataFrame
        The processed frame that was written to disk.
    """
    first_day, last_day, pattern = extract_info(filename)

    path_to_file = f'data/patterns/{pattern}/{filename}'
    df = pd.read_csv(path_to_file)

    # These two columns are removed together or not at all.
    unused_cols = ['Adjusted_close', 'Volume']
    if all(col in df.columns for col in unused_cols):
        df = df.drop(columns=unused_cols)

    df['Start Date'] = first_day
    df['End Date'] = last_day
    df['Pattern'] = pattern

    for date_col in ('Date', 'Start Date', 'End Date'):
        df[date_col] = pd.to_datetime(df[date_col])

    # Numeric codes for the pattern names handled here.
    pattern_codes = {
        "rising_wedge": 1,
        "falling_wedge": 2,
        "double_top": 3,
        "double_bottom": 4
    }
    df['Pattern'] = df['Pattern'].map(pattern_codes)

    # From here on the two columns hold row index labels instead of dates.
    df['Start Date'] = start_date(df)
    df['End Date'] = end_date(df)

    df.to_csv(path_to_file, index=False)
    print(f'{path_to_file} saved')
    return df


### Convert dates to indices