# SP500 tickers

In [None]:
# tickers of SP500
# install a linter
# Ticker obtained from public source: https://www.slickcharts.com/sp500; accessed on 4 December 2023
# This ticker list might change in the future. How can I scrape the data automatircally to compare it is still valid?

# Decoding Market Signals: Candlestick patterns

In [None]:
https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3247865

## Third party libraries

In [21]:
import os
import numpy as np
import pandas as pd

import talib 

In [22]:
pd.set_option('display.max_columns', None)

In [105]:
# Include reference: Scientific Guide To Price Action and Pattern Trading Wisdom of Trend, Cycle, and Fractal Wave (Young Ho Seo)

![cs_anatomy](./figures/cs_anatomy.png)

![cs_prediction](./figures/cs_prediction.png)

# Obtaining high-quality price data for stocks

Imperial College offers access to the Wharton Research Data Dervice (WRDS). One can either use the computers in the library or apply for an account. Please see the following link for details:
https://www.imperial.ac.uk/admin-services/library/subject-support/business/wrds/

In case we have login credentials for WRDS, we can submit queries to their data base via
https://wrds-www.wharton.upenn.edu/pages/get-data/center-research-security-prices-crsp/

Now, from within WRDS, we can access stock market data from the Center for Research in Security Analysis (CRSP). WRDS in general and CRSP in particular offer a wide range of financial data, as can be seen from the below figure. For our project, we are explicitely interested in stock market data of S&P 500 components in daily frequency. 

![Wharton WRDS CRSP Data](./figures/Wharton_WRDS_CRSP_data.png)

# Loading and pre-processing the data

In [23]:
# what columns are available from the Wharton CRSP data?
pd.read_csv('./data/MSFT.csv').columns

Index(['PERMNO', 'date', 'NAMEENDT', 'SHRCD', 'EXCHCD', 'SICCD', 'NCUSIP',
       'TICKER', 'COMNAM', 'SHRCLS', 'TSYMBOL', 'NAICS', 'PRIMEXCH', 'TRDSTAT',
       'SECSTAT', 'PERMCO', 'ISSUNO', 'HEXCD', 'HSICCD', 'CUSIP', 'DCLRDT',
       'DLAMT', 'DLPDT', 'DLSTCD', 'NEXTDT', 'PAYDT', 'RCRDDT', 'SHRFLG',
       'HSICMG', 'HSICIG', 'DISTCD', 'DIVAMT', 'FACPR', 'FACSHR', 'ACPERM',
       'ACCOMP', 'SHRENDDT', 'NWPERM', 'DLRETX', 'DLPRC', 'DLRET', 'TRTSCD',
       'NMSIND', 'MMCNT', 'NSDINX', 'BIDLO', 'ASKHI', 'PRC', 'VOL', 'RET',
       'BID', 'ASK', 'SHROUT', 'CFACPR', 'CFACSHR', 'OPENPRC', 'NUMTRD',
       'RETX', 'vwretd', 'vwretx', 'ewretd', 'ewretx', 'sprtrn'],
      dtype='object')

We can make use of a technique called `method chaining` to pipe multiple processing steps within Pandas. This offers two advantages
 - We start from the raw data as input statement initially. In case the data is small enough to be processed in-memory, we do not require to store intermediary results on disk which confuses transparency and is less efficient.
 - Increased code readability and transparency for debugging.

In [82]:
df = (
    pd.read_csv('./data/MSFT.csv', usecols=['date', 'OPENPRC', 'ASKHI', 'BIDLO', 'PRC', 'VOL'])
    .rename(columns={'OPENPRC': 'open', 'ASKHI': 'high', 'BIDLO': 'low', 'PRC': 'close', 'VOL':'vol'})
    .dropna()
    .reset_index()
    .assign(date=lambda df: pd.to_datetime(df['date'], format='%Y-%m-%d'),
            vol=lambda df: df['vol'].astype(int),
            intraday_return=lambda df: (df['close'] - df['open']) / df['open'],
            sign_intraday_return=lambda df: np.sign(df['intraday_return']).astype(int))
    [['date', 'open', 'high', 'low', 'close', 'vol', 'intraday_return', 'sign_intraday_return']]
#     .set_index('date')
)

df

Unnamed: 0,date,open,high,low,close,vol,intraday_return,sign_intraday_return
0,1992-06-15,75.00000,78.00000,75.00000,75.75000,1707931,0.010000,1
1,1992-06-16,75.50000,76.50000,72.50000,72.75000,1800280,-0.036424,-1
2,1992-06-17,72.50000,73.25000,71.50000,72.37500,2196679,-0.001724,-1
3,1992-06-18,73.00000,74.75000,71.00000,72.25000,1706769,-0.010274,-1
4,1992-06-19,73.50000,74.25000,72.75000,73.50000,1350828,0.000000,0
...,...,...,...,...,...,...,...,...
7685,2022-12-23,236.11000,238.87000,233.94279,238.73000,21287448,0.011097,1
7686,2022-12-27,238.70000,238.92999,235.83000,236.96001,16666893,-0.007289,-1
7687,2022-12-28,236.89000,239.72000,234.17000,234.53000,17447853,-0.009962,-1
7688,2022-12-29,235.64999,241.92000,235.64999,241.00999,19727159,0.022746,1


In [83]:
df['previous_day_return'] = df['sign_intraday_return'].shift(+1)#.dropna().astype(int)
df['log_return'] = np.log(df['close'] / df['close'].shift(-1))
df['5_days_mean_return'] = df['log_return'].rolling(window=5, min_periods=1).mean().shift(-5)  # Idea: Compute 5 days average ahead returns one day after the signal. using pd.rolling and pd.shift.
df = df.dropna()
# df['previous_day_return'] = df['previous_day_return'].astype(int)
df.reset_index(drop=True, inplace=True)  

In [85]:
df.head()

Unnamed: 0,date,open,high,low,close,vol,intraday_return,sign_intraday_return,previous_day_return,log_return,5_days_mean_return
0,1992-06-16,75.5,76.5,72.5,72.75,1800280,-0.036424,-1,1.0,0.005168,0.000346
1,1992-06-17,72.5,73.25,71.5,72.375,2196679,-0.001724,-1,-1.0,0.001729,0.006327
2,1992-06-18,73.0,74.75,71.0,72.25,1706769,-0.010274,-1,-1.0,-0.017153,0.019266
3,1992-06-19,73.5,74.25,72.75,73.5,1350828,0.0,0,-1.0,-0.010152,0.005461
4,1992-06-22,73.25,74.5,71.5,74.25,1709631,0.013652,1,0.0,0.010152,0.009758


# What candlestick patterns are available?

In [86]:
candle_names = talib.get_function_groups()['Pattern Recognition']
list_of_results_df = []
# for ListingId in df['ListingId'].unique():
#     equity_df = df.loc[df['ListingId']==ListingId].reset_index()
    
for signal in candle_names:
    tds = df.copy()
    tds['cs_pattern'] = getattr(talib, signal)(df['open'], df['high'], df['low'], df['close'])
    tds['signal'] = signal
    list_of_results_df.append(tds.set_index(['signal','date']))
        
result_df = pd.concat(list_of_results_df)

In [107]:
result_df

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,vol,intraday_return,sign_intraday_return,previous_day_return,log_return,5_days_mean_return,cs_pattern
signal,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CDL2CROWS,1992-06-16,75.50000,76.50000,72.50000,72.75000,1800280,-0.036424,-1,1.0,0.005168,0.000346,0
CDL2CROWS,1992-06-17,72.50000,73.25000,71.50000,72.37500,2196679,-0.001724,-1,-1.0,0.001729,0.006327,0
CDL2CROWS,1992-06-18,73.00000,74.75000,71.00000,72.25000,1706769,-0.010274,-1,-1.0,-0.017153,0.019266,0
CDL2CROWS,1992-06-19,73.50000,74.25000,72.75000,73.50000,1350828,0.000000,0,-1.0,-0.010152,0.005461,0
CDL2CROWS,1992-06-22,73.25000,74.50000,71.50000,74.25000,1709631,0.013652,1,0.0,0.010152,0.009758,0
...,...,...,...,...,...,...,...,...,...,...,...,...
CDLXSIDEGAP3METHODS,2022-12-16,248.55000,249.84000,243.50999,244.69000,94651273,-0.015530,-1,-1.0,0.017480,0.002924,0
CDLXSIDEGAP3METHODS,2022-12-19,244.86000,245.21001,238.71001,240.45000,29634558,-0.018010,-1,-1.0,-0.005599,0.006105,0
CDLXSIDEGAP3METHODS,2022-12-20,239.39999,242.91000,238.42000,241.80000,25126887,0.010025,1,-1.0,-0.010818,0.002818,0
CDLXSIDEGAP3METHODS,2022-12-21,241.69000,245.61501,240.11000,244.42999,23639790,0.011337,1,1.0,0.025860,-0.001364,0


## How to visually inspect the signals

In [100]:
import plotly.graph_objects as go

def plot_cs_chart(df: pd.DataFrame, ListingId=None) -> go.Figure:
    if ListingId is None:
        fig = go.Figure(data=[go.Candlestick(x=df.index,
                    open=df['open'],
                    high=df['high'],
                    low=df['low'],
                    close=df['close']
                    )])

        fig.update_layout(
            font=dict(size=18),
            legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99,font=dict(size= 20)),
            title = 'Evolution of price chart diplayed as candlestick chart',
            xaxis_title='time [days]',
            yaxis_title="price",
            autosize=False,
            width=800,
            height=800
            )

        fig.show()
    
    elif ListingId is not None:
        fig = go.Figure(data=[go.Candlestick(x=df['Date'][df['ListingId']==ListingId],
                        open=df['open'][df['ListingId']==ListingId],
                        high=df['high'][df['ListingId']==ListingId],
                        low=df['low'][df['ListingId']==ListingId],
                        close=df['close'][df['ListingId']==ListingId]
                        )])

        fig.update_layout(
            font=dict(size=18),
            legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99,font=dict(size= 20)),
            title = 'Evolution of price chart diplayed as candlestick chart',
            xaxis_title='time [days]',
            yaxis_title="price",
            autosize=False,
            width=800,
            height=800
            )

        fig.show()
        
    else:
        print('Do not understand plotting instruction. Check data frame and ListingId')

In [102]:
plot_cs_chart(result_df.loc['CDLDOJI'].reset_index().set_index('date')[:20])

In [98]:

TP  FP
FN  TN


TPR = TP / (TP+TN) 

Accuracy = ... > 50%
Precision = ...

In [None]:
# Account curve 

compare account curve to a benchmark
cumprod(predictio_t * return_t )

benachmark: SP500 

In [None]:
Logistic regression 

In [None]:
151 alphas
https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3247865

In [None]:
45 hours my time;  
28 hours with software engineering 

Until 15 December: Get stuff on github.
Meeting with Jay: 2pm-3pm

James is available on Teams; working hours: 9:00-5:00. 
TODO: Set up a timeline with goals and achievements each week; get code as soon as possible to gitbug to use branches, pull requests, etc.

What James can help with:
Priority: Woring in branches on github:
Pull-Requests on github (Python package index; upload .ipynb to HPC)

2) Modularize the code using classes/OOP/Design Patterns.

3) Writing Unit-Test

4)CI test suits

5) Storage of data (backend)

use black as a linter
include literature references

# Performance evaluation using confusion matrix

In [None]:
doji_df = result_df.loc['CDLDOJI']
doji_df.head()

# conditions for classical doji
doji_df['TruePositive'] = np.where((doji_df['cs_pattern']>0) & (doji_df['previous_day_return']>0) & (doji_df['close'].shift(+1)<(doji_df['close'])) & (doji_df['NextDayOpen']<doji_df['Close|Executed']) & (doji_df['NextDayReturn']<0), 1,
                            np.where((doji_df['cs_pattern']>0) & (doji_df['previous_day_return']<0) & (doji_df['close'].shift(+1)>(doji_df['close'])) &(doji_df['NextDayOpen']>doji_df['Close|Executed']) & (doji_df['NextDayReturn']>0), 1,
                            0))

doji_df['FalsePositive'] = np.where((doji_df['cs_pattern']>0) & (doji_df['previous_day_return']>0) & (doji_df['close'].shift(+1)<(doji_df['close'])) & (doji_df['NextDayOpen']<doji_df['Close|Executed']) & (doji_df['NextDayReturn']>0), 1,
                            np.where((doji_df['cs_pattern']>0) & (doji_df['previous_day_return']<0) & (doji_df['close'].shift(+1)>(doji_df['close'])) &(doji_df['NextDayOpen']>doji_df['Close|Executed']) & (doji_df['NextDayReturn']<0), 1,
                            0))

doji_df['TruePositive5'] = np.where((doji_df['cs_pattern']>0) & (doji_df['previous_day_return']>0) & (doji_df['close'].shift(+1)<(doji_df['close'])) & (doji_df['NextDayOpen']<doji_df['Close|Executed']) & (doji_df['5DaysMeanReturn']<0), 1,
                            np.where((doji_df['cs_pattern']>0) & (doji_df['previous_day_return']<0) & (doji_df['close'].shift(+1)>(doji_df['close'])) &(doji_df['NextDayOpen']>doji_df['Close|Executed']) & (doji_df['5DaysMeanReturn']>0), 1,
                            0))

doji_df['FalsePositive5'] = np.where((doji_df['cs_pattern']>0) & (doji_df['previous_day_return']>0) & (doji_df['close'].shift(+1)<(doji_df['close'])) & (doji_df['NextDayOpen']<doji_df['Close|Executed']) & (doji_df['5DaysMeanReturn']>0), 1,
                            np.where((doji_df['cs_pattern']>0) & (doji_df['previous_day_return']<0) & (doji_df['close'].shift(+1)>(doji_df['close'])) &(doji_df['NextDayOpen']>doji_df['Close|Executed']) & (doji_df['5DaysMeanReturn']<0), 1,
                            0))

doji_df['Signal'] = 'doji'

# Appendix

In [None]:
import talib

def compute_intraday_return(row):
    row['positive_intraday'] = np.where((row['Close|Executed'] - row['Open|Executed']) > 0, True, False)
    return row

cs_performance_dict = dict()

for ListingId in df['ListingId'].unique():
    equity_df = pd.DataFrame(index=df[df['ListingId']==ListingId]['Date'], 
                                data=df[df['ListingId']==ListingId][['Open|Executed', 'High|Executed', 'Low|Executed', 'Close|Executed']].values, 
                                columns = df[df['ListingId']==ListingId][['Open|Executed', 'High|Executed', 'Low|Executed', 'Close|Executed']].columns
                            )
    equity_df = equity_df.apply(compute_intraday_return, axis='columns')
    
    results = []
    cols = []
    for attr in dir(talib):
        if attr[:3]=='CDL':
            res = getattr(talib, attr)(equity_df['Open|Executed'], equity_df['High|Executed'], equity_df['Low|Executed'], equity_df['Close|Executed'])
            results.append(res)
            cols.append(attr)

    patterns = pd.DataFrame(results).T
    patterns.columns = cols
    
    signal_df = patterns[(patterns.select_dtypes(include=['number']) != 0).any(1)]  # alternative: patterns.loc[(patterns.loc[:, patterns.dtypes != object] != 0).any(1)]
    
    signal_and_return_df = signal_df.sum(axis=1).rename('cumulative_signal').to_frame().merge(equity_df['positive_intraday'].shift(-1), left_index=True, right_index=True)[:-1]  # dont count the last as its return will be NaN
    signal_and_return_df = signal_and_return_df.loc[signal_and_return_df['cumulative_signal'] != 0]
    signal_and_return_df['performance'] = np.where((signal_and_return_df['cumulative_signal']>0) & (signal_and_return_df['positive_intraday']==True), True,
                                                   np.where((signal_and_return_df['cumulative_signal']<0) & (signal_and_return_df['positive_intraday']==False), True, False))
    
    TP = signal_and_return_df['performance'].sum() / (signal_and_return_df['performance'].shape[0] & Where the signaal occurs )  #TODO modify the ratio of TP/FP
    FP = 1-TP                                                                                      # 
    
    cs_performance_dict[ListingId] = {'TP_rate':round(TP, 4), 'FP_rate':round(FP, 4)}