In [3]:
# IMPORTS
import numpy as np
import pandas as pd

#Fin Data Sources
import yfinance as yf
import pandas_datareader as pdr

#Data viz
import plotly.graph_objs as go
import plotly.graph_objects as go
import plotly.express as px

import time
from datetime import date

# for graphs
import matplotlib.pyplot as plt

In [1]:
!pip uninstall gdown -y && pip install gdown
!gdown -V

Found existing installation: gdown 5.2.0
Uninstalling gdown-5.2.0:
  Successfully uninstalled gdown-5.2.0
Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Using cached gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0
gdown 5.2.0 at /home/hrithik/anaconda3/lib/python3.11/site-packages


In [4]:
!gdown https://drive.google.com/file/d/1kNWWPi49td0EZhmi6LzNCa2ssC5IUxHP/view?usp=sharing --fuzzy -O ./content/

Downloading...
From (original): https://drive.google.com/uc?id=1kNWWPi49td0EZhmi6LzNCa2ssC5IUxHP
From (redirected): https://drive.google.com/uc?id=1kNWWPi49td0EZhmi6LzNCa2ssC5IUxHP&confirm=t&uuid=811a8a00-68f0-4a4d-b041-eaa8b4b9ae3d
To: /home/hrithik/Hrithik-Stocks-Market-Zoomcamp/homework/hw2/content/stocks_df_combined_2024_05_07.parquet.brotli
100%|████████████████████████████████████████| 119M/119M [00:01<00:00, 61.1MB/s]


In [5]:
# full dataset for 33 stocks
df_full = pd.read_parquet("./content/stocks_df_combined_2024_05_07.parquet.brotli", )

In [6]:
GROWTH = [g for g in df_full.keys() if (g.find('growth_')==0)&(g.find('future')<0)]

In [8]:
# leaving only Volume ==> generate ln(Volume)
OHLCV = ['Open','High','Low','Close','Adj Close_x','Volume']

In [51]:
CATEGORICAL = ['Month', 'month_wom', 'Weekday', 'Ticker', 'ticker_type']

In [21]:
TO_PREDICT = [g for g in df_full.keys() if (g.find('future')>=0)]
TO_PREDICT

['growth_future_5d', 'is_positive_growth_5d_future']

In [52]:
TO_DROP = ['Year','Date','index_x', 'index_y', 'index', 'Quarter','Adj Close_y'] + CATEGORICAL + OHLCV
TO_DROP

['Year',
 'Date',
 'index_x',
 'index_y',
 'index',
 'Quarter',
 'Adj Close_y',
 'Month',
 'month_wom',
 'Weekday',
 'Ticker',
 'ticker_type',
 'Open',
 'High',
 'Low',
 'Close',
 'Adj Close_x',
 'Volume']

In [23]:
# let's define on more custom numerical features
df_full['ln_volume'] = df_full.Volume.apply(lambda x: np.log(x))

  df_full['ln_volume'] = df_full.Volume.apply(lambda x: np.log(x))


In [24]:
# manually defined features
CUSTOM_NUMERICAL = ['SMA10', 'SMA20', 'growing_moving_average', 'high_minus_low_relative','volatility', 'ln_volume']

In [25]:
TECHNICAL_INDICATORS = ['adx', 'adxr', 'apo', 'aroon_1','aroon_2', 'aroonosc',
 'bop', 'cci', 'cmo','dx', 'macd', 'macdsignal', 'macdhist', 'macd_ext',
 'macdsignal_ext', 'macdhist_ext', 'macd_fix', 'macdsignal_fix',
 'macdhist_fix', 'mfi', 'minus_di', 'mom', 'plus_di', 'dm', 'ppo',
 'roc', 'rocp', 'rocr', 'rocr100', 'rsi', 'slowk', 'slowd', 'fastk',
 'fastd', 'fastk_rsi', 'fastd_rsi', 'trix', 'ultosc', 'willr',
 'ad', 'adosc', 'obv', 'atr', 'natr', 'ht_dcperiod', 'ht_dcphase',
 'ht_phasor_inphase', 'ht_phasor_quadrature', 'ht_sine_sine', 'ht_sine_leadsine',
 'ht_trendmod', 'avgprice', 'medprice', 'typprice', 'wclprice']

In [26]:
TECHNICAL_PATTERNS = [g for g in df_full.keys() if g.find('cdl')>=0]
print(f'Technical patterns count = {len(TECHNICAL_PATTERNS)}, examples = {TECHNICAL_PATTERNS[0:5]}')

Technical patterns count = 61, examples = ['cdl2crows', 'cdl3blackrows', 'cdl3inside', 'cdl3linestrike', 'cdl3outside']


In [27]:
MACRO = ['gdppot_us_yoy', 'gdppot_us_qoq', 'cpi_core_yoy', 'cpi_core_mom', 'FEDFUNDS',
 'DGS1', 'DGS5', 'DGS10']

In [28]:
NUMERICAL = GROWTH + TECHNICAL_INDICATORS + TECHNICAL_PATTERNS + CUSTOM_NUMERICAL + MACRO

In [53]:
# CHECK: NO OTHER INDICATORS LEFT
OTHER = [k for k in df_full.keys() if k not in OHLCV + CATEGORICAL + NUMERICAL + TO_DROP]
OTHER

['growth_future_5d', 'is_positive_growth_5d_future']

In [30]:
# truncated df_full with 25 years of data (and defined growth variables)
df = df_full[df_full.Date>='2000-01-01']
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 203 entries, Open to ln_volume
dtypes: datetime64[ns](3), float64(129), int32(64), int64(5), object(2)
memory usage: 239.7+ MB


Q1: Dummies on Month and Week-of-Month

Find the ABSOLUTE CORRELATION VALUE of the most correlated dummy <month-week_of_month> with the binary outcome variable is_positive_growth_5d_future?

In [54]:
# what are the categorical features?
CATEGORICAL

['Month', 'month_wom', 'Weekday', 'Ticker', 'ticker_type']

In [33]:
df.loc[:,'Month'] = df.Month.dt.strftime('%B')

In [55]:
df['Month'].head(5)

3490    January
3491    January
3492    January
3493    January
3494    January
Name: Month, dtype: object

In [35]:
df.loc[:,'Weekday'] = df.Weekday.astype(str)

In [78]:
df.loc[:,'Week'] = df['Date'].apply(lambda d: (d.day-1) // 7 + 1)

In [100]:
df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close_x,Volume,Ticker,Year,Month,Weekday,...,growth_brent_oil_90d,growth_brent_oil_365d,growth_btc_usd_1d,growth_btc_usd_3d,growth_btc_usd_7d,growth_btc_usd_30d,growth_btc_usd_90d,growth_btc_usd_365d,ln_volume,Week
3490,58.6875,59.3125,56.0,58.28125,36.065567,53228400.0,MSFT,2000,January,0,...,,,,,,,,,17.790103,1
3491,56.78125,58.5625,56.125,56.3125,34.847271,54119000.0,MSFT,2000,January,1,...,,,,,,,,,17.806696,1
3492,55.5625,58.1875,54.6875,56.90625,35.214706,64059600.0,MSFT,2000,January,2,...,,,,,,,,,17.975324,1
3493,56.09375,56.9375,54.1875,55.0,34.035072,54976600.0,MSFT,2000,January,3,...,,,,,,,,,17.822418,1
3494,54.3125,56.125,53.65625,55.71875,34.479843,62013600.0,MSFT,2000,January,4,...,,,,,,,,,17.942864,1


In [115]:
df.loc[:,'month_wom'] = df.apply(lambda row: f"{row['Month']}_{row['Week']}", axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'month_wom'] = df.apply(lambda row: f"{row['Month']}_{row['Week']}", axis=1)


In [118]:
# Generate dummy variables (no need for bool, let's have int32 instead)
dummy_variables = pd.get_dummies(df[CATEGORICAL], dtype='int32')

In [119]:
dummy_variables.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 115 entries, Month_April to ticker_type_US
dtypes: int32(115)
memory usage: 85.6 MB


In [120]:
# get dummies names in a list
DUMMIES = dummy_variables.keys().to_list()

In [121]:
# Concatenate the dummy variables with the original DataFrame
df_with_dummies = pd.concat([df, dummy_variables], axis=1)

In [122]:
corr_is_positive_growth_5d_future = df_with_dummies[NUMERICAL+DUMMIES+TO_PREDICT].corr()['is_positive_growth_5d_future']

In [123]:
# create a dataframe for an easy way to sort
corr_is_positive_growth_5d_future_df = pd.DataFrame(corr_is_positive_growth_5d_future)

In [137]:
NEW_DUMMIES = [g for g in corr_is_positive_growth_5d_future_df.index.to_list() if (g.find('month_')==0)]

In [144]:
corr_is_positive_growth_5d_future_df_new_dummies = corr_is_positive_growth_5d_future_df[corr_is_positive_growth_5d_future_df.index.isin(NEW_DUMMIES)]

In [145]:
corr_is_positive_growth_5d_future_df_new_dummies["abs_corr"] = abs(corr_is_positive_growth_5d_future_df_new_dummies['is_positive_growth_5d_future'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corr_is_positive_growth_5d_future_df_new_dummies["abs_corr"] = abs(corr_is_positive_growth_5d_future_df_new_dummies['is_positive_growth_5d_future'])


In [151]:
corr_is_positive_growth_5d_future_df_new_dummies.sort_values(by='abs_corr').tail()

Unnamed: 0,is_positive_growth_5d_future,abs_corr
month_wom_May_4,0.022264,0.022264
month_wom_February_3,-0.024578,0.024578
month_wom_October_5,0.026023,0.026023
month_wom_March_4,0.026058,0.026058
month_wom_September_3,-0.034537,0.034537
