# Analysis Supporting Importance of Time in Market

### Objective
- TODO: Incorporate an interactive widget that lets you change the number of days to skip
    - [example](https://towardsdatascience.com/best-practices-for-writing-reproducible-and-maintainable-jupyter-notebooks-49fcc984ea68)


### Highlights
- A handful of daily returns make a significant difference 

In [83]:
# import libraries
import datetime as dt
import pandas as pd
import yfinance as yf
from dateutil.relativedelta import relativedelta
from ipywidgets import interact
import ipywidgets as widgets
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
from typing import Optional

## Import Data

In [84]:
# Get historical stock data: ask yfinance for the SPY ticker and pull its
# price history with period="max" (the longest range the API offers).
stock_ticker = yf.Ticker('SPY')
stock_df = stock_ticker.history(period="max")

For the purposes of creating the functions below, I'm importing historical SPY data: SPY closely tracks the S&P 500, which in turn tracks 500 of the largest stocks in the United States. Below we'll examine the raw imported data before we begin cleaning and preprocessing.

In [85]:
# Preview the raw download before any cleaning is applied.
stock_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1993-01-29 00:00:00-05:00,24.858341,24.858341,24.734668,24.840673,1003200,0.0,0.0,0.0
1993-02-01 00:00:00-05:00,24.858339,25.017347,24.858339,25.017347,480500,0.0,0.0,0.0
1993-02-02 00:00:00-05:00,24.999678,25.088016,24.946675,25.070349,201300,0.0,0.0,0.0
1993-02-03 00:00:00-05:00,25.105690,25.353037,25.088022,25.335369,529400,0.0,0.0,0.0
1993-02-04 00:00:00-05:00,25.423704,25.494375,25.141022,25.441372,531500,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2023-12-29 00:00:00-05:00,476.489990,477.029999,473.299988,475.309998,122234100,0.0,0.0,0.0
2024-01-02 00:00:00-05:00,472.160004,473.670013,470.489990,472.649994,123623700,0.0,0.0,0.0
2024-01-03 00:00:00-05:00,470.429993,471.190002,468.170013,468.790009,103585900,0.0,0.0,0.0
2024-01-04 00:00:00-05:00,468.299988,470.959991,467.049988,467.279999,84232200,0.0,0.0,0.0


In [86]:
# Check column dtypes and non-null counts, then show summary statistics
# rounded to 3 decimal places.
stock_df.info()
stock_df.describe().round(3)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7790 entries, 1993-01-29 00:00:00-05:00 to 2024-01-05 00:00:00-05:00
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Open           7790 non-null   float64
 1   High           7790 non-null   float64
 2   Low            7790 non-null   float64
 3   Close          7790 non-null   float64
 4   Volume         7790 non-null   int64  
 5   Dividends      7790 non-null   float64
 6   Stock Splits   7790 non-null   float64
 7   Capital Gains  7790 non-null   float64
dtypes: float64(7), int64(1)
memory usage: 547.7 KB


Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains
count,7790.0,7790.0,7790.0,7790.0,7790.0,7790.0,7790.0,7790.0
mean,142.217,143.062,141.294,142.228,84485240.0,0.012,0.0,0.0
std,112.624,113.248,111.965,112.653,92311840.0,0.109,0.0,0.0
min,24.54,24.611,24.205,24.54,5200.0,0.0,0.0,0.0
25%,70.647,71.171,70.071,70.632,10034680.0,0.0,0.0,0.0
50%,93.526,94.29,92.748,93.488,63217100.0,0.0,0.0,0.0
75%,181.347,181.812,180.447,181.356,115985200.0,0.0,0.0,0.0
max,476.88,477.55,476.26,476.69,871026300.0,1.906,0.0,0.0


### Raw Data Observation
- There is daily stock data spanning about 30 years. I'll want the ability to filter this time frame to simulate different scenarios.
- There is no missing data
- I'm only going to be using the Date and Close columns for the analysis

## Cleaning

In [87]:
# Move the datetime index into a regular 'Date' column, then reduce each
# timestamp to a plain calendar date (the column becomes object dtype).
# NOTE: reset_index(inplace=True) mutates stock_df, so run this cell only
# once per download.
stock_df.reset_index(inplace=True)
stock_df['Date'] = pd.to_datetime(stock_df['Date']).dt.date

In [88]:
# Confirm the cleaning step: 'Date' should now be listed as an ordinary column.
stock_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7790 entries, 0 to 7789
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           7790 non-null   object 
 1   Open           7790 non-null   float64
 2   High           7790 non-null   float64
 3   Low            7790 non-null   float64
 4   Close          7790 non-null   float64
 5   Volume         7790 non-null   int64  
 6   Dividends      7790 non-null   float64
 7   Stock Splits   7790 non-null   float64
 8   Capital Gains  7790 non-null   float64
dtypes: float64(7), int64(1), object(1)
memory usage: 547.9+ KB


In [89]:
# create stock dataclass
@dataclass
class StockData:
    """Price history and return statistics for one ticker over a date window.

    Creating an instance downloads the ticker's full history with yfinance,
    normalises the column names, keeps the rows dated between ``start_date``
    and ``end_date`` (inclusive) and computes the total and annualized return
    from the first and last remaining close prices.

    Raises:
        ValueError: If ``start_date`` is after ``end_date``, or if no rows
            fall inside the requested window.
    """

    # Ticker symbol handed to yfinance, e.g. 'SPY'.
    ticker: str
    # Inclusive bounds of the window to analyse.
    start_date: dt.date
    end_date: dt.date
    # Whole years between start_date and end_date. Always recomputed in
    # __post_init__, so a value passed to the constructor is overwritten.
    periods_year: Optional[int] = None
    # Cleaned and filtered price history. Always replaced by the download in
    # __post_init__, so a value passed to the constructor is overwritten.
    stock_df: Optional[pd.DataFrame] = None
    # Human-readable label of the scenario held by this instance.
    description: Optional[str] = 'This is the original dataset.'
    # Total return between the first and last close in stock_df (0.25 == 25%).
    return_pct: Optional[float] = None
    # return_pct expressed per whole year of the window (NaN if under a year).
    annualized_return_pct: Optional[float] = None
    # Index labels of the rows zeroed out by remove_best_x_days (pandas Index).
    best_x_indices: Optional[list] = None

    def get_stock_data(self):
        """Download the ticker's full price history into ``self.stock_df``."""
        stock_ticker = yf.Ticker(self.ticker)
        stock_df = stock_ticker.history(period="max")

        self.stock_df = stock_df

    def clean_stock_df(self):
        """Normalise column names and turn the date index into a date column.

        Column names are stripped, lower-cased and have spaces replaced by
        underscores; string columns are stripped; ``date`` ends up holding
        plain ``datetime.date`` objects.
        """
        self.stock_df = self.stock_df.reset_index()
        self.stock_df.columns = self.stock_df.columns.str.strip().str.replace(' ', '_').str.lower()
        # get list of column headers that are objects
        object_cols = self.stock_df.select_dtypes(include=object).columns
        # strip all string series
        self.stock_df[object_cols] = self.stock_df[object_cols].apply(lambda ser: ser.str.strip())

        self.stock_df['date'] = pd.to_datetime(self.stock_df['date']).dt.date

    def filter_stock_df(self):
        """Keep only the rows dated within [start_date, end_date]."""
        in_window = (
            (self.stock_df['date'] >= self.start_date)
            & (self.stock_df['date'] <= self.end_date)
        )
        # Copy so later column assignments write to an independent frame
        # instead of a slice of the full download.
        self.stock_df = self.stock_df[in_window].copy()

    def get_return_pct(self):
        """Store the total return between the first and last close price.

        Raises:
            ValueError: If ``stock_df`` has no rows.
        """
        if self.stock_df.empty:
            raise ValueError(
                f"No price data available for {self.ticker} between "
                f"{self.start_date} and {self.end_date}")
        # rows are in date order, so the first/last rows are the min/max date
        closes = self.stock_df['close']
        self.return_pct = closes.iloc[-1] / closes.iloc[0] - 1

    def get_annualized_return_pct(self):
        """Store the total return expressed per whole year of the window.

        A window shorter than one full year has ``periods_year == 0``; the
        annualized figure is undefined in that case and is stored as NaN
        rather than raising ZeroDivisionError.
        """
        if self.periods_year == 0:
            self.annualized_return_pct = float('nan')
            return
        self.annualized_return_pct = (1 + self.return_pct) ** (1 / self.periods_year) - 1

    def remove_best_x_days(self, number_of_days_to_remove):
        """Simulate having missed the best-performing days.

        The ``number_of_days_to_remove`` largest daily returns are replaced
        by 0, the close prices are rebuilt by compounding the adjusted
        returns from the first close, and the return attributes are
        refreshed. The instance is modified in place.

        Args:
            number_of_days_to_remove: How many of the top daily returns to
                zero out.

        Returns:
            This same instance, to allow chaining.
        """
        self.description = f'This scenario simulates removing the best performing {number_of_days_to_remove} days.'

        # Get indices of top x rows
        self.best_x_indices = self.stock_df.nlargest(number_of_days_to_remove,
                                                     'return_pct_vs_prev_day').index
        # replace the top x returns with 0 (as if the day had been flat)
        self.stock_df.loc[self.best_x_indices, 'return_pct_vs_prev_day'] = 0

        # Rebuild the close series: first close compounded by the running
        # product of (1 + daily return). Vectorised equivalent of walking the
        # rows one by one.
        first_close = self.stock_df['close'].iloc[0]
        growth = (1 + self.stock_df['return_pct_vs_prev_day']).cumprod()
        self.stock_df['close'] = first_close * growth

        # Update attributes since the close prices have been updated
        self.update_attributes()

        return self

    def update_attributes(self):
        """Recompute ``return_pct`` and ``annualized_return_pct``."""
        # get return percentage
        self.get_return_pct()

        # get annualized return percentage
        self.get_annualized_return_pct()

    def __post_init__(self):
        # Validate the window first so a bad request fails before any
        # network call. (A whole-year difference is 0, not negative, when
        # the dates are reversed by less than a year, so compare directly.)
        if self.start_date > self.end_date:
            raise ValueError(
                f"The start_date argument, {self.start_date}, should be before the end_date argument, {self.end_date}")
        # get stock data
        self.get_stock_data()
        # clean stock data
        self.clean_stock_df()
        # filter stock data by date
        self.filter_stock_df()
        # get periods in whole years
        self.periods_year = relativedelta(self.end_date, self.start_date).years
        # get return percentage compared to previous day (first row has no
        # previous day, so it is treated as a 0% move)
        self.stock_df['return_pct_vs_prev_day'] = self.stock_df['close'].pct_change().fillna(0)
        # Update attributes
        self.update_attributes()

In [99]:
def generate_scenarios(ticker, start_date, end_date, number_of_days_to_remove):
    """Build the baseline scenario and a "missed the best days" scenario.

    The price history is downloaded once; the second scenario starts from a
    deep copy of the first, so both are guaranteed to be based on the same
    data and the original is left untouched.

    Args:
        ticker: Ticker symbol, e.g. 'SPY'.
        start_date: First date (inclusive) of the window.
        end_date: Last date (inclusive) of the window.
        number_of_days_to_remove: How many of the best daily returns to
            zero out in the second scenario.

    Returns:
        A tuple ``(original_scn, best_x_days_removed_scn)`` of StockData
        instances.
    """
    import copy

    original_scn = StockData(ticker, start_date, end_date)
    # deepcopy duplicates the DataFrame too, so remove_best_x_days (which
    # mutates in place) cannot leak changes into original_scn.
    best_x_days_removed_scn = copy.deepcopy(original_scn).remove_best_x_days(number_of_days_to_remove)

    return original_scn, best_x_days_removed_scn


def visualize_stock_df(original_scn, best_x_days_removed_scn):
    """Plot the original close prices against the adjusted scenario.

    Both series are drawn against the original scenario's dates, labelled in
    a legend, and annotated with the starting price, each final price and
    each annualized return.

    Args:
        original_scn: Scenario holding the unmodified close prices.
        best_x_days_removed_scn: Scenario whose best days were removed.
    """
    # define series to be plotted
    y_og_close_line = original_scn.stock_df['close']
    y_scn_close_line = best_x_days_removed_scn.stock_df['close']
    x = original_scn.stock_df['date']
    removed_days = best_x_days_removed_scn.best_x_indices.size

    # assign plot lines, labelled so the legend can tell them apart
    plt.plot(x, y_og_close_line, label='Original close')
    plt.plot(x, y_scn_close_line, label=f'Best {removed_days} days removed')

    # add title
    plt.title(f'{original_scn.ticker} comparison between original Close and missing the best {best_x_days_removed_scn.best_x_indices.size} return days removed for a {original_scn.periods_year} year period')

    # add line key
    plt.legend(loc='upper left')

    # annotate starting price at the left edge of the axes
    plt.annotate('$'+'%0.2f' % y_og_close_line.iloc[0], xy=(0, y_og_close_line.iloc[0]), xytext=(8, 0),
                 xycoords=('axes fraction', 'data'), textcoords='offset points')

    # annotate the final original and adjusted close prices at the right edge
    for var in (y_og_close_line, y_scn_close_line):
        plt.annotate('$'+'%0.2f' % var.iloc[-1], xy=(1, var.iloc[-1]), xytext=(8, -15),
                     xycoords=('axes fraction', 'data'), textcoords='offset points')

    # annotate the original annualized return, just below its final price
    plt.annotate(format(original_scn.annualized_return_pct, ".2%"), xy=(1, y_og_close_line.iloc[-1]), xytext=(8, -28),
                 xycoords=('axes fraction', 'data'), textcoords='offset points')
    # annotate the adjusted annualized return, just below its final price
    plt.annotate(format(best_x_days_removed_scn.annualized_return_pct, ".2%"), xy=(1, y_scn_close_line.iloc[-1]), xytext=(8, -28),
                 xycoords=('axes fraction', 'data'), textcoords='offset points')

    # todo highlight all the mismatch days. Show marker and percent label
    plt.show()

In [100]:
@interact(ticker='SPY', start_date='2015-01-01', end_date='2023-01-01', n=widgets.IntSlider(min=1, max=30, step=1))
def plot_scenario(ticker, start_date, end_date, n):
    """Widget callback: build both scenarios and plot them.

    Args:
        ticker: Ticker symbol typed into the widget.
        start_date: Window start as a 'YYYY-MM-DD' string.
        end_date: Window end as a 'YYYY-MM-DD' string.
        n: Number of best-performing days to remove.
    """
    # The text boxes can hold incomplete dates while the user is typing;
    # show a short hint instead of a traceback until both parse.
    try:
        start_date = dt.datetime.strptime(start_date, '%Y-%m-%d').date()
        end_date = dt.datetime.strptime(end_date, '%Y-%m-%d').date()
    except ValueError:
        print('start_date and end_date must be valid dates in YYYY-MM-DD format.')
        return

    original_scn, best_x_days_removed_scn = generate_scenarios(ticker, start_date, end_date, n)
    visualize_stock_df(original_scn, best_x_days_removed_scn)



interactive(children=(Text(value='SPY', description='ticker'), Text(value='2015-01-01', description='start_dat…