In [2]:
!pip install yfinance

Collecting yfinance
  Obtaining dependency information for yfinance from https://files.pythonhosted.org/packages/14/03/3c854ca3e02eedf614abba4b2e177c469bf3af58207fa30d5098c5d652fe/yfinance-0.2.37-py2.py3-none-any.whl.metadata
  Downloading yfinance-0.2.37-py2.py3-none-any.whl.metadata (11 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Obtaining dependency information for multitasking>=0.0.7 from https://files.pythonhosted.org/packages/3e/8a/bb3160e76e844db9e69a413f055818969c8acade64e1a9ac5ce9dfdcf6c1/multitasking-0.0.11-py3-none-any.whl.metadata
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.0.tar.gz (314 kB)
     ---------------------------------------- 0.0/314.6 kB ? eta -:--:--
     ------------- ------------------------ 112.6/314.6 kB 2.2 MB/s eta 0:00:01
     -------------------------------------  307.2/314.6 kB 4.7 MB/s eta 0:00:01
     -------------------------------------  307.

In [3]:
import pandas as pd
import yfinance as yf

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_squared_error


- `yfinance` docs: https://github.com/ranaroussi/yfinance
- What is `SPY`? See here: https://finance.yahoo.com/quote/SPY/

### Get Data

In [30]:
df_spy = yf.download('SPY', start='2023-01-01', end='2024-03-10')

[*********************100%%**********************]  1 of 1 completed


In [31]:
df_spy

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-03,384.369995,386.429993,377.829987,380.820007,375.118744,74850700
2023-01-04,383.179993,385.880005,380.000000,383.760010,378.014709,85934100
2023-01-05,381.720001,381.839996,378.760010,379.380005,373.700256,76970500
2023-01-06,382.609985,389.250000,379.410004,388.079987,382.269989,104189600
2023-01-09,390.369995,393.700012,387.670013,387.859985,382.053284,73978100
...,...,...,...,...,...,...
2024-02-28,505.329987,506.859985,504.959991,506.260010,506.260010,56506600
2024-02-29,508.070007,509.739990,505.350006,508.079987,508.079987,83924800
2024-03-01,508.980011,513.289978,508.559998,512.849976,512.849976,76805900
2024-03-04,512.030029,514.200012,512.000000,512.299988,512.299988,49799300


### Modelling # Prediction

In [32]:
# prepare the data
X = np.array(range(len(df_spy))).reshape(-1, 1)  # assuming X is just the time index
y = df_spy['Close'].values  # closing prices as the target

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# predict
predictions = model.predict(X_test)


In [33]:
# calculate the MSE for your predictions
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 232.40241740548413


In [34]:
# future indices for 20 business days from March 11 to April 5
future_indices = np.array(range(len(df_spy), len(df_spy) + 20)).reshape(-1, 1)

# using the model to predict future prices
future_predictions = model.predict(future_indices)

# generating dates from March 11 to April 5, 2024 (20 business days)
start_date = pd.Timestamp('2024-03-11')
dates = pd.date_range(start_date, periods=20, freq='B')

# creating the df with the two columns requested in the assignment 
forecast_df = pd.DataFrame({
    'Date': dates,
    'Predicted Closing Value': future_predictions
})

forecast_df

Unnamed: 0,Date,Predicted Closing Value
0,2024-03-11,484.14625
1,2024-03-12,484.471317
2,2024-03-13,484.796384
3,2024-03-14,485.121451
4,2024-03-15,485.446518
5,2024-03-18,485.771586
6,2024-03-19,486.096653
7,2024-03-20,486.42172
8,2024-03-21,486.746787
9,2024-03-22,487.071854


### Save forecast

In [None]:
firstname = '' # fill in your first name
lastname = '' # fill in your last name
filename = f'{firstname}_{lastname}.csv'

df.forecast.columns = ['date', f'{firstname}_{lastname}']

df_forecast.to_csv(filename, index = 0)

### What to turn in?

- Your Jupyter notebook which builds the model and makes the forecast. (Just for curiosity.)
- A csv file with your forecasts with the following columns   
      - date: dates   
      - firstname_lastname: forecast SPY closing prices