In [2]:
# Uncomment to install yfinance if it is missing; %pip (rather than !pip)
# installs into the environment of the running kernel.
# %pip install yfinance

In [1]:
import pandas as pd
import yfinance as yf

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_squared_error


- `yfinance` docs: https://github.com/ranaroussi/yfinance
- What is `SPY`? See here: https://finance.yahoo.com/quote/SPY/

### Get Data

In [2]:
# Fetch the SPY price history for the training window. 2024-03-10 is a
# Sunday, so the most recent bar that can be returned is Friday 2024-03-08.
df_spy = yf.download(
    'SPY',
    start='2024-01-01',
    end='2024-03-10',
)

[*********************100%%**********************]  1 of 1 completed


In [3]:
# Preview the first rows only; displaying the whole frame floods the notebook.
df_spy.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-02,472.160004,473.670013,470.48999,472.649994,471.186005,123623700
2024-01-03,470.429993,471.190002,468.170013,468.790009,467.337982,103585900
2024-01-04,468.299988,470.959991,467.049988,467.279999,465.832642,84232200
2024-01-05,467.48999,470.440002,466.429993,467.920013,466.470673,86060800
2024-01-08,468.429993,474.75,468.299988,474.600006,473.129974,74879100
2024-01-09,471.869995,474.929993,471.350006,473.880005,472.412201,65931400
2024-01-10,474.160004,477.450012,473.869995,476.559998,475.083893,67310600
2024-01-11,477.589996,478.119995,472.26001,476.350006,474.874542,77940700
2024-01-12,477.839996,478.600006,475.230011,476.679993,475.203522,57944000
2024-01-16,475.26001,476.609985,473.059998,474.929993,473.458923,85014900


### Modelling & Prediction

In [4]:
# Prepare the data: the only feature is the row position (0, 1, 2, ...) of each
# trading day, so the model is a straight trend line through the closing prices.
X = np.arange(len(df_spy)).reshape(-1, 1)

# Flatten the target to 1-D. Recent yfinance releases can return MultiIndex
# columns even for a single ticker, in which case df_spy['Close'] is a
# one-column DataFrame and its values are 2-D. A 2-D target makes predict()
# return 2-D arrays, which cannot be used as a DataFrame column later on.
y = df_spy['Close'].to_numpy().ravel()
assert X.shape[0] == y.shape[0], "expected exactly one closing price per row"

# Split the data.
# NOTE(review): train_test_split shuffles rows by default, so the held-out
# points are interleaved with the training dates. The resulting error measures
# interpolation inside the window, not accuracy on unseen future dates;
# consider shuffle=False for a chronological hold-out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model on the training rows only.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the closing price for the held-out rows.
predictions = model.predict(X_test)


In [5]:
# Score the held-out predictions against the actual closing prices.
# The result is in squared price units.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 11.9900225886263


In [6]:
# Forecast horizon: 20 business days, March 11 through April 5, 2024.
# A single constant keeps the number of predictions and the number of dates in sync.
n_forecast_days = 20

# Future time indices continue the row positions used as the training feature.
n_observed = len(df_spy)
future_indices = np.arange(n_observed, n_observed + n_forecast_days).reshape(-1, 1)

# Use the model to predict future prices. ravel() guarantees a 1-D array
# (predict() returns 2-D output when the model was fit on a 2-D target),
# which is what a DataFrame column requires.
future_predictions = np.ravel(model.predict(future_indices))

# Generate the forecast dates: weekdays starting March 11, 2024.
# NOTE(review): freq='B' skips weekends but not exchange holidays, so
# 2024-03-29 (Good Friday) is included; confirm whether the assignment wants it.
start_date = pd.Timestamp('2024-03-11')
dates = pd.date_range(start_date, periods=n_forecast_days, freq='B')

# Create the frame with the two columns requested in the assignment.
forecast_df = pd.DataFrame({
    'Date': dates,
    'Predicted Closing Value': future_predictions
})

forecast_df

Unnamed: 0,Date,Predicted Closing Value
0,2024-03-11,515.604281
1,2024-03-12,516.60658
2,2024-03-13,517.608878
3,2024-03-14,518.611176
4,2024-03-15,519.613475
5,2024-03-18,520.615773
6,2024-03-19,521.618071
7,2024-03-20,522.62037
8,2024-03-21,523.622668
9,2024-03-22,524.624967


### Save forecast

In [None]:
firstname = '' # fill in your first name
lastname = '' # fill in your last name
filename = f'{firstname}_{lastname}.csv'

# Build the submission frame from forecast_df, the forecast table this notebook
# creates. Renaming by explicit column name (rather than assigning positionally)
# returns a new frame, so forecast_df keeps its original headers and the cell
# can be re-run safely.
df_forecast = forecast_df.rename(columns={
    'Date': 'date',
    'Predicted Closing Value': f'{firstname}_{lastname}',
})

# index=False keeps the row index out of the file, leaving only the two
# required columns.
df_forecast.to_csv(filename, index=False)

### What to turn in?

- Your Jupyter notebook, which builds the model and makes the forecast. (Just out of curiosity.)
- A CSV file with your forecasts, containing the following columns:
  - `date`: the forecast dates
  - `firstname_lastname`: the forecast SPY closing prices