In [78]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold

In [69]:
# Load the raw price-history CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that exact file exists — consider a configurable data directory.
df = pd.read_csv(
    filepath_or_buffer=r"/Users/michaelschaid/GitHub/stock_market_prediction/sphist.csv",
)

def tweak_data(df):
    """Clean the raw price table and add lagged rolling features.

    Steps, in order:
      1. lower-case every column name;
      2. parse ``date`` into datetimes and sort rows chronologically;
      3. add rolling statistics of ``open`` computed over the *previous*
         rows only (``shift(1)``), so a row never sees its own day's value;
      4. drop every row that still contains a missing value (this includes
         the warm-up rows that do not yet have a full 365-row window).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; after lower-casing it must contain ``date`` and ``open``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the added columns
        ``avg_5_open``, ``avg_365_open``, ``std_5``, ``std_365``,
        ``day_year_open_ratio`` and ``day_year_std_ratio``.
    """
    return (df
            .rename(columns=str.lower)
            # Parse dates before sorting so the order is chronological rather
            # than lexical. pd.to_datetime also avoids the unit-less
            # astype('datetime64') cast, which pandas 2.x rejects.
            .assign(date=lambda df_: pd.to_datetime(df_.date))
            .sort_values(by='date')
            # Windows count rows (i.e. rows present in the file), not calendar
            # days. The avg_* columns are rolling means, matching their names.
            .assign(avg_5_open=lambda df_: df_.open.shift(1).rolling(window=5).mean(),
                    avg_365_open=lambda df_: df_.open.shift(1).rolling(window=365).mean(),
                    std_5=lambda df_: df_.open.shift(1).rolling(window=5).std(),
                    std_365=lambda df_: df_.open.shift(1).rolling(window=365).std(),
                    day_year_open_ratio=lambda df_: df_.avg_5_open / df_.avg_365_open,
                    day_year_std_ratio=lambda df_: df_.std_5 / df_.std_365,
                    )
            .dropna(axis=0)
            )
    
# Build the feature table from the raw frame.
stocks = df.pipe(tweak_data)

# Show the engineered frame as this cell's output.
stocks


In [80]:
def split_data_by_date(data, date):
    """Split ``data`` into train/test frames around a cutoff date.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date`` column comparable to ``date``.
    date : datetime-like
        Cutoff. Rows strictly before it go to the training set; rows on or
        after it go to the test set.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(train, test)``.
    """
    train = data.query("date < @date")
    # Use >= so a row dated exactly at the cutoff lands in the test set
    # instead of being silently dropped from both splits.
    test = data.query("date >= @date")
    return train, test

def train_regr(data, split_date=pd.Timestamp("2013-01-01")):
    """Fit a linear regression on the rolling features and score it.

    The frame is split chronologically with ``split_data_by_date``; the model
    is fit on the earlier part and evaluated on the later part.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``date``, ``close`` and the six feature columns listed
        in ``features`` below.
    split_date : datetime-like, optional
        Cutoff passed to ``split_data_by_date``.
        NOTE(review): the previous body called an undefined ``split_data``
        helper, so the intended cutoff is not visible in this notebook;
        2013-01-01 is an assumed default — confirm.

    Returns
    -------
    tuple[float, float]
        ``(rmse, mae)`` of the predicted ``close`` on the test split.

    Raises
    ------
    ValueError
        If the cutoff leaves the training or the test split empty.
    """
    # Call the splitter that is actually defined in this notebook.
    train, test = split_data_by_date(data, split_date)
    if train.empty or test.empty:
        raise ValueError(
            f"split_date={split_date!r} leaves an empty split "
            f"(train={len(train)} rows, test={len(test)} rows)"
        )

    features = ['avg_5_open', 'avg_365_open', 'std_5',
                'std_365', 'day_year_open_ratio', 'day_year_std_ratio']

    target = 'close'

    model = LinearRegression().fit(train[features], train[target])
    predict_close = model.predict(test[features])
    rmse = np.sqrt(mean_squared_error(test[target], predict_close))
    mae = mean_absolute_error(test[target], predict_close)
    return rmse, mae

# Evaluate the regression on the engineered frame; the returned
# (rmse, mae) tuple is displayed as this cell's output.
train_regr(stocks)
    

(26.18373476706312, 19.165580108739576)

In [77]:
# Inspect which columns the engineered frame carries.
stocks.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'adj close',
       'avg_5_open', 'avg_365_open', 'std_5', 'std_365', 'day_year_open_ratio',
       'day_year_std_ratio'],
      dtype='object')