# Train Test Split

Finance Historical - Features Analysis

In [1]:
# Library
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import fix_yahoo_finance as yf
yf.pdr_override()

In [2]:
stock_name = 'AMD'
start = '2010-01-01' 
end = '2019-01-01'
df = yf.download(stock_name, start, end)
df = df.reset_index()

[*********************100%***********************]  1 of 1 downloaded


In [3]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-01-04,9.79,9.9,9.68,9.7,9.7,18748700
1,2010-01-05,9.71,9.9,9.68,9.71,9.71,22145700
2,2010-01-06,9.68,9.76,9.55,9.57,9.57,18643400
3,2010-01-07,9.51,9.55,9.18,9.47,9.47,26806800
4,2010-01-08,9.37,9.47,9.29,9.43,9.43,13752800


In [4]:
df.shape

(2264, 7)

In [5]:
df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,2264.0,2264.0,2264.0,2264.0,2264.0,2264.0
mean,7.154302,7.306564,6.99212,7.150115,7.150115,34578760.0
std,5.182738,5.321431,5.026407,5.179124,5.179124,33498310.0
min,1.62,1.69,1.61,1.62,1.62,0.0
25%,3.42,3.4675,3.33,3.3975,3.3975,14188100.0
50%,5.97,6.11,5.825,6.0,6.0,22953200.0
75%,9.115,9.32,8.9925,9.1175,9.1175,41710900.0
max,33.18,34.139999,32.189999,32.720001,32.720001,325058400.0


In [6]:
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [7]:
df = df.drop(['Date'], axis=1)
df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,9.79,9.9,9.68,9.7,9.7,18748700
1,9.71,9.9,9.68,9.71,9.71,22145700
2,9.68,9.76,9.55,9.57,9.57,18643400
3,9.51,9.55,9.18,9.47,9.47,26806800
4,9.37,9.47,9.29,9.43,9.43,13752800


In [8]:
from sklearn.linear_model import LinearRegression

data = df.copy()
target = data.pop('Adj Close')

lr = LinearRegression(fit_intercept=True)
lr.fit(data, target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [9]:
from sklearn.metrics import mean_squared_error

print("R^2:", lr.score(data, target)) 

predictions = lr.predict(data)
mse = mean_squared_error(target, predictions)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

R^2: 1.0
RMSE: 1.82037353877e-13


In [10]:
# Train & Test set Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, shuffle=True,
                                                    test_size=0.5, random_state=49)

In [11]:
# Fitting Model on Training Set
lr_split = LinearRegression(fit_intercept=True)
lr_split.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [12]:
# In-Sample Set
from sklearn.metrics import mean_squared_error
model = LinearRegression(fit_intercept=True)
model.fit(X_train, y_train)
predictions = model.predict(X_train)
mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
accuracy = model.score(X_train, y_train)

print('In-Sample Set')
print('MSE:', mse)
print('RMSE:', rmse)
print('Accuracy on X_train & y_train:', accuracy)

In-Sample Set
MSE: 2.53134444301e-26
RMSE: 1.59101993797e-13
Accuracy on X_train & y_train: 1.0


In [13]:
model = LinearRegression(fit_intercept=True)
model.fit(X_test, y_test)
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
accuracy = model.score(X_test, y_test)

print('Out-of-Sample Set')
print('MSE:', mse)
print('RMSE:', rmse)
print('Accuracy on X_test & y_test:', accuracy)

Out-of-Sample Set
MSE: 3.90252316071e-26
RMSE: 1.97548048857e-13
Accuracy on X_test & y_test: 1.0


In [14]:
def calc_ISE(X_train, y_train, model):
    '''returns the in-sample R^2 and RMSE; assumes model already fit.'''
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)
    rmse = np.sqrt(mse)
    return model.score(X_train, y_train), rmse
    
def calc_OSE(X_test, y_test, model):
    '''returns the out-of-sample R^2 and RMSE; assumes model already fit.'''
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    return model.score(X_test, y_test), rmse

In [15]:
is_r2, ise = calc_ISE(X_train, y_train, lr_split)
os_r2, ose = calc_OSE(X_test, y_test, lr_split)

# show dataset sizes
data_list = (('R^2_in', is_r2), ('R^2_out', os_r2), 
             ('ISE', ise), ('OSE', ose))
for item in data_list:
    print('{:10}: {}'.format(item[0], item[1]))

R^2_in    : 1.0
R^2_out   : 1.0
ISE       : 1.5910199379664295e-13
OSE       : 1.6411248849954045e-13


In [16]:
# create array of random_state values
random_states = np.random.randint(1, 100, size=5)
random_states

for random_state in random_states:
    # split data according to random state
    X_train, X_test, y_train, y_test = train_test_split(data, target, 
                                                        shuffle=True,
                                                        test_size=0.5, 
                                                        random_state=random_state)
    # instantiate mmodel
    lr = LinearRegression(fit_intercept=True)
    # fit model
    lr.fit(X_train, y_train)
    # capture key metrics
    is_r2, ise = calc_ISE(X_train, y_train, lr)
    os_r2, ose = calc_OSE(X_test, y_test, lr)
    # round values
    is_r2, os_r2 = round(is_r2, 4), round(os_r2, 4)
    ise, ose = round(ise, 4), round(ose, 4)
    
    # print key metrics
    print('Random State: {}'.format(random_state))
    print('IS_R^2: {} | IS_RMSE: {}'.format(is_r2, ise))
    print('OS_R^2: {} | OS_RMSE: {}'.format(os_r2, ose))
    print('-'*34)

Random State: 1
IS_R^2: 1.0 | IS_RMSE: 0.0
OS_R^2: 1.0 | OS_RMSE: 0.0
----------------------------------
Random State: 17
IS_R^2: 1.0 | IS_RMSE: 0.0
OS_R^2: 1.0 | OS_RMSE: 0.0
----------------------------------
Random State: 58
IS_R^2: 1.0 | IS_RMSE: 0.0
OS_R^2: 1.0 | OS_RMSE: 0.0
----------------------------------
Random State: 28
IS_R^2: 1.0 | IS_RMSE: 0.0
OS_R^2: 1.0 | OS_RMSE: 0.0
----------------------------------
Random State: 13
IS_R^2: 1.0 | IS_RMSE: 0.0
OS_R^2: 1.0 | OS_RMSE: 0.0
----------------------------------
