# Logistic Regression on Large Dat

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

import yfinance as yf
yf.pdr_override()

In [2]:
# input
symbol = 'AMD'
start = '1980-01-01'
end = '2020-07-24'

# Read data 
dataset = yf.download(symbol,start,end)

# Only keep close columns 
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-03-17,3.145833,3.145833,3.302083,3.125,0.0,219600
1980-03-18,3.03125,3.03125,3.125,2.9375,0.0,727200
1980-03-19,3.041667,3.041667,3.083333,3.020833,0.0,295200
1980-03-20,3.010417,3.010417,3.0625,3.010417,0.0,159600
1980-03-21,2.916667,2.916667,3.020833,2.90625,0.0,130800


In [3]:
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,0)
dataset['Returns'] = dataset['Adj Close'].pct_change()
dataset = dataset.dropna()

In [4]:
dataset.tail()

Unnamed: 0_level_0,Adj Close,Close,High,Low,Open,Volume,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-07-17,55.040001,55.040001,55.810001,54.68,55.310001,34710400,1,0,1,0.002185
2020-07-20,57.459999,57.459999,57.529999,54.830002,55.23,45034300,0,1,0,0.043968
2020-07-21,57.0,57.0,58.5,56.32,57.810001,44800700,1,0,1,-0.008006
2020-07-22,61.790001,61.790001,62.0,56.970001,57.07,135159400,0,1,0,0.084035
2020-07-23,59.57,59.57,62.330002,58.630001,61.630001,106829100,0,0,0,-0.035928


In [5]:
dataset.shape

(10175, 10)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [7]:
X = dataset.iloc[ : , 0:8].values
y = dataset.iloc[ : , 8].values

In [8]:
# Standarize features
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [9]:
# Create logistic regression object using sag solver
clf = LogisticRegression(random_state=0, solver='sag')

# Train model
model = clf.fit(X_std, y)

In [10]:
model.coef_

array([[-2.06282781, -2.06282781,  2.94853819,  1.85910412, -0.68592341,
         0.05855013,  0.15793275,  0.32223378]])

In [11]:
model.intercept_

array([-0.12478437])

In [12]:
model.score(X, y)

0.46958230958230956