<h1>Stock Prediction with Scikit-learn</h1>

In [None]:
import math
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
import pandas_datareader.data as web
from pandas import Series, DataFrame


# Date range for the download: 1 Jan 2010 through the day the notebook is run.
start = datetime.datetime(2010, 1, 1)
end = datetime.date.today()

# Fetch AAPL price history from the 'yahoo' source via pandas-datareader.
df = web.DataReader("AAPL", 'yahoo', start, end)
# Show the last rows of the downloaded frame.
df.tail()


<h2>Rolling Mean / Moving Average (MA)</h2>

<h3>10 Day Moving Average</h3>

In [None]:
# Adjusted-close series; also reused by the return calculations further down.
close_px = df['Adj Close']
# 10-row rolling mean, matching the "10 Day" heading and the '10 Day MA' plot label.
# The first 9 values are NaN because the window is not yet full.
mavg = close_px.rolling(window=10).mean()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import style

# Adjusting the size of matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# Bare expression in the middle of the cell: evaluated but not displayed.
mpl.__version__

# Adjusting the style of matplotlib
style.use('ggplot')

# Overlay the rolling mean on the adjusted-close series and show the legend.
close_px.plot(label='AAPL')
mavg.plot(label='10 Day MA')
plt.legend()

<h3>50 Day Moving Average</h3>

In [None]:
# Adjusted-close series (same data as in the previous section).
close_px = df['Adj Close']
# 50-row rolling mean, matching the "50 Day" heading and the '50 Day MA' plot label.
# The first 49 values are NaN because the window is not yet full.
mavg = close_px.rolling(window=50).mean()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import style

# Adjusting the size of matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# Bare expression in the middle of the cell: evaluated but not displayed.
mpl.__version__

# Adjusting the style of matplotlib
style.use('ggplot')

# Overlay the rolling mean on the adjusted-close series and show the legend.
close_px.plot(label='AAPL')
mavg.plot(label='50 Day MA')
plt.legend()

<h2>Return Deviation - to determine risk and return</h2>

Expected Return measures the mean, or expected value, of the probability distribution of investment returns. The expected return of a portfolio is calculated by multiplying the weight of each asset by its expected return and adding the values for each investment.

In [None]:
# Simple return per row: price divided by the previous row's price, minus 1.
# The first entry is NaN because it has no predecessor.
rets = close_px / close_px.shift(1) - 1
rets.plot(label='return')

<h2>Analysing Multiple Stocks</h2>

In [None]:
# Download the five tickers over the same date range and keep only the 'Adj Close' field.
dfcomp = web.DataReader(['FB', 'AAPL', 'AMZN', 'NFLX', 'GOOG'],'yahoo',start=start,end=end)['Adj Close']

In [None]:
# Plain-text dump of the adjusted-close data for the compared tickers.
print(dfcomp)

<h3>Correlation Analysis</h3>

In [None]:
# Row-over-row percentage change of each ticker's adjusted close.
retscomp = dfcomp.pct_change()

# Pairwise correlation matrix of those returns.
corr = retscomp.corr()

In [None]:
# Plain-text dump of the return correlation matrix.
print(corr)

<h3>Return Distributions - Apple vs Google</h3>

In [None]:
# One point per row: AAPL return on the x axis against GOOG return on the y axis.
plt.scatter(retscomp.AAPL, retscomp.GOOG)
plt.xlabel('Returns AAPL')
plt.ylabel('Returns GOOG')

<h2>Kernel Density Estimation (KDE)</h2>

In statistics, kernel density estimation (KDE) is a non-parametric way to estimate the probability density function of a random variable. Kernel density estimation is a fundamental data smoothing problem where inferences about the population are made, based on a finite data sample.

In [None]:
# Pairwise scatter plots of all return columns with a KDE curve on the diagonal;
# the trailing ';' suppresses the textual repr of the returned axes.
pd.plotting.scatter_matrix(retscomp, diagonal='kde', figsize=(10, 10));

<h2>Correlation Heat Map</h2>

In [None]:
# Render the correlation matrix as an image and label both axes with the column names.
plt.imshow(corr, cmap='hot', interpolation='none')
plt.colorbar()
plt.xticks(range(len(corr)), corr.columns)
plt.yticks(range(len(corr)), corr.columns);

<h2>Stock Returns Rate and Risk</h2>

In [None]:
# Risk/return map: one point per column, mean return on x and return std on y.
expected_returns = retscomp.mean()
risk = retscomp.std()

plt.scatter(expected_returns, risk)
plt.xlabel('Expected returns')
plt.ylabel('Risk')

# Attach a boxed, arrowed label to every point.
for label, x, y in zip(retscomp.columns, expected_returns, risk):
    label_box = dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5)
    label_arrow = dict(arrowstyle='->', connectionstyle='arc3,rad=0')
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(20, -20),
        textcoords='offset points',
        ha='right',
        va='bottom',
        bbox=label_box,
        arrowprops=label_arrow)

<h2>Predicting Stock Prices</h2>

In [None]:
# Feature frame for the regression models: adjusted close and volume plus two derived columns.
dfreg = df.loc[:,['Adj Close','Volume']]
# High-low range as a percentage of the close.
dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
# Open-to-close move as a percentage of the open.
dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

In [None]:
# Plain-text dump of the regression feature frame.
print(dfreg)

<h3>Pre-processing and Cross Validation</h3>

In [None]:
import math
import numpy as np
import sklearn

from sklearn import preprocessing

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# Replace missing values with a large negative sentinel (rows are kept, not dropped)
dfreg.fillna(value=-99999, inplace=True)
# We want to separate 1 percent of the data to forecast
forecast_out = int(math.ceil(0.01 * len(dfreg)))
# Separating the label here, we want to predict the AdjClose
forecast_col = 'Adj Close'
# The label is the forecast column shifted up by forecast_out rows, i.e. the value
# forecast_out rows ahead; the last forecast_out labels are therefore NaN.
dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
# `axis` is passed by keyword: the positional form drop(labels, 1) is rejected by pandas 2.x.
X = np.array(dfreg.drop(['label'], axis=1))
# Scale the X so that everyone can have the same distribution for linear regression
X = preprocessing.scale(X)
# Finally We want to find Data Series of late X and early X (train) for model generation and evaluation
# X_lately holds the last forecast_out rows, which have no label yet
X_lately = X[-forecast_out:]
X = X[:-forecast_out]
# Separate label and identify it as y
y = np.array(dfreg['label'])
# Trim the NaN labels at the end so y lines up with X
y = y[:-forecast_out]

In [None]:
# Hold out 20% of the rows for scoring. No random_state is passed, so the split
# (and every score below) differs from run to run.
# NOTE(review): the default split shuffles rows, so test rows are interleaved in time with training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Linear regression
clfreg = LinearRegression(n_jobs=-1)
clfreg.fit(X_train, y_train)
# Polynomial (degree 2, quadratic) features followed by ridge regression
clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
clfpoly2.fit(X_train, y_train)

# Polynomial (degree 3, cubic) features followed by ridge regression
clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
clfpoly3.fit(X_train, y_train)

In [None]:
# KNN Regression: each prediction uses the 2 nearest training rows
clfknn = KNeighborsRegressor(n_neighbors=2)
clfknn.fit(X_train, y_train)

In [None]:
# Score every fitted model on the held-out rows. For these regressors, .score()
# returns the R^2 coefficient of determination, not a statistical confidence.
confidencereg = clfreg.score(X_test, y_test)
confidencepoly2 = clfpoly2.score(X_test,y_test)
confidencepoly3 = clfpoly3.score(X_test,y_test)
confidenceknn = clfknn.score(X_test, y_test)

In [None]:
# Test-set score of the linear regression
confidencereg

In [None]:
# Test-set score of the degree-2 polynomial ridge pipeline
confidencepoly2

In [None]:
# Test-set score of the degree-3 polynomial ridge pipeline
confidencepoly3

In [None]:
# Test-set score of the KNN regressor
confidenceknn

In [None]:
# Predict labels for the most recent rows (X_lately), which have no known label.
forecast_set = clfreg.predict(X_lately)
# Empty column that the next cell fills for the appended forecast rows.
dfreg['Forecast'] = np.nan

In [None]:
# Index label of the last row. Despite the "unix" names, these variables hold
# index values that support timedelta arithmetic, not epoch seconds.
last_date = dfreg.iloc[-1].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)

# Append one new row per forecast value, stepping one calendar day at a time
# (weekends and holidays are not skipped). Each row is NaN in every column
# except the last, which receives the forecast.
# NOTE(review): this assumes 'Forecast' is the last column of dfreg — confirm.
# NOTE(review): re-running this cell appends further rows after the ones already added.
for i in forecast_set:
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns)-1)]+[i]
# Plot the last 500 rows of the actual series and of the forecast column.
dfreg['Adj Close'].tail(500).plot()
dfreg['Forecast'].tail(500).plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()

<h2>Scatter Plots - Volume vs Price</h2>

In [None]:
# Location of the CHGG price CSV.
# NOTE(review): absolute path on one user's machine; the notebook will not run elsewhere as-is.
DATAPATH = '/home/jack/Desktop/CHGG.csv'

In [None]:
# Load the CSV into the global `data` frame used by the scatter-plot helpers below.
data = pd.read_csv(DATAPATH)
data.head()

In [None]:
def scatter_plot(feature, target):
    """Scatter one column of the global ``data`` frame against another.

    Parameters
    ----------
    feature : column label plotted on the x axis.
    target : column label plotted on the y axis.

    The axis captions are always "Date" and "Price", whichever columns are chosen.
    Draws on a new 16x8 figure and shows it; returns nothing.
    """
    plt.figure(figsize=(16,8))
    x_values = data[feature]
    y_values = data[target]
    plt.scatter(x_values, y_values, c='black')
    plt.xlabel("Date")
    plt.ylabel("Price")
    plt.show()

In [None]:
# Closing price plotted against the 'Date' column.
scatter_plot('Date','Close')

In [None]:
def vol_scatter_plot(feature, target):
    """Scatter one column of the global ``data`` frame against another.

    Parameters
    ----------
    feature : column label plotted on the x axis.
    target : column label plotted on the y axis.

    The axis captions are always "Volume" and "Price", whichever columns are chosen.
    Draws on a new 16x8 figure and shows it; returns nothing.
    """
    plt.figure(figsize=(16,8))
    x_values = data[feature]
    y_values = data[target]
    plt.scatter(x_values, y_values, c='black')
    plt.xlabel("Volume")
    plt.ylabel("Price")
    plt.show()

In [None]:
# Closing price plotted against traded volume.
vol_scatter_plot('Volume','Close')

<h2>Ridge Regression</h2>

In [None]:
from sklearn import linear_model

In [None]:
# Ridge regression with regularisation strength alpha=0.5.
ridgereg = linear_model.Ridge(alpha=.5)

In [None]:
# Fit on whatever X and y currently hold in the kernel.
# NOTE(review): on a top-to-bottom run these are the full scaled AAPL features and labels
# from the preprocessing cell, not a training subset — confirm this is intended.
ridgereg.fit(X,y) 

In [None]:
# Fitted coefficients, one per feature column of X.
ridgereg.coef_

In [None]:
# Fitted intercept term.
ridgereg.intercept_

In [None]:
# Import necessary packages
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import sklearn
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Download the data and store it
# Tesla stock since the beginning
# Shell escape: requires `wget` and network access; writes tesla.csv to the working directory.
!wget -O tesla.csv https://www.dropbox.com/s/na2vurooejpew59/TSLA.csv?dl=1

In [None]:
# Rebinds `df`: from here on it holds the Tesla CSV, no longer the AAPL frame loaded at the top.
df = pd.read_csv('tesla.csv')

In [None]:
# Move 'Date' from a column to the index, in place.
# NOTE(review): not re-runnable — a second run finds no 'Date' column to promote.
df.set_index('Date', inplace=True)
df.head()

In [None]:

# Line plot of the adjusted close over the whole file.
df['Adj Close'].plot(label='TSLA', figsize=(16,8), title='Adjusted Closing Price', grid=True)

In [None]:

window_size = 32 # Allow us to look at 32 days into the past
# Prepare the data so that we have 32 day windows and predict what the next day should be

# Get indices of access for the data
num_samples = len(df) - window_size
# Row i holds positions i .. i+window_size: window_size inputs followed by the target.
# np.arange already returns integers, so no cast is needed. The old `np.int` alias
# was removed in NumPy 1.24 and raises AttributeError there.
indices = np.arange(num_samples)[:, None] + np.arange(window_size + 1)

In [None]:
# Fancy-index the price array with the window positions: one row per sample,
# window_size + 1 columns. Rebinds the global `data` (previously the CHGG frame).
data = df['Adj Close'].values[indices] # Create the 2D matrix of training samples


In [None]:

# Split each window into inputs (all but the last column) and target (the last column).
X = data[:,:-1] # Each row represents 32 days in the past
y = data[:,-1] # Each output value represents the 33rd day


In [None]:

# Train and test split
# Chronological split with no shuffling: the first 80% of windows train, the rest test.
split_fraction = 0.8
ind_split = int(split_fraction * num_samples)
X_train = X[:ind_split]
y_train = y[:ind_split]
X_test = X[ind_split:]
y_test = y[ind_split:]

In [None]:
# Train
# Ridge regression with the estimator's default settings
ridge_model = Ridge()
ridge_model.fit(X_train, y_train)

# Infer
# Predictions on both splits, kept for the plots below
y_pred_train_ridge = ridge_model.predict(X_train)
y_pred_ridge = ridge_model.predict(X_test)

In [None]:
# Plot what it looks like for the training data
df_ridge = df.copy()
# Keep only 'Adj Close' (assuming the CSV has no columns beyond those dropped here)
df_ridge.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1, inplace=True)
df_ridge = df_ridge.iloc[window_size:ind_split] # Past 32 days we don't know yet
# Prediction i targets row i + window_size, so dropping the last window_size
# predictions makes the array line up with rows window_size .. ind_split-1.
df_ridge['Adj Close Train'] = y_pred_train_ridge[:-window_size]
df_ridge.plot(label='TSLA', figsize=(16,8), title='Adjusted Closing Price', grid=True)

In [None]:

# Same for the test
df_ridge = df.copy()
df_ridge.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1, inplace=True)
# Test prediction j targets row ind_split + window_size + j, so this slice has
# exactly one row per test prediction.
df_ridge = df_ridge.iloc[ind_split+window_size:] # Past 32 days we don't know yet
df_ridge['Adj Close Test'] = y_pred_ridge
df_ridge.plot(label='TSLA', figsize=(16,8), title='Adjusted Closing Price', grid=True)

<h2>Lasso Regression</h2>

In [None]:
from sklearn.linear_model import Ridge, Lasso

# train
# Lasso with L1 strength alpha=0.1 and optimisation tolerance tol=0.01, fitted on the same windows as the ridge model
lasso_model = Lasso(alpha=0.1, tol=0.01)
lasso_model.fit(X_train, y_train)

# infer
# Predictions on both splits, kept for the plots below
y_pred_train_lasso = lasso_model.predict(X_train)
y_pred_lasso = lasso_model.predict(X_test)

In [None]:

# plot lasso training
df_lasso = df.copy()
df_lasso.drop(['Open', 'High','Low','Close','Volume'], axis=1, inplace=True)
# Rows window_size .. ind_split-1 are the targets of the first ind_split - window_size training predictions
df_lasso = df_lasso.iloc[window_size:ind_split]
df_lasso['Adj Close Train'] = y_pred_train_lasso[:-window_size]
# The data is Tesla; the label previously read 'BTC-USD', left over from another notebook.
df_lasso.plot(label='TSLA', figsize=(16,8), title='Adjusted Closing Price', grid=True)

In [None]:
# plot lasso testing
df_lasso = df.copy()
df_lasso.drop(['Open','High','Low','Close','Volume'], axis=1, inplace=True)
# One row per test prediction: test prediction j targets row ind_split + window_size + j
df_lasso = df_lasso.iloc[ind_split+window_size:]
df_lasso['Adj Close Test'] = y_pred_lasso
df_lasso.plot(label='TSLA', figsize=(16,8), title='Adjusted Closing Price', grid=True)

In [None]:

# plot all training data
df_train = df.copy()
df_train.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1, inplace=True)
df_train = df_train.iloc[window_size:ind_split] # past 32 days we don't know
# add in all of our methods
df_train['Adj Close Train Ridge'] = y_pred_train_ridge[:-window_size]
# The data is Tesla; the label previously read 'BTC-USD', left over from another notebook.
df_train.plot(label='TSLA', figsize=(16,8), title='Adjusted Closing Price', grid=True)

In [None]:

# plot all testing data
df_test = df.copy()
df_test.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1, inplace=True)
# One row per test prediction
df_test = df_test.iloc[ind_split+window_size:]
# add in all methods
# (only the ridge predictions are added here)
df_test['Adj Close Test Ridge'] = y_pred_ridge

# plot
df_test.plot(label='TSLA', figsize=(16,8), title='Adjusted Closing Price', grid=True)

In [None]:

# plot all testing data
# NOTE(review): this cell repeats the preceding test-plot cell statement for statement.
df_test = df.copy()
df_test.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1, inplace=True)
df_test = df_test.iloc[ind_split+window_size:]
# add in all methods
df_test['Adj Close Test Ridge'] = y_pred_ridge
# plot
df_test.plot(label='TSLA', figsize=(16,8), title='Adjusted Closing Price', grid=True)

In [None]:
# Scale the price series to [-1, 1]; the scaler is fitted on a single column.
scaler = MinMaxScaler(feature_range=(-1,1))
data_transform = scaler.fit_transform(df['Adj Close'].values[:,None])
# Re-window the scaled series with the same index matrix used for the raw data.
data_transform = data_transform[indices]

num_days = 10 # predict next 10 days

# get the last window_size (32) days
data_seed = df['Adj Close'].values[-window_size:][None]
# get the normalized data as well for the neural network
# The scaler expects one feature column, so scale the seed as a column vector
# and restore its (1, window_size) layout afterwards.
data_seed_norm = scaler.transform(data_seed.reshape(-1, 1)).reshape(data_seed.shape)
input_values = {"ridge": data_seed}
values = {"ridge": []}
# Recursive forecast: each prediction is fed back in as the newest input.
for i in range(num_days):
  values["ridge"].append(ridge_model.predict(input_values["ridge"])[0])

  # dump the oldest price and put the newest price at the end
  for v in input_values:
    val = input_values[v]
    # Drop column 0 and append the new value as the last column.
    # (np.insert(val, -1, ...) would place it *before* the last column.)
    val = np.append(val[:, 1:], [[values[v][-1]]], axis=1)
    input_values[v] = val

# convert all to NumPy arrays
for v in input_values:
  values[v] = np.array(values[v])


In [None]:

# plot next 10 days
# Only timedelta is imported: importing the `datetime` class here would shadow the
# `datetime` module that earlier cells use as `datetime.datetime` / `datetime.timedelta`.
from datetime import timedelta
today = str(df.index[-1])
# The first 10 characters of the last index label are parsed as a YYYY-MM-DD date.
last_date = pd.to_datetime(today[:10], format='%Y-%m-%d')
df_forecast = pd.DataFrame()
df_forecast["Ridge"] = values["ridge"]

# The first forecast value is for the day *after* the last observation, so the index
# starts one day later. Calendar days are used; weekends are not skipped.
df_forecast.index = pd.date_range(start=last_date + timedelta(days=1), periods=num_days)
df_forecast.plot(label='TSLA', figsize=(16,8), title='Forecasted Adjusted Closing Price', grid=True)

<h2>Linear Regression</h2>

In [None]:
import pandas as pd
import datetime
import pandas_datareader.data as web
from pandas import Series, DataFrame
import pandas as pd
%matplotlib inline
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams

# Default figure size for the plots in this section.
rcParams['figure.figsize'] = 20,10

from sklearn.preprocessing import MinMaxScaler
# Rebinds `scaler` to a new, unfitted [0, 1] scaler; it is not used in the cells visible below.
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Reload the CHGG CSV; rebinds the global `data`.
# NOTE(review): same absolute, machine-specific path as the DATAPATH constant defined earlier.
data = pd.read_csv("/home/jack/Desktop/CHGG.csv")
# Not the last expression in the cell, so this result is not displayed.
data.tail()

#setting index as date
data['Date'] = pd.to_datetime(data.Date,format='%Y-%m-%d')
data.index = data['Date']

#plot
plt.figure(figsize=(16,8))
plt.plot(data['Close'], label='Close Price history')

In [None]:
#get nrow for training/testing
nrow = len(data)
nrow

In [None]:
#sorting
# Order rows by the date index, oldest first.
data = data.sort_index(ascending=True, axis=0)

#creating a separate dataset
# Keep only Date and Close under a fresh 0..n-1 integer index. This replaces the
# row-by-row chained assignment (new_data['Date'][i] = ...), which is slow and
# relies on integer keys into a date-indexed Series.
new_data = data[['Date', 'Close']].reset_index(drop=True)

new_data.head()

In [None]:
# Turn each date into its proleptic Gregorian ordinal so the regression gets a
# numeric time axis. The previous `lambda x: 1` made the only feature constant,
# so the fitted line could not depend on the date at all.
# The dtype guard keeps the cell safe to re-run once the column is already numeric.
if not pd.api.types.is_integer_dtype(new_data['Date']):
    new_data['Date'] = pd.to_datetime(new_data['Date']).apply(lambda x: x.toordinal())

In [None]:
#split into train and validation
# The first two thirds of the rows (rounded) train the model ...
train_nbr = nrow*(2/3)
train_nbr = int(round(train_nbr))
# .copy() so later column assignments do not write into a slice of new_data
train = new_data[:train_nbr].copy()
# ... and validation starts exactly where training ends. The original first set
# valid_nbr to nrow/3 and then overwrote it with this same value.
valid_nbr = train_nbr
valid = new_data[valid_nbr:].copy()

# Features are every column except 'Close'; the target is 'Close'.
x_train = train.drop('Close', axis=1)
y_train = train['Close']
x_valid = valid.drop('Close', axis=1)
y_valid = valid['Close']


In [None]:
# Last rows of the validation split.
valid.tail()

In [None]:
#implement linear regression
from sklearn.linear_model import LinearRegression
# Ordinary least squares fitted on the training split.
model = LinearRegression()
model.fit(x_train,y_train)

In [None]:
#make predictions and find the rmse
preds = model.predict(x_valid)
# Root of the mean squared difference between actual and predicted closes.
rms=np.sqrt(np.mean(np.power((np.array(y_valid)-np.array(preds)),2)))
print("RMSE for Linear Regression:", rms)

In [None]:
#plot

# The zero placeholder is overwritten on the very next line.
valid['Predictions'] = 0
valid['Predictions'] = preds

# Re-assigns each split the index labels of the matching slice of new_data.
valid.index = new_data[valid_nbr:].index
train.index = new_data[:train_nbr].index


# Training closes, then validation closes together with their predictions.
plt.plot(train['Close'])
plt.plot(valid[['Close', 'Predictions']])

<h2>Support Vector Regression (SVR)</h2>

In [None]:
import quandl
import numpy as np 
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

In [None]:
# Get the stock data
df = quandl.get("WIKI/AMZN")
# Take a look at the data
print(df.head())

In [None]:
# Get the Adjusted Close Price 
df = df[['Adj. Close']] 
# Take a look at the new data 
print(df.head())

In [None]:
# A variable for predicting 'n' days out into the future
forecast_out = 30 #'n=30' days
#Create another column (the target ) shifted 'n' units up
df['Prediction'] = df[['Adj. Close']].shift(-forecast_out)
#print the new data set
print(df.tail())

In [None]:
### Create the independent data set (X)  #######
# Convert the dataframe to a numpy array
# `axis` is passed by keyword: the positional form drop(labels, 1) is rejected by pandas 2.x.
X = np.array(df.drop(['Prediction'], axis=1))

#Remove the last '30' rows
# (those rows have no target yet)
X = X[:-forecast_out]
print(X)

In [None]:
### Create the dependent data set (y)  #####
# Convert the dataframe to a numpy array 
y = np.array(df['Prediction'])
# Get all of the y values except the last '30' rows
# (the trailing rows are the NaN targets created by the shift)
y = y[:-forecast_out]
print(y)

In [None]:
# Split the data into 80% training and 20% testing
# No random_state is passed, so the split and the scores below vary between runs.
# Rebinds x_train / y_train from the linear-regression section.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Create and train the Support Vector Machine (Regressor) 
# RBF kernel with C=1000 and gamma=0.1
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1) 
svr_rbf.fit(x_train, y_train)

In [None]:
# Create and train the Linear Regression  Model
lr = LinearRegression()
# Train the model
# (same training split as the SVR above)
lr.fit(x_train, y_train)

In [None]:
# Testing Model: Score returns the coefficient of determination R^2 of the prediction. 
# The best possible score is 1.0
# Scores the linear regression on the held-out 20%.
lr_confidence = lr.score(x_test, y_test)
print("lr confidence: ", lr_confidence)

In [None]:
# Testing Model: Score returns the coefficient of determination R^2 of the prediction.
# The best possible score is 1.0
# This cell used to repeat the linear-regression score, leaving the SVR unscored.
# It now scores the SVR on the same held-out 20%.
svm_confidence = svr_rbf.score(x_test, y_test)
print("svm confidence: ", svm_confidence)

In [None]:
# Set x_forecast equal to the last 30 rows of the original data set from Adj. Close column
# `axis` is passed by keyword: the positional form drop(labels, 1) is rejected by pandas 2.x.
x_forecast = np.array(df.drop(['Prediction'], axis=1))[-forecast_out:]
print(x_forecast)

In [None]:
# Print linear regression model predictions for the next '30' days
# (one prediction per row of x_forecast)
lr_prediction = lr.predict(x_forecast)
print(lr_prediction)
# Print support vector regressor model predictions for the next '30' days
svm_prediction = svr_rbf.predict(x_forecast)
print(svm_prediction)

In [None]:
# Plot the linear-regression forecast against its position (0..29), not against dates.
# The SVR forecast is not plotted.
plt.plot(lr_prediction)
plt.xlabel("Next 30 Days")
plt.ylabel("AMZN Price")
plt.title("Linear Regression Prediction")