In [1]:
# Importing all necessary python libraries
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# Helper that prepares the dataset so it can be fitted with a Linear Regression model
def prepare_data(df,forecast_col,forecast_out,test_size):
    """Build train/test arrays for predicting ``forecast_col`` ``forecast_out`` rows ahead.

    The single feature is the standardized ``forecast_col`` value of a row and
    the label is the raw ``forecast_col`` value found ``forecast_out`` rows later.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is only read, never modified.
    forecast_col : str
        Column used both as the feature and as the prediction target.
    forecast_out : int
        Number of rows to look ahead; must satisfy ``1 <= forecast_out < len(df)``.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    list
        ``[X_train, X_test, Y_train, Y_test, X_lately]``. ``X_lately`` holds the
        scaled features of the last ``forecast_out`` rows, i.e. the rows that do
        not have a label yet and are meant to be fed to ``predict`` later.

    Raises
    ------
    ValueError
        If ``forecast_out`` is outside ``1 .. len(df) - 1``.
    """
    n_rows = len(df)
    # A horizon of 0 would make X[-0:] the whole array and X[:-0] empty, and a
    # horizon >= n_rows leaves no labelled rows at all; fail early and clearly.
    if not 1 <= forecast_out < n_rows:
        raise ValueError(
            "forecast_out must be between 1 and len(df) - 1 (%d), got %r"
            % (n_rows - 1, forecast_out)
        )

    # Label = value of the column forecast_out rows in the future; the last
    # forecast_out entries have no future value and are NaN after the shift.
    label = df[forecast_col].shift(-forecast_out)

    X = np.array(df[[forecast_col]])  # 2-D feature array with a single column
    # NOTE(review): the scaling statistics are computed over every row, including
    # the rows that later land in the test split and in X_lately.
    X = preprocessing.scale(X)

    X_lately = X[-forecast_out:]  # most recent rows, kept aside for forecasting
    X = X[:-forecast_out]         # rows that do have a label
    y = np.array(label)[:-forecast_out]

    # Filter X and y with the same mask so they always stay the same length,
    # even when the source column contains missing values in the middle.
    keep = label.notna().to_numpy()[:-forecast_out] & ~np.isnan(X).any(axis=1)
    X = X[keep]
    y = y[keep]

    # Hold-out split; random_state is fixed so the split is reproducible.
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_size, random_state=0)

    return [X_train, X_test, Y_train, Y_test, X_lately]

In [3]:
# Reading the data: load the GOOG price history from a CSV file
# (the relative path is resolved against the current working directory)
df = pd.read_csv('GOOG.csv')

In [4]:
# NOTE: this binds a second name to the same DataFrame object; no copy is made,
# so any in-place change through one name is visible through the other.
stock_data = df

In [5]:
# Preview the first rows to check the column names and value formats
stock_data.head()

Unnamed: 0,symbol,date,close,high,low,open,volume,adjClose,adjHigh,adjLow,adjOpen,adjVolume,divCash,splitFactor
0,GOOG,2016-06-14 00:00:00+00:00,718.27,722.47,713.12,716.48,1306065,718.27,722.47,713.12,716.48,1306065,0.0,1.0
1,GOOG,2016-06-15 00:00:00+00:00,718.92,722.98,717.31,719.0,1214517,718.92,722.98,717.31,719.0,1214517,0.0,1.0
2,GOOG,2016-06-16 00:00:00+00:00,710.36,716.65,703.26,714.91,1982471,710.36,716.65,703.26,714.91,1982471,0.0,1.0
3,GOOG,2016-06-17 00:00:00+00:00,691.72,708.82,688.4515,708.65,3402357,691.72,708.82,688.4515,708.65,3402357,0.0,1.0
4,GOOG,2016-06-20 00:00:00+00:00,693.71,702.48,693.41,698.77,2082538,693.71,702.48,693.41,698.77,2082538,0.0,1.0


In [6]:
# .size is the total number of cells (rows x columns), not the number of rows
stock_data.size

17612

In [7]:
# info is a method: without the parentheses the cell only shows the bound-method
# repr instead of the per-column dtype / non-null summary.
stock_data.info()

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
stock_data.describe()

Unnamed: 0,close,high,low,open,volume,adjClose,adjHigh,adjLow,adjOpen,adjVolume,divCash,splitFactor
count,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0
mean,1216.317067,1227.430934,1204.17643,1215.260779,1601590.0,1216.317067,1227.430936,1204.176436,1215.260779,1601590.0,0.0,1.0
std,383.333358,387.570872,378.777094,382.446995,696017.2,383.333358,387.570873,378.777099,382.446995,696017.2,0.0,0.0
min,668.26,672.3,663.284,671.0,346753.0,668.26,672.3,663.284,671.0,346753.0,0.0,1.0
25%,960.8025,968.7575,952.1825,959.005,1173522.0,960.8025,968.7575,952.1825,959.005,1173522.0,0.0,1.0
50%,1132.46,1143.935,1117.915,1131.15,1412588.0,1132.46,1143.935,1117.915,1131.15,1412588.0,0.0,1.0
75%,1360.595,1374.345,1348.5575,1361.075,1812156.0,1360.595,1374.345,1348.5575,1361.075,1812156.0,0.0,1.0
max,2521.6,2526.99,2498.29,2524.92,6207027.0,2521.6,2526.99,2498.29,2524.92,6207027.0,0.0,1.0


In [9]:
stock_data.describe().T #T = Transpose: one row per column, one column per statistic

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
close,1258.0,1216.317,383.333358,668.26,960.8025,1132.46,1360.595,2521.6
high,1258.0,1227.431,387.570872,672.3,968.7575,1143.935,1374.345,2526.99
low,1258.0,1204.176,378.777094,663.284,952.1825,1117.915,1348.557,2498.29
open,1258.0,1215.261,382.446995,671.0,959.005,1131.15,1361.075,2524.92
volume,1258.0,1601590.0,696017.226844,346753.0,1173522.0,1412588.5,1812156.0,6207027.0
adjClose,1258.0,1216.317,383.333358,668.26,960.8025,1132.46,1360.595,2521.6
adjHigh,1258.0,1227.431,387.570873,672.3,968.7575,1143.935,1374.345,2526.99
adjLow,1258.0,1204.176,378.777099,663.284,952.1825,1117.915,1348.557,2498.29
adjOpen,1258.0,1215.261,382.446995,671.0,959.005,1131.15,1361.075,2524.92
adjVolume,1258.0,1601590.0,696017.226844,346753.0,1173522.0,1412588.5,1812156.0,6207027.0


In [10]:
# Count the missing values in each column
stock_data.isna().sum()

Unnamed: 0,0
symbol,0
date,0
close,0
high,0
low,0
open,0
volume,0
adjClose,0
adjHigh,0
adjLow,0


In [11]:
import plotly.express as px

In [12]:
# Data visualization: traded volume per day as a line chart
fig = px.line(stock_data, x='date', y='volume', title='Daily Trading Volume')
# Axis titles and the dark theme are applied in a single layout update
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Volume',
    template='plotly_dark',
)
fig.show()

In [13]:
# Closing price per day as a line chart
fig = px.line(stock_data, x='date', y='close', title='Closing Prices Over Years')
# Axis titles and the dark theme are applied in a single layout update
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Closing Price',
    template='plotly_dark',
)
fig.show()

In [14]:
import plotly.graph_objects as go

In [15]:
# Start from a fresh figure. Adding the traces to the `fig` left over from the
# previous cell would draw them on top of the closing-price chart (duplicating
# the close line) and stack four more traces every time this cell is re-run.
fig = go.Figure()

# One line+marker trace per price column, labelled for the legend
for column, trace_name in [('open', 'Open'), ('high', 'High'), ('low', 'Low'), ('close', 'Close')]:
    fig.add_trace(go.Scatter(x=stock_data['date'], y=stock_data[column], mode='lines+markers', name=trace_name))

# The dark template is set explicitly so the chart keeps the look of the other figures
fig.update_layout(title='Stock Price Analysis',
                  xaxis_title='Date',
                  yaxis_title='Price',
                  template='plotly_dark')

fig.show()

In [16]:
# Day-over-day difference of the closing price (the first entry has no
# predecessor, so diff() leaves it as NaN)
daily_changes = stock_data['close'].diff()
fig = px.histogram(daily_changes, nbins=50, title='Histogram of Daily Price Changes')
# Axis titles and the dark theme are applied in a single layout update
fig.update_layout(
    xaxis_title='Daily Price Change',
    yaxis_title='Frequency',
    template='plotly_dark',
)
fig.show()

In [17]:
forecast_col = 'close' #name of the column we want to predict
forecast_out = 5 #how far ahead we want to predict, counted in rows of the dataset
test_size = 0.2 #fraction of the samples held out as the test set

In [18]:
# Split the data and fit the linear regression model.
# prepare_data does the scaling, the label shifting and the train/test split.
X_train, X_test, Y_train, Y_test, X_lately = prepare_data(
    df, forecast_col, forecast_out, test_size
)

# Create the estimator, then train it on the training portion
learner = LinearRegression()
learner.fit(X_train, Y_train)

In [19]:
# Evaluate on the held-out rows, then forecast from the most recent rows.
# NOTE(review): the split inside prepare_data uses train_test_split with its
# default shuffling, so test rows are interleaved in time with training rows;
# treat the score as optimistic for true out-of-sample forecasting.
score = learner.score(X_test, Y_test)   # R^2 of the model on the test split
forecast = learner.predict(X_lately)    # predictions for the rows without a label

# Collect both results in one dict and show it
response = {
    'test_score': score,
    'forecast_set': forecast,
}

print(response)

{'test_score': 0.9869925788439492, 'forecast_set': array([2487.99519278, 2504.95266337, 2513.60340165, 2544.15922575,
       2536.39885586])}
