In [1]:
#Predicting Stock Prices Using Regression Analysis
# Standard data imports
import pandas as pd
import numpy as np
# Visualization 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Scikit-learn metrics, functions and models
from sklearn import metrics




# Load the raw Google stock data from the CSV file in the working directory.
df = pd.read_csv('GOOG.csv')
# Print an overview of the raw data set. A bare `df.head()` only renders when
# it is the last statement of a notebook cell, so print it explicitly.
print(df.head())

# Report the shape of the data as (rows, columns).
print(df.shape)

# Drop the variables we do not need. `columns=` already selects the axis, so
# the extra `axis=1` argument was redundant and has been removed.
df = df.drop(columns=[
    'symbol', 'adjClose', 'adjHigh', 'adjLow', 'adjOpen', 'adjVolume',
    'divCash', 'splitFactor',
])
# Print the data set again to review the remaining variables.
print(df.head())





# Are there any duplicate rows? `duplicated()` already yields one boolean per
# row, so `.any()` answers the question directly (no intermediate `.sum()`).
# The results are printed because a bare expression is silently discarded
# unless it is the last statement of a notebook cell.
print('Duplicate rows present:', df.duplicated().any())

# Check whether any value in the data set is null (True/False).
print('Null values present:', df.isnull().values.any())

# Review the DataFrame information (`info()` prints its own report).
df.info()

# Descriptive statistics overview for the data.
print(df.describe())
 
# Correlations between the variables (values closest to 1 indicate the
# strongest positive relationship).
# The frame still holds the 'date' column, which read_csv loaded as text (no
# date parsing was requested). pandas >= 2.0 raises on non-numeric columns in
# `DataFrame.corr()`, so restrict the frame to numeric columns first.
# Computing the matrix once also lets the heat map reuse it.
correlations = df.select_dtypes(include='number').corr()
print(correlations)

# Visualize the correlations as an annotated heat map.
plt.figure(figsize=(16, 8))
sns.heatmap(correlations, cmap="Blues", annot=True)
plt.show()
 
# Visual overview of the pairwise relationships between the variables.
sns.pairplot(df)
plt.show()

# Histogram of each variable. `Series.hist()` draws on the current figure's
# current axes, so the original back-to-back calls drew into the pair-plot
# figure and piled all five distributions onto the same axes.
# `DataFrame.hist()` opens its own figure with one subplot per column instead.
df[['open', 'high', 'low', 'close', 'volume']].hist(figsize=(12, 8))
plt.tight_layout()
plt.show()

# Box plots of the four price variables, one per subplot.
f, axes = plt.subplots(1, 4)
for ax, column in zip(axes, ['open', 'high', 'low', 'close']):
    sns.boxplot(y=column, data=df, ax=ax)
plt.tight_layout()
# Flush the figure so later plotting calls cannot draw onto these axes.
plt.show()





# Google stock price candlestick chart (open / high / low / close) by date.
import plotly.graph_objects as go

# Build the single candlestick trace from the price columns.
candles = go.Candlestick(
    x=df["date"],
    open=df["open"],
    high=df["high"],
    low=df["low"],
    close=df["close"],
)
figure = go.Figure(data=[candles])
# Title the chart and hide the range slider on the x-axis.
figure.update_layout(
    title="Google Stock Price Analysis",
    xaxis_rangeslider_visible=False,
)
figure.show()
 
# Separate the predictors from the target.
feature_columns = ['open', 'high', 'low', 'volume']
target_column = 'close'

X = df[feature_columns].values  # independent variables
y = df[target_column].values  # dependent variable

# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Report the shape of the training and testing feature matrices.
for label, split in (('Train:', X_train), ('Test:', X_test)):
    print(label, split.shape)




# Train the linear regression model.
from sklearn.linear_model import LinearRegression
# NOTE(review): confusion_matrix / accuracy_score are classification metrics
# and are not used by this regression workflow; the import is kept as-is.
from sklearn.metrics import confusion_matrix, accuracy_score
import statsmodels.api as sm

# Create the regression model.
regressor = LinearRegression()
# Fit it on the training data; `fit` returns the fitted estimator itself,
# so `model` and `regressor` refer to the same object.
model = regressor.fit(X_train, y_train)

# Predict the closing price for the held-out test features. The original ran
# the identical prediction twice (as `y_pred` and as `predicted`); it is
# computed once here and both names are kept for the code that follows.
y_pred = regressor.predict(X_test)
predicted = y_pred

# Shape of the prediction array (printed, since a bare expression is discarded
# unless it is the last statement of a notebook cell).
print(predicted.shape)





# Validate the fit: print the learned coefficients and the intercept.
print("Model Coefficients:", regressor.coef_)
print("Model intercept:", regressor.intercept_)

# Table of actual prices vs. predicted values.
# The original also built `pd.DataFrame(y_test, predicted)`, which passes the
# predictions as the *index* rather than as a second column. That frame was
# never used in this script, so it has been removed.
dfr = pd.DataFrame({'Actual_Price': y_test, 'Predicted_Price': predicted})
print(dfr)

# Summary statistics for the actual and the predicted prices (printed, since a
# bare expression is discarded unless it is the last statement of a cell).
print(dfr.describe())
 
# Normality of the residuals: a residual is the actual test value minus the
# predicted value.
residual = y_test - predicted
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# scaling is its documented replacement. A fresh figure keeps the plot off
# whichever axes an earlier plotting call left active.
plt.figure()
sns.histplot(residual, kde=True, stat="density")
plt.show()
 
# Right-tailed (upper-tailed) p-value for a standard-normal test statistic.
import scipy.stats

# NOTE(review): 1.67 is a hard-coded statistic; nothing in this script derives
# it from the fitted model - confirm where it comes from.
z_score = abs(1.67)
# The survival function gives the upper-tail probability of the standard normal.
p_value = scipy.stats.norm.sf(z_score)
print('p value is : ' + str(p_value))
 
# Print the OLS regression results.
# statsmodels' OLS does not add an intercept on its own, and the original fitted
# it on the test split. Fit on the training split with an explicit constant so
# the summary describes an intercept model estimated on the training data, like
# the scikit-learn regressor fitted earlier in this script.
results3 = sm.OLS(y_train, sm.add_constant(X_train)).fit()
# Printed explicitly: a bare `summary()` is discarded unless it is the last
# statement of a notebook cell.
print(results3.summary())
 
# Score the fitted regressor on the test split.
from sklearn.metrics import confusion_matrix, accuracy_score

regression_confidence = regressor.score(X_test, y_test)
print("Linear regression confidence: ", regression_confidence)

# Error metrics - the closer to zero, the better.
import math
mean_abs_error = metrics.mean_absolute_error(y_test, predicted)
mean_sq_error = metrics.mean_squared_error(y_test, predicted)
print('Mean Absolute Error:', mean_abs_error)
print('Mean Squared Error:', mean_sq_error)
print('Root Mean Squared Error:', math.sqrt(mean_sq_error))

# "Accuracy" here is 100 minus the mean absolute percentage error.
abs_error = abs(predicted - y_test)
pct_error = 100 * (abs_error / y_test)
accuracy = 100 - np.mean(pct_error)
print('Accuracy:', round(accuracy, 2), '%.')
 
# Scatter plot of actual vs. predicted prices. Start a fresh figure so the
# points are not drawn onto whichever axes an earlier plot left active.
plt.figure()
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price, color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()

# Bar chart of the first 10 actual / predicted pairs.
graph = dfr.head(10)
graph.plot(kind='bar')
# Without an explicit show() this last chart is never displayed when the file
# is run as a plain script.
plt.show()
 
#End of code…
 


ModuleNotFoundError: No module named 'seaborn'