## PEPSI STOCK PRICE ANALYSIS AND PREDICTION

In [36]:
pip install yfinance

In [37]:
pip install chart_studio

#### Importing the required libraries

In [38]:
# Editor convenience only: turns on IPython's "greedy" tab completion; has no effect on the analysis
%config IPCompleter.greedy=True

In [40]:
# --- Data handling and general utilities ---
import pandas as pd
import numpy as np
import math
from datetime import date
# --- Plotting: matplotlib/seaborn plus cufflinks/plotly (used through DataFrame.iplot) ---
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise plotly for notebook output and put cufflinks in offline mode
init_notebook_mode(connected=True)
cf.go_offline()
# --- Modelling: scaling, the LSTM network, and the normal distribution for the Q-Q plot ---
# NOTE(review): py, go, __version__, download_plotlyjs, plot, iplot and metrics are not
# referenced by name in the cells visible here — confirm before pruning.
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM
from scipy.stats import norm


# Render matplotlib figures inline in the notebook
%matplotlib inline

#### Downloading the dataset from Yahoo Finance

In [41]:
from pandas_datareader import data as pdr
import yfinance as yf

# Let yfinance take over pandas-datareader's Yahoo download function
yf.pdr_override()

# Download parameters: PepsiCo ticker, fixed start date, and "today" as the end date
# (so the number of rows depends on the day the notebook is run)
ticker = "PEP"
start_date = "2012-01-01"
end_date = date.today().isoformat()

df = pdr.get_data_yahoo(ticker, start=start_date, end=end_date)

In [42]:
# Write the downloaded prices to disk in CSV format (relative to the working directory).
# NOTE(review): the file name has no ".csv" extension — confirm whether anything reads it by this exact name.
df.to_csv('Pepsi stock price data')

#### First observations of the dataset

In [43]:
# First 10 rows, with cells shaded by value using the "Reds" colour map
df.head(10).style.background_gradient(cmap="Reds")

In [44]:
# Last 10 rows, with cells shaded by value using the "Blues" colour map
df.tail(10).style.background_gradient(cmap="Blues")

In [45]:
# Index range, column dtypes and non-null counts — used to check for missing values
df.info()

This dataset contains Pepsi stock price values from "3rd Jan, 2012" till date. It contains 10 years of Pepsi stock price data with 2649 entries and no missing data.

#### Summary of statistics performed on the dataset

In [46]:
# Summary statistics for every numeric column, shaded with the "Greens" colour map
df.describe().style.background_gradient(cmap="Greens")

In [47]:
# Row(s) on which the daily high equals its maximum over the whole period
highest_high = df['High'].max()
df.loc[df['High'] == highest_high]

The highest value of Pepsi stock between Jan 2012 and July 2022 was on 28th April, 2022, with an opening price of 174.5 and a closing price of 177.5.

In [48]:
# Row(s) on which the daily low equals its minimum over the whole period
lowest_low = df['Low'].min()
df.loc[df['Low'] == lowest_low]

The lowest value of Pepsi stock between Jan 2012 and July 2022 was on 7th March, 2012, with an opening price of 62.3 and a closing price of 62.6.

### Data Visualization

In [49]:
# Set the seaborn style once; it is a global setting that the matplotlib charts in later cells pick up.
sns.set_style('darkgrid')

# One colour per plotted column: green for the first, red for the second
cm = ['Green', 'red']

# iplot draws its own interactive plotly figure, so no matplotlib figure is created here
# (a plt.figure() call would only produce an empty output).
df[['High', 'Low']].iplot(kind='line', title="High vs Low values of Pepsi stocks",
                          colors=cm, xaxis_title='Date')

df[['Open', 'Close']].iplot(kind='line', title="Opening vs Closing values of pepsi stocks",
                            colors=cm, xaxis_title='Date')

In [50]:
# 100-row rolling mean of the closing price, stored as a new column on df
close_ma100 = df['Close'].rolling(window=100).mean()
df['MA100'] = close_ma100

In [51]:
# Interactive line chart of the closing price against its 100-row moving average (the MA100 column)
df[['Close', 'MA100']].iplot(kind='line', title= "Close price vs 100days moving average", xaxis_title='Date')

##### CREATING THE DAILY RETURNS COLUMN

In [52]:
# Day-over-day fractional change of the adjusted close; the first row has no
# previous value, so its missing return is replaced with 0.
adj_close_change = df['Adj Close'].pct_change(periods=1)
df['Daily returns'] = adj_close_change.fillna(0)

In [53]:
# Quick look at the first rows to confirm the newly added columns
df.head()

#### ANALYSIS ON THE DAILY RETURNS
##### Probability distribution of the returns

In [54]:
# Histogram of the daily returns (10 bins)
fig = plt.figure(figsize=(10,5))
sns.set_style('darkgrid')
# The plotted series is the daily return, so the legend label says so
plt.hist(df['Daily returns'], bins=10, label="Daily returns")
plt.xlabel("Daily return")
plt.ylabel("Number of trading days")
plt.title("Distribution of Pepsi daily returns")
plt.legend()
plt.show()

In [55]:
# Box plot of the daily returns
fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(x=df['Daily returns'])
plt.show()

In [56]:
# Summary statistics (count, mean, std, min, quartiles, max) of the daily returns
df['Daily returns'].describe()

##### best day of stock

In [57]:
# Row(s) with the largest daily return
best_return = df['Daily returns'].max()
df.loc[df['Daily returns'] == best_return]

The highest daily return was on 17th March, 2020, with a return of 0.129.

##### worst day of stock

In [58]:
# Row(s) with the smallest (most negative) daily return
worst_return = df['Daily returns'].min()
df.loc[df['Daily returns'] == worst_return]

The lowest daily return was on 20th March, 2020, with a daily return of -0.11.

#### Plotting daily returns over time

In [59]:
# Daily returns over time
fig = plt.figure(figsize=(10,5))
sns.set_style('darkgrid')
df['Daily returns'].plot()
plt.xlabel("Date", fontsize=15)
# The series holds fractional changes from pct_change (e.g. 0.129), not percentages,
# so the axis is labelled accordingly.
plt.ylabel("Daily return (fraction)", fontsize=15)
plt.title("Pepsi Stock Daily Returns", fontsize=15)
plt.show()

#### Moving avg vs close price

In [60]:
# Closing price overlaid with its 30-row rolling mean, both drawn on one explicit Axes
fig, ax = plt.subplots(figsize=(10, 5))
close_ma30 = df['Close'].rolling(window=30).mean()
close_ma30.plot(ax=ax, label='30 Day Avg')
df['Close'].plot(ax=ax, label='CLOSE price')
ax.set_title("Comparison of the moving average & Close price", fontsize=17)
ax.legend(fontsize=10)

#### Are the Daily returns normally distributed

In [61]:
# Q-Q plot: empirical quantiles of the daily returns (x) against the quantiles of a
# normal distribution with the same mean and standard deviation (y).
fig = plt.figure(figsize=(10,7))
returns = df['Daily returns']

# Probability levels from 1% to 99%; the extreme tails are left out
t = np.linspace(0.01, 0.99, 1000)
q1 = np.quantile(returns, t)                                      # empirical quantiles
q2 = norm.ppf(t, loc=np.mean(returns), scale=np.std(returns))    # fitted-normal quantiles

plt.plot(q1, q2, label="Daily returns")

# If the returns were normal the curve would lie on y = x, so the reference line is the
# identity line over the common range. A line joining the curve's own endpoints would
# always touch the curve at both ends and hide tail deviations.
axis_min = min(q1.min(), q2.min())
axis_max = max(q1.max(), q2.max())
plt.plot([axis_min, axis_max], [axis_min, axis_max], label="Normal reference (y = x)")

# Same limits on both axes so the identity line is the true diagonal
plt.xlim((axis_min, axis_max))
plt.ylim((axis_min, axis_max))
plt.xlabel("Daily returns", fontsize=17)
plt.ylabel("Normal distribution", fontsize=17)
plt.legend()
plt.show()

From the graph above, the straight line represents the normal distribution, while the daily returns are the blue line, which isn't straight. This indicates that the daily returns are not normally distributed.

### Training the dataset

In [62]:
# Copy of df ordered by its index in ascending order (sort_index defaults: axis=0, ascending=True)
data = df.sort_index()

In [63]:
# Builds the scaled training windows for the LSTM.
# Outputs used by later cells: dataset, scaler, scaled_data, training_data_len,
# x_train_data2 (samples, 60, 1) and y_train_data1 (samples,).

# 1. Keep only the closing price, taken from the index-sorted frame `data`
#    so the rows line up with the train/validation split made from `data` later on
close_data = data.filter(['Close'])

# 2. Convert to a 2-D array with one column
dataset = close_data.values

# 3. Training split: the first 70% of the rows
training_data_len = math.ceil(len(dataset) * 0.7)

# 4. Scale to [0, 1]. The scaler is fitted on the training rows only, so the min/max
#    of the held-out period does not leak into training; the whole series is then
#    transformed with those training-period parameters (later values may exceed 1).
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(dataset[:training_data_len])
scaled_data = scaler.transform(dataset)
train_data = scaled_data[0:training_data_len, :]

# 5. Sliding windows: each sample is the previous 60 scaled closes, the target is the next one
window_size = 60
if len(train_data) <= window_size:
    raise ValueError(
        f"Need more than {window_size} training rows to build windows, got {len(train_data)}"
    )
x_train_data = [train_data[i - window_size:i, 0] for i in range(window_size, len(train_data))]
y_train_data = [train_data[i, 0] for i in range(window_size, len(train_data))]

# 6. Convert to numpy arrays once, after all windows are collected
x_train_data1, y_train_data1 = np.array(x_train_data), np.array(y_train_data)

# 7. Add a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1)
x_train_data2 = np.reshape(x_train_data1, (x_train_data1.shape[0], x_train_data1.shape[1], 1))

#### Building the LSTM model

In [64]:
# Two stacked 50-unit LSTM layers followed by a 25-unit dense layer and a single-unit output.
# The first LSTM returns the full sequence so the second one can consume it.
timesteps = x_train_data2.shape[1]
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(timesteps, 1)),
    LSTM(units=50, return_sequences=False),
    Dense(units=25),
    Dense(units=1),
])

In [74]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

#### Compiling and Training the model

In [65]:
# Minimise mean squared error with the Adam optimiser
model.compile(optimizer='adam', loss='mean_squared_error')

# 100 passes over the training windows in mini-batches of 32; verbose=2 logs one line per epoch
model.fit(
    x=x_train_data2,
    y=y_train_data1,
    batch_size=32,
    epochs=100,
    verbose=2,
)

#### Testing the model

In [66]:
# Each test sample needs the 60 scaled closes before it, so the test slice starts
# 60 rows before the end of the training split.
lookback = 60
test_data = scaled_data[training_data_len - lookback:, :]

# Ground truth for the test period, in original (unscaled) price units
y_test = dataset[training_data_len:, :]

# One window of `lookback` values per test day, then add the trailing feature axis
x_test = np.array([test_data[stop - lookback:stop, 0]
                   for stop in range(lookback, len(test_data))])
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)

# Predict in scaled space and map the result back to price units
predictions = scaler.inverse_transform(model.predict(x_test))

In [71]:
# Number of test windows (first dimension of x_test)
len(x_test)

In [67]:
# Split the sorted frame at the same row as the training split
train = data[:training_data_len]
# .copy() gives an independent frame, so adding a column does not write to a slice of
# `data` (which triggers SettingWithCopyWarning and may not persist).
valid = data[training_data_len:].copy()

# predictions is 2-D with a single column; flatten it to one value per row
valid['Predictions'] = np.ravel(predictions)

# Training closes, then actual vs predicted closes over the validation period
fig = plt.figure(figsize=(15, 10))
plt.title('Model')
plt.xlabel('Date')
plt.ylabel('Close')

plt.plot(train['Close'])
plt.plot(valid[['Close', 'Predictions']])

# Legend entries follow the plotting order above
plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')

plt.show()

In [68]:
# Root-mean-squared error between predicted and actual closes, in price units
prediction_error = predictions - y_test
rmse = np.sqrt(np.mean(np.square(prediction_error)))
print(rmse)

In [69]:
# First 10 validation rows: actual close next to the model's prediction
valid[['Close', 'Predictions']].head(10)

In [70]:
# Save the trained model to the working directory.
# NOTE(review): the .h5 extension selects Keras's HDF5 format, which newer Keras versions
# treat as legacy in favour of ".keras" — confirm before changing, since other code may load this path.
model.save('Stock_pred_model.h5')