## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Importing the dataset over here

In [2]:
# Read the Starbucks dataset; the relative path is resolved against the kernel's working directory.
csv_path = "Starbucks Dataset.csv"
data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to check the column layout.
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1992-06-26,0.328125,0.347656,0.320313,0.335938,0.260703,224358400
1,1992-06-29,0.339844,0.367188,0.332031,0.359375,0.278891,58732800
2,1992-06-30,0.367188,0.371094,0.34375,0.347656,0.269797,34777600
3,1992-07-01,0.351563,0.359375,0.339844,0.355469,0.27586,18316800
4,1992-07-02,0.359375,0.359375,0.347656,0.355469,0.27586,13996800


## Checking for duplicate observations

In [4]:
# Mark each row that repeats an earlier row in every column, then count the marks.
duplicate_mask = data.duplicated()
duplicate_mask.sum()

0

## Checking for missing values

In [5]:
# Count the missing entries in each column.
data.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

## Filtering all the numerical features over here

In [6]:
# Collect the columns whose dtype is genuinely numeric. Testing only for
# "not the object dtype" would also report datetime, categorical or boolean
# columns as numerical, so the selection is made positively instead.
numerical_features = data.select_dtypes(include='number').columns.tolist()
for feature in numerical_features:
  print(feature)

Open
High
Low
Close
Adj Close
Volume


## Filtering all the categorical features over here

In [9]:
# Columns stored with the generic object dtype are treated as categorical.
cat_features = [
  column_name
  for column_name, column_dtype in data.dtypes.items()
  if column_dtype == 'O'
]
for feature in cat_features:
  print(feature)

Date


## Encoding the categorical features as integer codes

In [10]:
# Replace every categorical value with an integer code. Codes are handed out
# in order of first appearance within the column (0, 1, 2, ...).
# NOTE(review): for the Date column this yields a running row counter rather
# than a calendar-aware value — confirm that is the intended encoding.
for feature in cat_features:
  categories = data[feature].unique()
  code_by_category = dict(zip(categories, range(len(categories))))
  data[feature] = data[feature].map(code_by_category)

In [11]:
# Show the frame after encoding: Date now holds integer codes instead of date strings.
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,0.328125,0.347656,0.320313,0.335938,0.260703,224358400
1,1,0.339844,0.367188,0.332031,0.359375,0.278891,58732800
2,2,0.367188,0.371094,0.343750,0.347656,0.269797,34777600
3,3,0.351563,0.359375,0.339844,0.355469,0.275860,18316800
4,4,0.359375,0.359375,0.347656,0.355469,0.275860,13996800
...,...,...,...,...,...,...,...
8031,8031,75.269997,78.000000,74.919998,77.849998,77.849998,14436500
8032,8032,77.680000,78.320000,76.709999,77.540001,77.540001,11183800
8033,8033,77.559998,78.220001,77.500000,77.720001,77.720001,8916600
8034,8034,77.699997,81.019997,77.440002,80.720001,80.720001,22063400


## Creating features and labels over here

In [12]:
# Copy the High prices into a new HIGH column. A newly assigned column is
# appended at the end of the frame, which is where the label is sliced from.
data["HIGH"] = data["High"]

In [13]:
# Remove the original High column in place so the target is not also present as a feature.
data.drop(columns=["High"], inplace=True)

In [14]:
# Every column except the last one is a feature; the last column is the label.
# NOTE(review): the features include the same row's Open/Low/Close prices —
# confirm that predicting High from them is the intended task.
feature_block = data.iloc[:, :-1]
label_column = data.iloc[:, -1]
X = feature_block.values
y = label_column.values

## Splitting the dataset into a training set and a testing set so that overfitting can be detected on held-out data

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): rows are shuffled before splitting (the default), so test rows are
# interleaved with training rows — confirm this is acceptable for dated price data.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Training the model on the training dataset over here

In [18]:
from sklearn.ensemble import RandomForestRegressor

# A forest of 10 trees; the fixed random_state makes repeated fits build the same model.
regressor = RandomForestRegressor(
  n_estimators=10,
  random_state=0,
)
regressor.fit(X=X_train, y=y_train)

## Evaluating the performance of the model on the testing dataset

In [19]:
# Predict on the held-out test rows and print the predictions (left column)
# next to the true values (right column).
y_pred = regressor.predict(X_test)
# Scope the two-decimal formatting to this print only: np.set_printoptions
# would permanently change how every later cell prints arrays.
with np.printoptions(precision=2):
  print(np.column_stack((y_pred, y_test)))

[[54.19 54.15]
 [11.49 11.51]
 [98.25 98.64]
 ...
 [ 0.36  0.37]
 [ 8.1   8.09]
 [44.13 44.6 ]]


## Measuring the goodness of fit of the model on the test set using the R² score

In [20]:
from sklearn.metrics import r2_score

# Coefficient of determination of the test-set predictions against the true values.
r2_score(y_true=y_test, y_pred=y_pred)

0.9999360333993439