## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

## Importing the dataset over here

In [2]:
data=pd.read_csv("EA_stock_price.csv")

In [3]:
data.head()

Unnamed: 0,Date,Open Price,High Price,Low Price,Close Price,Volume
0,1999-11-01,80.63,84.53,80.63,82.31,2452700.0
1,1999-11-02,82.44,82.44,78.63,79.25,823500.0
2,1999-11-03,79.06,80.5,77.13,77.44,941800.0
3,1999-11-04,77.56,79.5,77.5,79.25,1264400.0
4,1999-11-05,80.38,82.0,79.38,81.88,2016200.0


## Taking care of missing values if present over here

In [4]:
data.isnull().sum()

Unnamed: 0,0
Date,0
Open Price,0
High Price,0
Low Price,0
Close Price,0
Volume,0


## Taking care of duplicate observations over here

In [5]:
data.duplicated().sum()

0

## Filtering all the numerical features over here

In [6]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!="O"]
for feature in numerical_features:
  print(feature)

Open Price
High Price
Low Price
Close Price
Volume


In [7]:
data[numerical_features].head()

Unnamed: 0,Open Price,High Price,Low Price,Close Price,Volume
0,80.63,84.53,80.63,82.31,2452700.0
1,82.44,82.44,78.63,79.25,823500.0
2,79.06,80.5,77.13,77.44,941800.0
3,77.56,79.5,77.5,79.25,1264400.0
4,80.38,82.0,79.38,81.88,2016200.0


## Filtering all the categorical features over here

In [8]:
cat_features=[feature for feature in data.columns if data[feature].dtype=="O"]
for feature in cat_features:
  print(feature)

Date


In [9]:
data[cat_features].head()

Unnamed: 0,Date
0,1999-11-01
1,1999-11-02
2,1999-11-03
3,1999-11-04
4,1999-11-05


## Encoding the categorical features into numerical features over here

In [10]:
for feature in cat_features:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [11]:
data

Unnamed: 0,Date,Open Price,High Price,Low Price,Close Price,Volume
0,0,80.63,84.530,80.630,82.31,2452700.0
1,1,82.44,82.440,78.630,79.25,823500.0
2,2,79.06,80.500,77.130,77.44,941800.0
3,3,77.56,79.500,77.500,79.25,1264400.0
4,4,80.38,82.000,79.380,81.88,2016200.0
...,...,...,...,...,...,...
6253,6253,144.44,145.160,143.110,143.80,1205649.0
6254,6254,144.04,144.140,140.825,143.70,1803503.0
6255,6255,144.25,145.130,143.140,144.75,1207084.0
6256,6256,145.22,146.444,144.520,145.83,1449061.0


## Creating the features and labels over here

In [18]:
data['OPEN']=data['Open Price']
data.drop("Open Price",axis=1,inplace=True)

In [19]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

In [21]:
X.shape

(6258, 5)

In [22]:
y.shape

(6258,)

## Splitting the dataset into training set and testing set to avoid the problem of overfitting over here

In [23]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model on the training dataset over here

In [24]:
from sklearn.ensemble import RandomForestRegressor
regressor=RandomForestRegressor()
regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing set over here

In [26]:
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[102.31 102.11]
 [118.68 117.87]
 [ 92.09  89.73]
 ...
 [ 19.22  19.63]
 [ 15.46  15.39]
 [127.04 127.83]]


## Printing the R2 Score over here

In [27]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.9993870017490664