## Importing the essential libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Importing the dataset

In [2]:
# Read AMZN.csv (path relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='AMZN.csv')

In [3]:
# Preview the first five rows to check the columns and value formats.
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1997-05-15,2.4375,2.5,1.927083,1.958333,1.958333,72156000
1,1997-05-16,1.96875,1.979167,1.708333,1.729167,1.729167,14700000
2,1997-05-19,1.760417,1.770833,1.625,1.708333,1.708333,6106800
3,1997-05-20,1.729167,1.75,1.635417,1.635417,1.635417,5467200
4,1997-05-21,1.635417,1.645833,1.375,1.427083,1.427083,18853200


## Checking for duplicate observations

In [4]:
# Count the rows that exactly repeat an earlier row (all columns equal).
data.duplicated(keep='first').sum()

0

## Checking for missing values

In [6]:
# Count the missing values in each column.
data.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

## Filtering all the numerical features

In [7]:
# Treat every column whose dtype is not `object` as numerical, then list them.
numerical_features = [
  column for column, dtype in data.dtypes.items() if dtype != 'O'
]
for name in numerical_features:
  print(name)

Open
High
Low
Close
Adj Close
Volume


In [8]:
# Display only the numerical columns.
data.loc[:, numerical_features]

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,2.437500,2.500000,1.927083,1.958333,1.958333,72156000
1,1.968750,1.979167,1.708333,1.729167,1.729167,14700000
2,1.760417,1.770833,1.625000,1.708333,1.708333,6106800
3,1.729167,1.750000,1.635417,1.635417,1.635417,5467200
4,1.635417,1.645833,1.375000,1.427083,1.427083,18853200
...,...,...,...,...,...,...
6252,3136.260010,3231.879883,3120.219971,3225.010010,3225.010010,5141300
6253,3222.419922,3261.679932,3191.060059,3229.830078,3229.830078,3326900
6254,3236.110107,3323.340088,3233.979980,3297.780029,3297.780029,3204300
6255,3274.100098,3327.399902,3253.739990,3268.159912,3268.159912,2790600


## Filtering all the categorical features

In [9]:
# Treat every column whose dtype is `object` as categorical, then list them.
cat_features = [
  column for column, dtype in data.dtypes.items() if dtype == 'O'
]
for name in cat_features:
  print(name)

Date


In [10]:
# Display only the categorical columns.
data.loc[:, cat_features]

Unnamed: 0,Date
0,1997-05-15
1,1997-05-16
2,1997-05-19
3,1997-05-20
4,1997-05-21
...,...
6252,2022-03-18
6253,2022-03-21
6254,2022-03-22
6255,2022-03-23


## Encoding the categorical features

In [11]:
# Replace each categorical value with an integer code. Codes are handed out in
# order of first appearance, so the first distinct value becomes 0, the next 1, ...
for column in cat_features:
  distinct_values = data[column].unique()
  code_by_category = dict(zip(distinct_values, range(len(distinct_values))))
  data[column] = data[column].map(code_by_category)

In [12]:
# Display the frame after encoding; `Date` now holds integer codes.
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,2.437500,2.500000,1.927083,1.958333,1.958333,72156000
1,1,1.968750,1.979167,1.708333,1.729167,1.729167,14700000
2,2,1.760417,1.770833,1.625000,1.708333,1.708333,6106800
3,3,1.729167,1.750000,1.635417,1.635417,1.635417,5467200
4,4,1.635417,1.645833,1.375000,1.427083,1.427083,18853200
...,...,...,...,...,...,...,...
6252,6252,3136.260010,3231.879883,3120.219971,3225.010010,3225.010010,5141300
6253,6253,3222.419922,3261.679932,3191.060059,3229.830078,3229.830078,3326900
6254,6254,3236.110107,3323.340088,3233.979980,3297.780029,3297.780029,3204300
6255,6255,3274.100098,3327.399902,3253.739990,3268.159912,3268.159912,2790600


## Creating the features (X) and labels (y, the opening price)

In [13]:
# Move the target column to the end of the frame under the name `OPEN`.
# A new frame is built instead of dropping in place, and the step is guarded on
# the column still being present, so the cell can be re-run safely: without the
# guard a second execution raises KeyError because `Open` is already gone.
if 'Open' in data.columns:
  data = data.drop(columns='Open').assign(OPEN=data['Open'])

In [14]:
# Select the target by name rather than by position, so X and y stay correct
# even if the column order of `data` changes.
# NOTE(review): the features include the same day's High, Low, Close and
# Adj Close. Those are presumably only known after the open, so a model fitted
# on them is not a forecasting setup - confirm that this is intended.
X = data.drop(columns='OPEN').to_numpy()
y = data['OPEN'].to_numpy()

## Splitting the dataset into a training set and a testing set so that overfitting can be detected on held-out data

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): train_test_split shuffles the rows by default, so test days end
# up interleaved with training days. For time-ordered data a chronological split
# (shuffle=False) is usually the fairer evaluation - confirm which is wanted.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=0
)

## Training the model on the training dataset

In [16]:
from sklearn.ensemble import RandomForestRegressor

# A forest of 10 trees; the fixed seed keeps the fitted model repeatable.
regressor = RandomForestRegressor(random_state=0, n_estimators=10)
regressor.fit(X=X_train, y=y_train)

## Evaluating the performance of the model on the testing dataset

In [17]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
# Print the predictions (left column) next to the true values (right column).
print(np.column_stack((y_pred, y_test)))

[[  15.83   16.2 ]
 [3123.34 3127.89]
 [ 957.56  960.11]
 ...
 [  32.99   33.15]
 [  82.92   83.24]
 [ 955.72  964.  ]]


## Checking the metrics of the model

In [18]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out rows.
r2_score(y_true=y_test, y_pred=y_pred)

0.9998241469686423