## Importing the libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing the dataset over here

In [2]:
# Load the CSV file "zomato.csv" (resolved relative to the working directory) into the DataFrame `data`.
data=pd.read_csv("zomato.csv")

In [3]:
# Preview the first five rows to confirm the file was parsed as expected.
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2021-07-23,116.0,138.899994,115.0,126.0,126.0,694895290
1,2021-07-26,126.349998,143.75,125.300003,140.649994,140.649994,249723854
2,2021-07-27,141.699997,147.800003,127.75,132.899994,132.899994,240341900
3,2021-07-28,131.0,135.0,123.550003,131.199997,131.199997,159793731
4,2021-07-29,134.949997,144.0,132.199997,141.550003,141.550003,117973089


## Checking for missing values over here

In [4]:
# Count missing entries per column; this cell only inspects, it does not modify `data`.
data.isna().sum()

Unnamed: 0,0
Date,0
Open,0
High,0
Low,0
Close,0
Adj Close,0
Volume,0


## Checking for duplicate observations over here

In [5]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
data.duplicated(keep="first").sum()

0

## Checking the summary statistics of the dataset over here

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,631.0,631.0,631.0,631.0,631.0,631.0
mean,90.011252,91.835737,87.868304,89.707686,89.707686,67317370.0
std,32.757639,33.319545,31.900849,32.6219,32.6219,74610300.0
min,40.849998,44.400002,40.599998,41.650002,41.650002,0.0
25%,62.549999,63.450001,61.125,62.074998,62.074998,28007880.0
50%,80.0,81.0,78.099998,79.699997,79.699997,47597100.0
75%,124.474998,126.75,121.525002,124.599998,124.599998,75254390.0
max,161.149994,169.0,154.25,160.300003,160.300003,694895300.0


## Filtering all the numerical features over here

In [7]:
# Collect every column whose dtype is not the generic object dtype ("O").
numerical_features = []
for column in data.columns:
    if data[column].dtype != "O":
        numerical_features.append(column)

# Echo the selected column names, one per line.
for name in numerical_features:
    print(name)

Open
High
Low
Close
Adj Close
Volume


In [9]:
# Show only the numerical columns selected above.
data.loc[:, numerical_features]

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,116.000000,138.899994,115.000000,126.000000,126.000000,694895290
1,126.349998,143.750000,125.300003,140.649994,140.649994,249723854
2,141.699997,147.800003,127.750000,132.899994,132.899994,240341900
3,131.000000,135.000000,123.550003,131.199997,131.199997,159793731
4,134.949997,144.000000,132.199997,141.550003,141.550003,117973089
...,...,...,...,...,...,...
626,141.000000,143.500000,138.550003,140.550003,140.550003,70252449
627,141.800003,145.000000,141.449997,143.800003,143.800003,78666454
628,145.000000,145.399994,138.250000,140.250000,140.250000,54189688
629,140.399994,141.800003,138.050003,139.949997,139.949997,46782951


## Filtering all the categorical features over here

In [10]:
# Collect every column stored with the generic object dtype ("O").
cat_features = []
for column in data.columns:
    if data[column].dtype == "O":
        cat_features.append(column)

# Echo the selected column names, one per line.
for name in cat_features:
    print(name)

Date


In [11]:
# Show only the object-dtype columns selected above.
data.loc[:, cat_features]

Unnamed: 0,Date
0,2021-07-23
1,2021-07-26
2,2021-07-27
3,2021-07-28
4,2021-07-29
...,...
626,2024-02-01
627,2024-02-02
628,2024-02-05
629,2024-02-06


## Converting the above categorical features into numerical features (integer codes) over here

In [None]:
# Replace each object-dtype column with integer codes: every distinct value is
# numbered by the order in which it first appears in the column.
# NOTE(review): for "Date" the codes equal chronological order only if the rows
# are already sorted by date — confirm against the source file.
for column in cat_features:
    distinct_values = data[column].unique()
    codes_by_value = dict(zip(distinct_values, range(len(distinct_values))))
    data[column] = data[column].map(codes_by_value)

In [14]:
# Display the frame after encoding to confirm that "Date" now holds integer codes.
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,116.000000,138.899994,115.000000,126.000000,126.000000,694895290
1,1,126.349998,143.750000,125.300003,140.649994,140.649994,249723854
2,2,141.699997,147.800003,127.750000,132.899994,132.899994,240341900
3,3,131.000000,135.000000,123.550003,131.199997,131.199997,159793731
4,4,134.949997,144.000000,132.199997,141.550003,141.550003,117973089
...,...,...,...,...,...,...,...
626,626,141.000000,143.500000,138.550003,140.550003,140.550003,70252449
627,627,141.800003,145.000000,141.449997,143.800003,143.800003,78666454
628,628,145.000000,145.399994,138.250000,140.250000,140.250000,54189688
629,629,140.399994,141.800003,138.050003,139.949997,139.949997,46782951


## Creating the features and labels over here

In [15]:
# Move the target column to the last position (under the name "OPEN") so the
# positional slicing in the next cell can take "all columns but the last" as
# features and "the last column" as the label.
# The guard makes the cell idempotent: once "Open" has been moved, re-running
# the cell is a no-op instead of raising KeyError.
if "Open" in data.columns:
    # pop() removes the column in place and returns it; assigning it to a new
    # key appends it as the final column.
    data["OPEN"] = data.pop("Open")

In [16]:
# Features: every column except the last; label: the last column.
# NOTE(review): the feature columns include High, Low, Close and Adj Close from
# the same row as the label — confirm these are meant to be available when
# predicting the open price.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

## Splitting the dataset into training set and testing set so that the model can be evaluated on unseen data (to detect overfitting) over here

In [19]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state=0 makes the split repeatable.
# NOTE(review): the rows appear to be ordered by trading date, while the test rows
# printed further down jump between price levels, so the split does not look
# chronological — confirm that scoring on dates interleaved with training dates is intended.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model on the training dataset over here

In [20]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 100 trees; random_state=0 keeps the fitted model reproducible.
regressor=RandomForestRegressor(n_estimators=100,random_state=0)
# Fit on the training split only; the test split is used for evaluation in later cells.
regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing dataset over here

In [21]:
# Predict the held-out rows.
y_pred = regressor.predict(X_test)

# Print predictions (left column) next to the true values (right column),
# rounded to two decimals for display. Note that set_printoptions changes
# NumPy's print precision for the rest of the session.
np.set_printoptions(precision=2)
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

[[ 50.95  50.8 ]
 [ 64.23  63.45]
 [ 62.49  62.9 ]
 [ 53.77  53.75]
 [ 75.53  75.45]
 [103.85 103.8 ]
 [ 54.92  56.2 ]
 [ 56.63  58.4 ]
 [ 78.22  77.05]
 [ 84.61  88.1 ]
 [ 94.62  92.75]
 [ 70.8   71.  ]
 [ 62.81  62.75]
 [ 73.45  74.4 ]
 [138.62 137.3 ]
 [ 64.28  64.  ]
 [ 65.73  68.  ]
 [ 55.52  55.  ]
 [ 53.22  53.6 ]
 [ 78.96  79.  ]
 [ 50.16  50.7 ]
 [138.95 139.1 ]
 [104.49 105.6 ]
 [138.39 142.  ]
 [136.1  133.05]
 [ 93.39  93.4 ]
 [ 66.51  63.25]
 [ 84.36  84.4 ]
 [ 64.73  65.5 ]
 [ 63.2   62.1 ]
 [ 48.56  50.  ]
 [ 91.4   93.35]
 [130.97 129.  ]
 [ 59.18  56.4 ]
 [ 64.09  63.1 ]
 [ 71.13  71.55]
 [ 59.42  58.7 ]
 [ 97.14  98.3 ]
 [ 80.89  82.  ]
 [ 67.99  68.4 ]
 [ 59.93  58.1 ]
 [ 57.04  58.4 ]
 [133.93 132.8 ]
 [ 52.39  51.2 ]
 [137.66 137.3 ]
 [ 53.66  53.4 ]
 [ 74.76  75.55]
 [137.12 136.75]
 [ 81.82  81.3 ]
 [ 67.16  67.3 ]
 [ 57.68  53.95]
 [140.12 138.95]
 [ 63.26  63.  ]
 [ 59.26  59.  ]
 [ 56.9   56.1 ]
 [ 48.49  47.9 ]
 [154.09 158.45]
 [133.78 134.5 ]
 [ 83.56  82.5

## Finding the R² (R-squared) score over here

In [22]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out rows.
r2_score(y_true=y_test, y_pred=y_pred)

0.9960583830637018