## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing the dataset over here

In [2]:
# Load the Zomato dataset; the path is resolved relative to the current working directory.
csv_path="Zomato Dataset.csv"
data=pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to check how the file was parsed.
data.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,23-07-2021,116.0,138.899994,115.0,126.0,126.0,694895290
1,26-07-2021,126.349998,143.75,125.300003,140.649994,140.649994,249723854
2,27-07-2021,141.699997,147.800003,127.75,132.899994,132.899994,240341900
3,28-07-2021,131.0,135.0,123.550003,131.199997,131.199997,159793731
4,29-07-2021,134.949997,144.0,132.199997,141.550003,141.550003,117973089


## Checking for duplicate observations

In [4]:
# Flag rows that exactly repeat an earlier row, then count them.
is_repeat=data.duplicated()
is_repeat.sum()

0

## Checking for missing values

In [5]:
# List every column that contains at least one missing value.
# The check is "any null" rather than "more than one null", so a column with
# exactly one missing entry is reported too.
missing_values=[feature for feature in data.columns if data[feature].isnull().any()]
for feature in missing_values:
  print(feature)

## Filtering all the numerical features over here

In [6]:
# Collect the columns whose dtype is not object ('O') and print each name.
numerical_features=[]
for column_name,column_dtype in data.dtypes.items():
  if column_dtype!='O':
    numerical_features.append(column_name)
    print(column_name)

Open
High
Low
Close
Adj Close
Volume


## Filtering all the categorical features over here

In [7]:
# Collect the columns stored with the object dtype ('O') and print each name.
cat_features=[column_name for column_name,column_dtype in data.dtypes.items() if column_dtype=='O']
for column_name in cat_features:
  print(column_name)

Date


## Encoding the categorical variables into a numerical representation so that the machine learning model can be trained on them

In [8]:
# Replace each categorical column with integer codes: every distinct value is
# numbered in order of first appearance (0, 1, 2, ...).
for feature in cat_features:
  distinct_values=data[feature].unique()
  feature_mapping=dict(zip(distinct_values,range(len(distinct_values))))
  data[feature]=data[feature].map(feature_mapping)

In [9]:
# Display the frame after encoding: the Date column now holds integer codes.
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,116.000000,138.899994,115.000000,126.000000,126.000000,694895290
1,1,126.349998,143.750000,125.300003,140.649994,140.649994,249723854
2,2,141.699997,147.800003,127.750000,132.899994,132.899994,240341900
3,3,131.000000,135.000000,123.550003,131.199997,131.199997,159793731
4,4,134.949997,144.000000,132.199997,141.550003,141.550003,117973089
...,...,...,...,...,...,...,...
693,693,188.800003,193.399994,183.149994,192.000000,192.000000,44772997
694,694,192.899994,195.850006,189.000000,195.199997,195.199997,35478593
695,695,195.149994,196.000000,193.100006,194.850006,194.850006,20841351
696,696,194.399994,196.500000,188.750000,189.100006,189.100006,33453174


## Creating the features and labels over here

In [10]:
# Copy High under the new name HIGH. A newly added column is appended at the
# end of the frame, which is where the positional feature/label split expects the label.
high_prices=data['High']
data['HIGH']=high_prices

In [11]:
# Remove the original High column in place; its values live on in HIGH.
data.drop(columns=["High"],inplace=True)

In [12]:
# Every column except the last is a feature; the last column is the label.
# NOTE(review): the features appear to include the same day's Open, Low, Close and
# Adj Close, which bracket that day's High, so a near-perfect score is expected.
# Confirm this is the intended prediction task.
feature_frame,label_series=data.iloc[:,:-1],data.iloc[:,-1]
X=feature_frame.values
y=label_series.values

## Splitting the dataset into a training set and a testing set so that the model is evaluated on data it has not seen (this is what reveals overfitting)

In [15]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# NOTE(review): train_test_split shuffles by default, so test rows are interleaved
# in time with training rows. Confirm that is acceptable for date-ordered data.
split_parts=train_test_split(X,y,test_size=0.2,random_state=0)
X_train,X_test,y_train,y_test=split_parts

## Training the model on the training dataset over here

In [16]:
from sklearn.ensemble import RandomForestRegressor
# Fit a 10-tree random forest with a fixed seed on the training split.
forest_params={"n_estimators":10,"random_state":0}
regressor=RandomForestRegressor(**forest_params)
regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing dataset over here

In [17]:
# Predict on the held-out rows and print predictions (left column) beside the
# true values (right column), rounded to two decimals for display.
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
side_by_side=np.column_stack((y_pred,y_test))
print(side_by_side)

[[124.34 125.55]
 [135.05 137.3 ]
 [ 65.11  66.85]
 [156.09 158.45]
 [ 52.78  52.85]
 [107.95 109.25]
 [137.58 137.4 ]
 [167.25 164.4 ]
 [189.08 189.25]
 [118.41 119.2 ]
 [ 99.01  99.3 ]
 [102.59 100.6 ]
 [ 63.57  63.2 ]
 [157.73 153.75]
 [146.68 144.  ]
 [ 52.15  52.3 ]
 [ 99.9   98.85]
 [168.84 173.5 ]
 [ 63.74  63.65]
 [125.3  126.35]
 [ 84.39  83.95]
 [ 71.93  72.7 ]
 [ 85.8   85.6 ]
 [120.87 120.5 ]
 [ 51.72  51.8 ]
 [ 94.19  93.25]
 [127.28 125.6 ]
 [146.04 146.85]
 [ 79.14  80.35]
 [ 64.77  64.9 ]
 [ 74.76  74.85]
 [140.84 143.75]
 [141.53 139.55]
 [115.01 115.75]
 [ 85.76  84.6 ]
 [ 55.97  56.5 ]
 [ 85.66  84.4 ]
 [ 54.    53.2 ]
 [ 62.94  63.2 ]
 [134.96 135.1 ]
 [101.28 100.1 ]
 [ 79.12  82.  ]
 [ 83.77  86.  ]
 [ 66.77  68.25]
 [122.18 123.9 ]
 [193.71 196.5 ]
 [ 63.78  64.3 ]
 [ 64.92  64.9 ]
 [ 86.87  85.7 ]
 [ 66.93  68.2 ]
 [159.22 162.25]
 [102.36 102.5 ]
 [ 65.02  64.8 ]
 [ 68.69  69.25]
 [ 56.99  57.25]
 [142.2  141.  ]
 [101.08 100.4 ]
 [ 64.96  65.3 ]
 [117.89 117.8

In [18]:
from sklearn.metrics import r2_score
# Coefficient of determination of the predictions on the test split.
r2_score(y_true=y_test,y_pred=y_pred)

0.9986735935570963