## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence warnings for the rest of the session: no category or module is given,
# so the 'ignore' action applies to every warning, including deprecation notices.
warnings.filterwarnings('ignore')

## Importing the dataset over here

In [2]:
# Read the price history from HMC.csv (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="HMC.csv")

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-01-02,28.6,28.65,28.459999,28.639999,26.480253,262800
1,2020-01-03,28.25,28.379999,28.08,28.129999,26.008713,663600
2,2020-01-06,27.719999,28.059999,27.719999,28.049999,25.934746,463000
3,2020-01-07,28.389999,28.389999,28.18,28.209999,26.082678,341800
4,2020-01-08,27.99,28.219999,27.99,28.129999,26.008713,264200


## Checking for duplicate observations

In [4]:
# Flag rows that repeat an earlier row in full, then count them.
duplicate_flags = data.duplicated()
duplicate_flags.sum()

0

## Checking for missing values

In [5]:
# Collect the names of columns that contain at least one missing value and list
# them one per line (nothing is printed when no column qualifies).
null_counts = data.isnull().sum()
missing_values = [column for column in data.columns if null_counts[column]]
for column in missing_values:
    print(column)

In [6]:
# Number of missing entries in each column.
data.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

## Filtering all the numerical features over here

In [7]:
# Names of the numeric columns, printed one per line.
# `select_dtypes(include="number")` keeps only numeric dtypes (the int and float
# columns here). The earlier `dtype != "O"` test treated every non-object column
# as numerical, so a boolean, datetime or categorical column would have been
# listed as well.
numerical_feature = data.select_dtypes(include="number").columns.tolist()
for column in numerical_feature:
    print(column)

Open
High
Low
Close
Adj Close
Volume


## Filtering all the categorical features over here

In [8]:
# Names of the columns stored with the generic object dtype ("O"), which is how
# this notebook identifies categorical/text features; printed one per line.
column_dtypes = data.dtypes
cat_features = column_dtypes[column_dtypes == "O"].index.tolist()
for column in cat_features:
    print(column)

Date


## Encoding the categorical feature as numerical values, if present

In [9]:
# Replace each distinct 'Date' value with an integer code assigned in order of
# first appearance (first distinct date -> 0, the next -> 1, ...).
date_mapping = {
    date_label: position
    for position, date_label in enumerate(data['Date'].unique())
}
data['Date'] = data['Date'].map(date_mapping)

In [10]:
# Display the frame to inspect it after the 'Date' column was replaced by integer codes.
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,28.600000,28.650000,28.459999,28.639999,26.480253,262800
1,1,28.250000,28.379999,28.080000,28.129999,26.008713,663600
2,2,27.719999,28.059999,27.719999,28.049999,25.934746,463000
3,3,28.389999,28.389999,28.180000,28.209999,26.082678,341800
4,4,27.990000,28.219999,27.990000,28.129999,26.008713,264200
...,...,...,...,...,...,...,...
1084,1084,34.700001,34.700001,34.330002,34.410000,34.410000,699300
1085,1085,33.549999,33.730000,33.310001,33.650002,33.650002,1147200
1086,1086,33.639999,33.900002,33.630001,33.849998,33.849998,1287500
1087,1087,33.990002,34.110001,33.950001,34.099998,34.099998,888000


In [11]:
# Copy 'Close' into a column named 'CLOSE'; a newly added column is placed last.
data = data.assign(CLOSE=data['Close'])

In [12]:
# Remove the original 'Close' column now that its values live in 'CLOSE'.
data = data.drop(columns=["Close"])

## Creating the features and labels over here

In [13]:
# Features are every column but the last; the label is the last column
# ('CLOSE', which was appended last by the preceding cells).
# NOTE(review): the features still include 'Adj Close' and the same-day
# Open/High/Low, which are closely tied to the target — confirm this is intended.
feature_frame = data.iloc[:, :-1]
target_series = data.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()

## Splitting the dataset into training and testing sets so the model is evaluated on unseen data (to detect overfitting)

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
# NOTE(review): train_test_split shuffles rows by default, so the test rows are
# not the most recent dates — confirm that is intended for this date-ordered data.
split_arrays = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_arrays

## Training the model on the Training dataset over here

In [15]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 10 trees with a fixed seed, fitted on the training split.
forest_settings = {"n_estimators": 10, "random_state": 0}
regressor = RandomForestRegressor(**forest_settings)
regressor.fit(X_train, y_train)

## Evaluating the performance of the model on the testing dataset over here

In [16]:
# Predict on the held-out rows and print the predictions (left column) beside
# the actual values (right column).
y_pred = regressor.predict(X_test)
# Two-decimal printing; this changes NumPy's print options for later cells too.
np.set_printoptions(precision=2)
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

[[25.63 25.53]
 [24.58 24.64]
 [29.46 29.64]
 [30.6  30.46]
 [27.35 27.37]
 [22.84 22.87]
 [27.03 26.92]
 [31.84 31.79]
 [32.84 32.86]
 [37.24 37.23]
 [29.02 29.05]
 [21.88 22.  ]
 [26.72 26.63]
 [26.48 26.38]
 [24.21 24.35]
 [30.54 30.38]
 [21.94 21.78]
 [31.66 31.86]
 [33.33 33.26]
 [27.94 28.09]
 [30.98 30.85]
 [24.48 24.51]
 [32.23 32.41]
 [22.86 22.84]
 [25.94 25.99]
 [30.51 30.45]
 [29.57 29.7 ]
 [34.03 34.1 ]
 [25.92 25.8 ]
 [25.59 25.64]
 [26.59 26.8 ]
 [24.12 24.05]
 [24.97 24.86]
 [25.28 25.25]
 [28.24 28.24]
 [26.87 26.79]
 [27.05 27.05]
 [26.01 26.17]
 [30.32 30.2 ]
 [25.93 25.87]
 [25.37 25.46]
 [26.02 26.15]
 [28.27 28.2 ]
 [29.64 29.64]
 [22.57 22.64]
 [26.02 26.13]
 [31.62 31.63]
 [29.63 29.59]
 [31.41 31.32]
 [26.54 26.52]
 [25.32 25.39]
 [30.33 30.37]
 [28.44 28.4 ]
 [25.13 25.04]
 [33.06 33.2 ]
 [32.77 32.53]
 [30.61 30.6 ]
 [29.13 29.2 ]
 [22.06 22.04]
 [28.96 29.1 ]
 [30.37 30.43]
 [25.92 25.82]
 [27.47 27.41]
 [25.72 25.62]
 [25.72 25.57]
 [26.05 25.83]
 [25.45 25

## Calculating the R² score (coefficient of determination)

In [17]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.9985494087693207