## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Importing the dataset over here

In [2]:
# Load the price history from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='CVS_Health.csv')

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1973-02-22,1.65625,1.65625,1.65625,1.65625,1.65625,92800
1,1973-02-23,1.703125,1.703125,1.703125,1.703125,1.703125,400000
2,1973-02-26,1.671875,1.671875,1.671875,1.671875,1.671875,187200
3,1973-02-27,1.546875,1.546875,1.546875,1.546875,1.546875,657600
4,1973-02-28,1.65625,1.65625,1.65625,1.65625,1.65625,235200


## Checking for duplicate observations

In [4]:
# Flag rows that repeat an earlier row, then count the flags.
duplicate_mask = data.duplicated()
duplicate_mask.sum()

0

## Checking for missing values

In [5]:
# Per-column count of missing entries (isna is the same check as isnull).
data.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [6]:
# List every column that contains at least one missing value.
# The check is "any missing": the previous `> 1` threshold silently skipped
# columns that held exactly one NaN.
missing_values=[feature for feature in data.columns if data[feature].isnull().any()]
for feature in missing_values:
  print(feature)

## Filtering all the numerical features over here

In [7]:
# Collect the columns whose dtype is genuinely numeric.
# Testing `dtype != "O"` would also admit datetime, category and dedicated
# string-dtype columns, so ask pandas directly whether the dtype is numeric.
numerical_features=[feature for feature in data.columns if pd.api.types.is_numeric_dtype(data[feature])]
for feature in numerical_features:
  print(feature)

Open
High
Low
Close
Adj Close
Volume


In [8]:
# Show only the numerical columns (all rows; the display truncates long frames).
data.loc[:, numerical_features]

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,1.656250,1.656250,1.656250,1.656250,1.656250,92800
1,1.703125,1.703125,1.703125,1.703125,1.703125,400000
2,1.671875,1.671875,1.671875,1.671875,1.671875,187200
3,1.546875,1.546875,1.546875,1.546875,1.546875,657600
4,1.656250,1.656250,1.656250,1.656250,1.656250,235200
...,...,...,...,...,...,...
12941,60.000000,61.240002,59.840000,61.090000,61.090000,7791300
12942,61.299999,61.869999,60.869999,61.009998,61.009998,8908800
12943,60.810001,61.330002,60.400002,61.000000,61.000000,6899500
12944,61.119999,61.630001,60.480000,61.369999,61.369999,20869800


## Filtering all the categorical features

In [9]:
# Collect the columns that hold categorical / text data.
# Comparing against "O" alone misses `category` columns and columns stored with
# a dedicated string dtype, which would then reach the model unencoded.
cat_features = [
  feature for feature in data.columns
  if pd.api.types.is_object_dtype(data[feature])
  or pd.api.types.is_string_dtype(data[feature])
  or isinstance(data[feature].dtype, pd.CategoricalDtype)
]
for feature in cat_features:
  print(feature)

Date


In [10]:
# Show only the categorical columns.
data.loc[:, cat_features]

Unnamed: 0,Date
0,1973-02-22
1,1973-02-23
2,1973-02-26
3,1973-02-27
4,1973-02-28
...,...
12941,2024-06-17
12942,2024-06-18
12943,2024-06-20
12944,2024-06-21


## Encoding the categorical features into numerical features over here

In [11]:
# Replace each categorical value with an integer code: the position at which
# that value first appears in the column (0 for the first distinct value, ...).
for feature in cat_features:
  categories=data[feature].unique()
  feature_mapping=dict(zip(categories,range(len(categories))))
  data[feature]=data[feature].map(feature_mapping)

In [13]:
# Confirm the encoding by looking at the first two rows.
data.head(n=2)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,1.65625,1.65625,1.65625,1.65625,1.65625,92800
1,1,1.703125,1.703125,1.703125,1.703125,1.703125,400000


## Creating the features and labels over here

In [14]:
# Move the 'Open' column to the end of the frame under the name 'OPEN', so the
# positional slicing that follows can treat the last column as the label.
# The guard makes the cell safe to re-run: once 'Open' has been moved, running
# it again is a no-op instead of raising KeyError.
if 'Open' in data.columns:
  data['OPEN']=data.pop('Open')

In [15]:
# Features are every column except the last; the label is the last column.
feature_frame=data.iloc[:,:-1]
label_series=data.iloc[:,-1]
X=feature_frame.values
y=label_series.values

## Splitting the dataset into a training set and a testing set so that overfitting can be detected on unseen data

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the partition
# reproducible between runs.
# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows - confirm this is intended for
# date-ordered price data.
split_parts=train_test_split(X,y,test_size=0.2,random_state=0)
X_train,X_test,y_train,y_test=split_parts

## Training the model on the training dataset over here

In [17]:
from sklearn.ensemble import RandomForestRegressor

# Ten-tree random forest with a fixed seed so the fit is repeatable.
forest_params={'n_estimators':10,'random_state':0}
regressor=RandomForestRegressor(**forest_params)
regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing dataset over here

In [18]:
# Predict on the held-out rows and print predictions next to the true values
# (left column: predicted, right column: actual), rounded to two decimals.
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
predicted_column=y_pred.reshape(len(y_pred),1)
actual_column=y_test.reshape(len(y_test),1)
print(np.concatenate((predicted_column,actual_column),axis=1))

[[ 79.9   79.45]
 [ 45.53  45.67]
 [ 19.17  19.12]
 ...
 [ 17.27  17.38]
 [  8.25   8.22]
 [103.46 103.58]]


## Checking the R² score on the testing set

In [19]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the test set.
r2_score(y_true=y_test,y_pred=y_pred)

0.9998850400742683