## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the dataset over here

In [2]:
# Load the dataset; the relative path is resolved against the kernel's working directory.
dataset_path="Samsung Dataset.csv"
data=pd.read_csv(dataset_path)

In [3]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2000-01-04,6000.0,6110.0,5660.0,6110.0,4449.709961,74195000
1,2000-01-05,5800.0,6060.0,5520.0,5580.0,4063.72876,74680000
2,2000-01-06,5750.0,5780.0,5580.0,5620.0,4092.859863,54390000
3,2000-01-07,5560.0,5670.0,5360.0,5540.0,4034.598877,40305000
4,2000-01-10,5600.0,5770.0,5580.0,5770.0,4202.098145,46880000


## Checking for duplicate observations

In [4]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
data.duplicated(keep="first").sum()

0

## Checking for missing values

In [5]:
# Number of missing values per column.
data.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

## Filtering all the numerical features over here

In [6]:
# Names of the columns whose dtype is numeric.
# select_dtypes is used instead of testing `dtype != 'O'`: "not object" would also
# classify datetime, boolean, categorical and pandas string-dtype columns as
# numerical, which is not what the downstream cells expect.
numerical_features=data.select_dtypes(include="number").columns.tolist()
for feature in numerical_features:
  print(feature)

Open
High
Low
Close
Adj Close
Volume


In [7]:
# Show only the numerical columns (all rows, label-based selection).
data.loc[:,numerical_features]

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,6000.0,6110.0,5660.0,6110.0,4449.709961,74195000
1,5800.0,6060.0,5520.0,5580.0,4063.728760,74680000
2,5750.0,5780.0,5580.0,5620.0,4092.859863,54390000
3,5560.0,5670.0,5360.0,5540.0,4034.598877,40305000
4,5600.0,5770.0,5580.0,5770.0,4202.098145,46880000
...,...,...,...,...,...,...
6122,78400.0,78600.0,77100.0,77300.0,77300.000000,21189349
6123,76100.0,76600.0,75600.0,75700.0,75700.000000,14598755
6124,75900.0,76000.0,75100.0,75200.0,75200.000000,16971175
6125,75300.0,77100.0,75200.0,76500.0,76500.000000,19246725


## Filtering all the categorical features over here

In [8]:
# Names of the columns that hold text-like (object or string dtype) values.
# Checking through pandas.api.types instead of `dtype == 'O'` keeps the result
# correct when pandas infers a dedicated string dtype for text columns rather
# than the generic object dtype.
cat_features=[
  feature for feature in data.columns
  if pd.api.types.is_object_dtype(data[feature])
  or pd.api.types.is_string_dtype(data[feature])
]
for feature in cat_features:
  print(feature)

Date


In [9]:
# Show only the categorical columns (all rows, label-based selection).
data.loc[:,cat_features]

Unnamed: 0,Date
0,2000-01-04
1,2000-01-05
2,2000-01-06
3,2000-01-07
4,2000-01-10
...,...
6122,2024-06-07
6123,2024-06-10
6124,2024-06-11
6125,2024-06-12


## Encoding the categorical features as integer codes (numbered in order of first appearance)

In [10]:
# Replace every categorical value by an integer code: the first distinct value
# seen in the column becomes 0, the next one 1, and so on.
for feature in cat_features:
  distinct_values=data[feature].unique()
  code_by_value=dict(zip(distinct_values,range(len(distinct_values))))
  data[feature]=data[feature].map(code_by_value)

In [11]:
# Display the frame after the encoding step to confirm the categorical columns now hold integer codes.
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,6000.0,6110.0,5660.0,6110.0,4449.709961,74195000
1,1,5800.0,6060.0,5520.0,5580.0,4063.728760,74680000
2,2,5750.0,5780.0,5580.0,5620.0,4092.859863,54390000
3,3,5560.0,5670.0,5360.0,5540.0,4034.598877,40305000
4,4,5600.0,5770.0,5580.0,5770.0,4202.098145,46880000
...,...,...,...,...,...,...,...
6122,6122,78400.0,78600.0,77100.0,77300.0,77300.000000,21189349
6123,6123,76100.0,76600.0,75600.0,75700.0,75700.000000,14598755
6124,6124,75900.0,76000.0,75100.0,75200.0,75200.000000,16971175
6125,6125,75300.0,77100.0,75200.0,76500.0,76500.000000,19246725


## Creating the features and labels over here

In [12]:
# Rename the target column "Open" to "OPEN" and move it to the last position.
# The guard makes the cell safe to re-run: once "Open" has been moved there is
# nothing left to do, whereas an unconditional drop would raise KeyError.
if "Open" in data.columns:
  data=data.assign(OPEN=data["Open"]).drop(columns="Open")

In [13]:
# Features: every column except the target; label: the "OPEN" column.
# Selecting by name rather than by position keeps X and y correct even if the
# column order changes, and fails loudly if the target column is missing.
# NOTE(review): X holds the same row's High/Low/Close/Adj Close values as the
# OPEN label it predicts -- confirm that same-day features are intended.
X=data.drop(columns="OPEN").values
y=data["OPEN"].values

## Splitting the dataset into a training set and a testing set so that overfitting can be detected on held-out data

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
# NOTE(review): rows are shuffled before splitting, so test rows are interleaved
# in time with training rows -- confirm this is acceptable for this dataset.
split_parts=train_test_split(X,y,test_size=0.2,random_state=0)
X_train,X_test,y_train,y_test=split_parts

## Training the model on the training dataset over here

In [17]:
from sklearn.ensemble import RandomForestRegressor
# Ten-tree random forest with a fixed seed, fitted on the training split.
forest_params={"n_estimators":10,"random_state":0}
regressor=RandomForestRegressor(**forest_params).fit(X_train,y_train)
regressor

## Evaluating the performance of the model on the testing dataset over here

In [18]:
# Predict on the held-out rows and print the predictions next to the true
# values (left column: predicted, right column: actual).
y_pred=regressor.predict(X_test)
# Print floats with two decimals; this is a global numpy print setting.
np.set_printoptions(precision=2)
predicted_vs_actual=np.column_stack((y_pred,y_test))
print(predicted_vs_actual)

[[15354. 15420.]
 [76980. 77100.]
 [79160. 79600.]
 ...
 [15214. 15260.]
 [11922. 11980.]
 [25670. 25640.]]


## Computing the R² score between the actual and the predicted values

In [19]:
from sklearn.metrics import r2_score
# Coefficient of determination (R²) of the predictions on the test split.
r2_score(y_true=y_test,y_pred=y_pred)

0.9998230948879085