## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing the dataset over here

In [2]:
# Read the CSV file from the notebook's working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="AAPL.csv")

In [3]:
# Preview the first five rows to check the columns and value formats.
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.100178,469033600
1,1980-12-15,0.12221,0.12221,0.121652,0.121652,0.094952,175884800
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.087983,105728000
3,1980-12-17,0.115513,0.116071,0.115513,0.115513,0.09016,86441600
4,1980-12-18,0.118862,0.11942,0.118862,0.118862,0.092774,73449600


## Checking for missing values

In [4]:
# Count the null entries in each column (summing the boolean mask down the rows).
data.isnull().sum(axis=0)

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

## Checking for duplicate rows

In [5]:
# Count rows that repeat an earlier row across all columns; the first
# occurrence of each row is not counted as a duplicate.
data.duplicated(subset=None, keep="first").sum()

0

## Filtering the numerical features over here

In [6]:
# Collect the names of the columns that hold numeric data.
# The dtype is tested positively with pandas' own numeric check instead of
# "anything that is not object": a column stored as datetime, category or
# pandas' dedicated string dtype is not object, but it is not numeric either
# and must not end up in this list.
numerical_features = [
    feature
    for feature in data.columns
    if pd.api.types.is_numeric_dtype(data[feature])
]

# List the selected column names, one per line.
for feature in numerical_features:
    print(feature)

Open
High
Low
Close
Adj Close
Volume


In [7]:
# Display only the numeric columns (all rows).
data.loc[:, numerical_features]

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,0.128348,0.128906,0.128348,0.128348,0.100178,469033600
1,0.122210,0.122210,0.121652,0.121652,0.094952,175884800
2,0.113281,0.113281,0.112723,0.112723,0.087983,105728000
3,0.115513,0.116071,0.115513,0.115513,0.090160,86441600
4,0.118862,0.119420,0.118862,0.118862,0.092774,73449600
...,...,...,...,...,...,...
10463,132.869995,135.199997,131.440002,131.880005,131.880005,122207100
10464,133.130005,133.889999,131.479996,132.759995,132.759995,84784300
10465,134.289993,137.339996,132.160004,135.429993,135.429993,91533000
10466,132.080002,132.389999,129.039993,130.059998,130.059998,108123900


## Filtering all the categorical features over here

In [8]:
# Collect the names of the columns that do NOT hold numeric data; these are
# the ones that need encoding before they can be given to a model.
# Comparing the dtype to 'O' only catches object columns, so text stored with
# pandas' dedicated string dtype (or a datetime/category column) would be
# missed and reach the model unencoded. Taking the complement of the numeric
# check covers every non-numeric column.
cat_features = [
    feature
    for feature in data.columns
    if not pd.api.types.is_numeric_dtype(data[feature])
]

# List the selected column names, one per line.
for cat_feature in cat_features:
    print(cat_feature)

Date


## Encoding the categorical features as numerical codes

In [9]:
# Replace every categorical column with integer codes: each distinct value is
# numbered 0, 1, 2, ... in the order it first appears in the column.
# NOTE(review): for Date the codes follow row order, so they are chronological
# only if the file is sorted by date - the preview suggests it is; confirm.
for feature in cat_features:
    distinct_values = data[feature].unique()
    feature_mapping = dict(zip(distinct_values, range(len(distinct_values))))
    data[feature] = data[feature].map(feature_mapping)

In [10]:
# Display the frame after the encoding step to confirm that Date now holds
# integer codes; pandas abbreviates the middle rows in the rendered output.
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,0.128348,0.128906,0.128348,0.128348,0.100178,469033600
1,1,0.122210,0.122210,0.121652,0.121652,0.094952,175884800
2,2,0.113281,0.113281,0.112723,0.112723,0.087983,105728000
3,3,0.115513,0.116071,0.115513,0.115513,0.090160,86441600
4,4,0.118862,0.119420,0.118862,0.118862,0.092774,73449600
...,...,...,...,...,...,...,...
10463,10463,132.869995,135.199997,131.440002,131.880005,131.880005,122207100
10464,10464,133.130005,133.889999,131.479996,132.759995,132.759995,84784300
10465,10465,134.289993,137.339996,132.160004,135.429993,135.429993,91533000
10466,10466,132.080002,132.389999,129.039993,130.059998,130.059998,108123900


In [11]:
# Move the target to the last column position under the name 'CLOSE':
# pop() removes 'Close' from the frame and returns it, and assigning it to a
# new column name appends it at the end. The next cell relies on the target
# being the final column.
data['CLOSE'] = data.pop('Close')

## Separating the features (X) from the label (y)

In [12]:
# Split the frame by position: every column except the last becomes the
# feature matrix X, and the last column becomes the target vector y.
# NOTE(review): the features include 'Adj Close', which equals Close in the
# most recent rows shown earlier in the notebook - confirm that using it as a
# predictor of the closing price is intended.
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

## Splitting the dataset into a training set and a testing set so that the model can be evaluated on data it was not trained on

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible from run to run.
# NOTE(review): train_test_split shuffles rows by default, so test days are
# interleaved with training days. For date-ordered prices, confirm a shuffled
# split (rather than a chronological one) is what is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

## Training the model on the training dataset over here

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Build a random forest of 10 trees with a fixed seed, then fit it on the
# training split.
regressor = RandomForestRegressor(
    random_state=0,
    n_estimators=10,
)
regressor.fit(X_train, y_train)

## Evaluating the model's performance on the testing set

In [15]:
# Predict the target for the held-out rows.
y_pred = regressor.predict(X_test)

# Show two decimals when printing arrays (this setting stays in effect for
# the rest of the session).
np.set_printoptions(precision=2)

# Print predictions (left column) beside the true values (right column).
print(np.column_stack((y_pred, y_test)))

[[  0.24   0.24]
 [ 54.02  53.87]
 [  5.94   5.94]
 ...
 [132.48 132.54]
 [  6.01   6.01]
 [  0.19   0.19]]


In [16]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.9999818633506238