## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing the dataset over here

In [2]:
# Load the raw ABNB daily price table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="abnb_stock_data.csv")

In [3]:
# Preview the first five rows to check how the columns were parsed.
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-12-10,146.0,165.0,141.25,144.710007,144.710007,70447500
1,2020-12-11,146.550003,151.5,135.100006,139.25,139.25,26980800
2,2020-12-14,135.0,135.300003,125.160004,130.0,130.0,16966100
3,2020-12-15,126.690002,127.599998,121.5,124.800003,124.800003,10914400
4,2020-12-16,125.830002,142.0,124.910004,137.990005,137.990005,20409600


## Taking care of duplicate observations if present over here

In [4]:
# Remove rows that are exact duplicates of an earlier row (the first occurrence is kept).
data = data.drop_duplicates()

## Taking care of missing values if present over here

In [5]:
# Count the missing entries in each column.
data.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [6]:
# Collect the name of every column that has at least one missing value.
# The comparison is "> 0": the previous "> 1" skipped columns with exactly one NaN.
missing_values = [feature for feature in data.columns if data[feature].isnull().sum() > 0]
for feature in missing_values:
    print(feature)

## Filtering all the numerical features here

In [7]:
# Keep the columns whose dtype is anything other than 'O' (object).
numerical_features = [
    column for column, dtype in data.dtypes.items() if dtype != 'O'
]
for feature in numerical_features:
    print(feature)

Open
High
Low
Close
Adj Close
Volume


In [8]:
# Show only the columns collected in numerical_features.
data.loc[:, numerical_features]

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,146.000000,165.000000,141.250000,144.710007,144.710007,70447500
1,146.550003,151.500000,135.100006,139.250000,139.250000,26980800
2,135.000000,135.300003,125.160004,130.000000,130.000000,16966100
3,126.690002,127.599998,121.500000,124.800003,124.800003,10914400
4,125.830002,142.000000,124.910004,137.990005,137.990005,20409600
...,...,...,...,...,...,...
804,153.270004,155.300003,150.824997,152.660004,152.660004,4851100
805,149.960007,150.179993,148.559998,149.270004,149.270004,5277900
806,150.000000,152.470001,149.229996,152.059998,152.059998,4623700
807,152.029999,154.899994,151.789993,153.429993,153.429993,4329100


## Filtering all the categorical features over here

In [9]:
# Keep the columns whose dtype is 'O' (object); these are treated as categorical.
cat_features = [
    column for column, dtype in data.dtypes.items() if dtype == 'O'
]
for feature in cat_features:
    print(feature)

Date


## Encoding the categorical feature over here

In [10]:
# Count how many rows share each Date value.
data.loc[:, 'Date'].value_counts()

Date
2020-12-10    1
2023-02-08    1
2023-01-25    1
2023-01-26    1
2023-01-27    1
             ..
2022-01-11    1
2022-01-12    1
2022-01-13    1
2022-01-14    1
2024-02-29    1
Name: count, Length: 809, dtype: int64

In [11]:
# Give each distinct Date value an integer code in order of first appearance.
Date_mapping = {
    date_value: code for code, date_value in enumerate(data['Date'].unique())
}
print(Date_mapping)

{'2020-12-10': 0, '2020-12-11': 1, '2020-12-14': 2, '2020-12-15': 3, '2020-12-16': 4, '2020-12-17': 5, '2020-12-18': 6, '2020-12-21': 7, '2020-12-22': 8, '2020-12-23': 9, '2020-12-24': 10, '2020-12-28': 11, '2020-12-29': 12, '2020-12-30': 13, '2020-12-31': 14, '2021-01-04': 15, '2021-01-05': 16, '2021-01-06': 17, '2021-01-07': 18, '2021-01-08': 19, '2021-01-11': 20, '2021-01-12': 21, '2021-01-13': 22, '2021-01-14': 23, '2021-01-15': 24, '2021-01-19': 25, '2021-01-20': 26, '2021-01-21': 27, '2021-01-22': 28, '2021-01-25': 29, '2021-01-26': 30, '2021-01-27': 31, '2021-01-28': 32, '2021-01-29': 33, '2021-02-01': 34, '2021-02-02': 35, '2021-02-03': 36, '2021-02-04': 37, '2021-02-05': 38, '2021-02-08': 39, '2021-02-09': 40, '2021-02-10': 41, '2021-02-11': 42, '2021-02-12': 43, '2021-02-16': 44, '2021-02-17': 45, '2021-02-18': 46, '2021-02-19': 47, '2021-02-22': 48, '2021-02-23': 49, '2021-02-24': 50, '2021-02-25': 51, '2021-02-26': 52, '2021-03-01': 53, '2021-03-02': 54, '2021-03-03': 55, '

In [12]:
# Replace each Date value with its integer code from Date_mapping.
# NOTE(review): running this cell a second time looks up the integer codes in a
# mapping keyed by the original date values, so every entry would become NaN.
date_codes = data['Date'].map(Date_mapping)
data['Date'] = date_codes

In [13]:
# Display the frame to check the Date column after the mapping step.
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,146.000000,165.000000,141.250000,144.710007,144.710007,70447500
1,1,146.550003,151.500000,135.100006,139.250000,139.250000,26980800
2,2,135.000000,135.300003,125.160004,130.000000,130.000000,16966100
3,3,126.690002,127.599998,121.500000,124.800003,124.800003,10914400
4,4,125.830002,142.000000,124.910004,137.990005,137.990005,20409600
...,...,...,...,...,...,...,...
804,804,153.270004,155.300003,150.824997,152.660004,152.660004,4851100
805,805,149.960007,150.179993,148.559998,149.270004,149.270004,5277900
806,806,150.000000,152.470001,149.229996,152.059998,152.059998,4623700
807,807,152.029999,154.899994,151.789993,153.429993,153.429993,4329100


In [14]:
# Append a copy of the Open column under the name OPEN as the right-most column.
data = data.assign(OPEN=data['Open'])

In [15]:
# Remove the original Open column from the frame.
data = data.drop(columns=["Open"])

In [16]:
# Display the frame to check the column layout after dropping Open.
data

Unnamed: 0,Date,High,Low,Close,Adj Close,Volume,OPEN
0,0,165.000000,141.250000,144.710007,144.710007,70447500,146.000000
1,1,151.500000,135.100006,139.250000,139.250000,26980800,146.550003
2,2,135.300003,125.160004,130.000000,130.000000,16966100,135.000000
3,3,127.599998,121.500000,124.800003,124.800003,10914400,126.690002
4,4,142.000000,124.910004,137.990005,137.990005,20409600,125.830002
...,...,...,...,...,...,...,...
804,804,155.300003,150.824997,152.660004,152.660004,4851100,153.270004
805,805,150.179993,148.559998,149.270004,149.270004,5277900,149.960007
806,806,152.470001,149.229996,152.059998,152.059998,4623700,150.000000
807,807,154.899994,151.789993,153.429993,153.429993,4329100,152.029999


## Creating the features and labels over here

In [17]:
# The last column is the regression target; every column before it is a feature.
feature_frame = data.iloc[:, :-1]
target_series = data.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()

## Splitting the dataset into a training set and a testing set over here so that overfitting can be detected on held-out data

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Assuming X_train, X_test, y_train, y_test are already defined

# Seed for the estimators that draw random numbers, so that re-running the
# notebook from a fresh kernel reproduces the same scores.
SEED = 0

# Initialize regressors (candidate models, keyed by a display name)
regressors = {
    "Random Forest": RandomForestRegressor(random_state=SEED),
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Support Vector Machine": SVR(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=SEED),
    "XGBoost": XGBRegressor(),
    "CatBoost": CatBoostRegressor(silent=True)  # silent=True to suppress CatBoost output
}

# Train and evaluate each regressor: fit on the training split, score by MSE
# on the held-out split, and record the score under the model's display name.
results = {}
for name, regressor in regressors.items():
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = mse

# Select the regressor with the lowest mean squared error (MSE)
# NOTE(review): the winner is chosen using the same test split that is then
# reported, so the printed MSE is an optimistic estimate; consider choosing the
# model with cross-validation on the training split instead.
best_regressor = min(results, key=results.get)
lowest_mse = results[best_regressor]

print("Best Regressor:", best_regressor)
print("Mean Squared Error:", lowest_mse)


Best Regressor: Ridge Regression
Mean Squared Error: 4.7427013466601515


In [20]:
# Build a fresh Ridge model with default settings and fit it on the training split.
regressor = Ridge()
regressor.fit(X=X_train, y=y_train)

In [21]:
# Predict on the held-out split and print actual vs. predicted values side by side.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)  # show two decimals in printed arrays
# Left column: actual values (y_test); right column: predictions (y_pred).
print(np.column_stack((y_test, y_pred)))

[[108.34 109.42]
 [168.1  167.64]
 [186.97 185.35]
 [ 94.08  94.05]
 [164.99 162.65]
 [146.9  146.18]
 [109.19 108.05]
 [ 93.53  93.71]
 [168.98 166.25]
 [132.86 135.39]
 [105.6  105.05]
 [107.74 108.15]
 [170.   171.03]
 [170.2  169.52]
 [123.   121.49]
 [118.25 119.58]
 [110.96 110.95]
 [156.5  154.08]
 [137.4  136.62]
 [165.3  166.9 ]
 [126.15 125.63]
 [ 92.97  93.22]
 [191.66 190.47]
 [106.7  107.42]
 [114.09 112.08]
 [141.   139.68]
 [129.28 131.41]
 [134.   135.26]
 [150.08 151.64]
 [122.39 122.36]
 [172.69 173.93]
 [148.15 147.98]
 [171.99 170.53]
 [119.76 121.02]
 [121.5  121.44]
 [152.55 149.12]
 [144.9  145.47]
 [141.93 142.63]
 [ 92.49  90.47]
 [119.71 118.38]
 [153.82 154.97]
 [140.14 139.61]
 [189.38 188.21]
 [ 99.92 100.92]
 [129.45 130.71]
 [138.19 138.96]
 [113.95 114.85]
 [138.96 138.12]
 [187.64 200.76]
 [190.23 191.81]
 [111.19 111.61]
 [182.09 182.81]
 [152.19 149.88]
 [103.11 104.92]
 [177.19 176.77]
 [199.2  204.55]
 [ 97.52  97.04]
 [113.56 113.45]
 [118.34 118. 

In [22]:
from sklearn.metrics import r2_score
# Coefficient of determination of the predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.9943294882041134