## Importing the essential libraries over here

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

## Importing the dataset over here

In [3]:
# Load the raw CSV into a DataFrame. The path is relative, so the file has to
# sit in the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="Microsoft Dataset.csv")

In [4]:
# Preview the first five rows to check the columns and their value ranges.
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1986-03-13,0.088542,0.101563,0.088542,0.097222,0.060055,1031788800
1,1986-03-14,0.097222,0.102431,0.097222,0.100694,0.062199,308160000
2,1986-03-17,0.100694,0.103299,0.100694,0.102431,0.063272,133171200
3,1986-03-18,0.102431,0.103299,0.098958,0.099826,0.061663,67766400
4,1986-03-19,0.099826,0.100694,0.097222,0.09809,0.060591,47894400


## Taking care of duplicate observations if present over here

In [5]:
# Remove rows that are exact duplicates of an earlier row (first occurrence is
# kept) and rebind the name to the de-duplicated frame.
data = data.drop_duplicates()

## Taking care of missing values if present over here

In [6]:
# Count the missing entries in each column.
data.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

## Filtering all the numerical features here

In [8]:
# Collect the numeric columns by dtype family. A `dtype != 'O'` test would also
# sweep in every non-object column that is not numeric (datetime, bool,
# category, or a dedicated string dtype), so the numeric dtypes are selected
# explicitly instead.
numerical_features = data.select_dtypes(include="number").columns.tolist()
for feature in numerical_features:
    print(feature)

Open
High
Low
Close
Adj Close
Volume


In [9]:
# Show only the columns listed in numerical_features.
data.loc[:, numerical_features]

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,0.088542,0.101563,0.088542,0.097222,0.060055,1031788800
1,0.097222,0.102431,0.097222,0.100694,0.062199,308160000
2,0.100694,0.103299,0.100694,0.102431,0.063272,133171200
3,0.102431,0.103299,0.098958,0.099826,0.061663,67766400
4,0.099826,0.100694,0.097222,0.098090,0.060591,47894400
...,...,...,...,...,...,...
9627,427.190002,431.059998,424.410004,430.160004,430.160004,11845800
9628,429.630005,430.820007,426.600006,430.320007,430.320007,15718000
9629,425.690002,430.940002,425.690002,429.170013,429.170013,15517100
9630,424.299988,424.299988,414.239990,414.670013,414.670013,28394500


## Filtering all the categorical features over here

In [10]:
# A column is treated as categorical here when pandas stores it with the
# generic object dtype ('O').
cat_features = [
    column_name
    for column_name, column_dtype in data.dtypes.items()
    if column_dtype == 'O'
]
for feature in cat_features:
    print(feature)

Date


## Encoding the categorical feature over here

In [12]:
# Count how often each Date value occurs.
data.loc[:, 'Date'].value_counts()

Date
1986-03-13    1
2011-08-18    1
2011-08-22    1
2011-08-23    1
2011-08-24    1
             ..
1998-11-23    1
1998-11-24    1
1998-11-25    1
1998-11-27    1
2024-05-31    1
Name: count, Length: 9632, dtype: int64

In [13]:
# Give every distinct Date value an integer code, numbered in order of first
# appearance in the column.
Date_mapping = {
    date_label: code
    for code, date_label in enumerate(data['Date'].unique())
}
# NOTE(review): this prints one entry per distinct date, which is a very long
# output line — consider previewing only a few entries.
print(Date_mapping)

{'1986-03-13': 0, '1986-03-14': 1, '1986-03-17': 2, '1986-03-18': 3, '1986-03-19': 4, '1986-03-20': 5, '1986-03-21': 6, '1986-03-24': 7, '1986-03-25': 8, '1986-03-26': 9, '1986-03-27': 10, '1986-03-31': 11, '1986-04-01': 12, '1986-04-02': 13, '1986-04-03': 14, '1986-04-04': 15, '1986-04-07': 16, '1986-04-08': 17, '1986-04-09': 18, '1986-04-10': 19, '1986-04-11': 20, '1986-04-14': 21, '1986-04-15': 22, '1986-04-16': 23, '1986-04-17': 24, '1986-04-18': 25, '1986-04-21': 26, '1986-04-22': 27, '1986-04-23': 28, '1986-04-24': 29, '1986-04-25': 30, '1986-04-28': 31, '1986-04-29': 32, '1986-04-30': 33, '1986-05-01': 34, '1986-05-02': 35, '1986-05-05': 36, '1986-05-06': 37, '1986-05-07': 38, '1986-05-08': 39, '1986-05-09': 40, '1986-05-12': 41, '1986-05-13': 42, '1986-05-14': 43, '1986-05-15': 44, '1986-05-16': 45, '1986-05-19': 46, '1986-05-20': 47, '1986-05-21': 48, '1986-05-22': 49, '1986-05-23': 50, '1986-05-27': 51, '1986-05-28': 52, '1986-05-29': 53, '1986-05-30': 54, '1986-06-02': 55, '

In [14]:
# Swap each Date value for its integer code from Date_mapping.
# NOTE(review): not re-runnable on its own — once Date holds the codes, mapping
# it again through the original string-keyed dict would find no matching keys.
data = data.assign(Date=data['Date'].map(Date_mapping))

In [15]:
# Display the frame to check that Date now holds the integer codes.
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,0.088542,0.101563,0.088542,0.097222,0.060055,1031788800
1,1,0.097222,0.102431,0.097222,0.100694,0.062199,308160000
2,2,0.100694,0.103299,0.100694,0.102431,0.063272,133171200
3,3,0.102431,0.103299,0.098958,0.099826,0.061663,67766400
4,4,0.099826,0.100694,0.097222,0.098090,0.060591,47894400
...,...,...,...,...,...,...,...
9627,9627,427.190002,431.059998,424.410004,430.160004,430.160004,11845800
9628,9628,429.630005,430.820007,426.600006,430.320007,430.320007,15718000
9629,9629,425.690002,430.940002,425.690002,429.170013,429.170013,15517100
9630,9630,424.299988,424.299988,414.239990,414.670013,414.670013,28394500


In [16]:
# Append a copy of the Open column under the name OPEN; as a new column it is
# placed last in the frame.
data = data.assign(OPEN=data['Open'])

In [17]:
# Remove the original Open column; its values remain available as OPEN.
data = data.drop(columns="Open")

In [18]:
# Display the frame to check that Open is gone and OPEN is the last column.
data

Unnamed: 0,Date,High,Low,Close,Adj Close,Volume,OPEN
0,0,0.101563,0.088542,0.097222,0.060055,1031788800,0.088542
1,1,0.102431,0.097222,0.100694,0.062199,308160000,0.097222
2,2,0.103299,0.100694,0.102431,0.063272,133171200,0.100694
3,3,0.103299,0.098958,0.099826,0.061663,67766400,0.102431
4,4,0.100694,0.097222,0.098090,0.060591,47894400,0.099826
...,...,...,...,...,...,...,...
9627,9627,431.059998,424.410004,430.160004,430.160004,11845800,427.190002
9628,9628,430.820007,426.600006,430.320007,430.320007,15718000,429.630005
9629,9629,430.940002,425.690002,429.170013,429.170013,15517100,425.690002
9630,9630,424.299988,414.239990,414.670013,414.670013,28394500,424.299988


## Creating the features and labels over here


In [19]:
# Feature matrix: every column except OPEN. Label vector: the OPEN column,
# which the preceding cells placed last in the frame.
# NOTE(review): High, Low, Close and Adj Close come from the same row as the
# OPEN value being predicted — confirm that this is the intended setup.
X = data.drop(columns="OPEN").to_numpy()
y = data["OPEN"].to_numpy()

## Splitting the dataset into a training set and a testing set so that overfitting can be detected on held-out data

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable.
# NOTE(review): train_test_split shuffles by default, so rows from all dates
# end up in both sets — consider a chronological split for date-ordered data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Expects X_train, X_test, y_train, y_test from the train/test split cell.

# Seed handed to every estimator that accepts one, so that re-running the
# notebook from a fresh kernel reproduces the same scores and the same winner.
MODEL_SEED = 0

# Candidate regressors, otherwise left at their library defaults.
regressors = {
    "Random Forest": RandomForestRegressor(random_state=MODEL_SEED),
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Support Vector Machine": SVR(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "XGBoost": XGBRegressor(random_state=MODEL_SEED),
}

# Fit each candidate on the training split and record its mean squared error
# on the test split, keyed by the candidate's display name.
results = {}
for name, regressor in regressors.items():
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = mse

# Pick the candidate with the lowest test-set MSE.
# NOTE(review): the winner is chosen on the same split that is then reported,
# so the printed MSE is an optimistic estimate — consider cross-validating on
# the training split for model selection.
best_regressor = min(results, key=results.get)
lowest_mse = results[best_regressor]

print("Best Regressor:", best_regressor)
print("Mean Squared Error:", lowest_mse)

Best Regressor: Linear Regression
Mean Squared Error: 0.3131160219460679


In [23]:
# Build a fresh linear regression model (the winner reported by the comparison
# cell) and fit it on the training split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [24]:
# Predict the held-out rows, then print actual (left column) next to predicted
# (right column), rounded to two decimals for display.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)  # changes NumPy print formatting for the whole session
print(np.column_stack((y_test, y_pred)))

[[29.6  29.41]
 [ 3.35  3.36]
 [16.8  16.88]
 ...
 [19.81 20.13]
 [31.89 31.69]
 [ 2.73  2.73]]


In [25]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.9999584573022716