# Problem Statement: The objective of this project is to build a regression model that predicts a stock's closing price (the `Close` column) from the other columns of the World Stock Prices dataset.

# Data Preprocessing

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
 # Load the dataset
# Read the CSV via a relative path, so the file must sit in the working directory
data=pd.read_csv("World-Stock-Prices-Dataset.csv")
# Show the loaded frame
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Brand_Name,Ticker,Industry_Tag,Country,Capital Gains
0,2024-10-04 00:00:00-04:00,4.790000,4.885000,4.600000,4.700000,13553417.0,0.0,0.0,peloton,PTON,fitness,usa,
1,2024-10-04 00:00:00-04:00,140.860001,142.740005,140.279999,141.929993,813600.0,0.0,0.0,crocs,CROX,footwear,usa,
2,2024-10-04 00:00:00-04:00,69.959999,70.279999,69.720001,70.169998,12684800.0,0.0,0.0,the coca-cola company,KO,food & beverage,usa,
3,2024-10-04 00:00:00-04:00,713.320007,721.010010,708.820007,719.700012,2229000.0,0.0,0.0,netflix,NFLX,entertainment,usa,
4,2024-10-04 00:00:00-04:00,24.400000,24.900000,23.990000,24.110001,2774500.0,0.0,0.0,foot locker,FL,footwear,usa,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
296750,2000-01-03 00:00:00-05:00,34.729481,34.729481,32.934947,33.198849,6471267.0,0.0,0.0,american express,AXP,finance,usa,
296751,2000-01-03 00:00:00-05:00,22.361069,22.431057,21.801168,22.186100,4520600.0,0.0,0.0,mcdonald's,MCD,food,usa,
296752,2000-01-03 00:00:00-05:00,11.826695,11.850113,11.405149,11.428568,2506717.0,0.0,0.0,marriott,MAR,hospitality,usa,
296753,2000-01-03 00:00:00-05:00,4.075000,4.478125,3.952344,4.468750,322352000.0,0.0,0.0,amazon,AMZN,e-commerce,usa,


In [3]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296755 entries, 0 to 296754
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           296755 non-null  object 
 1   Open           296755 non-null  float64
 2   High           296755 non-null  float64
 3   Low            296755 non-null  float64
 4   Close          296755 non-null  float64
 5   Volume         296755 non-null  float64
 6   Dividends      296755 non-null  float64
 7   Stock Splits   296755 non-null  float64
 8   Brand_Name     296755 non-null  object 
 9   Ticker         296755 non-null  object 
 10  Industry_Tag   296755 non-null  object 
 11  Country        296755 non-null  object 
 12  Capital Gains  2 non-null       float64
dtypes: float64(8), object(5)
memory usage: 29.4+ MB


In [4]:
# Drop 'Capital Gains': only 2 of its 296,755 entries are non-null, so it
# cannot help the prediction.
# `drop(..., inplace=True)` returns None, so assigning its result left
# `data_cleaned` as None; build the cleaned frame explicitly instead.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
data_cleaned = data.drop(columns=['Capital Gains'], errors='ignore')
# Later cells read `data`, so point that name at the cleaned frame as well.
data = data_cleaned

In [5]:
# Count the missing entries in each column
data.isna().sum()

Date            0
Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock Splits    0
Brand_Name      0
Ticker          0
Industry_Tag    0
Country         0
dtype: int64

# Descriptive Statistics of the Numeric Columns

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
count,296755.0,296755.0,296755.0,296755.0,296755.0,296755.0,296755.0
mean,71.68597,72.497539,70.853581,71.69385,23098560.0,0.003592,0.000914
std,136.759681,138.221655,135.201411,136.736214,87195600.0,0.069749,0.118005
min,0.19689,0.199922,0.192798,0.198861,0.0,0.0,0.0
25%,15.5391,15.747868,15.316266,15.54,1360400.0,0.0,0.0
50%,34.0602,34.46915,33.665442,34.072231,4594300.0,0.0,0.0
75%,78.343005,79.110613,77.611071,78.358852,12194350.0,0.0,0.0
max,3445.580078,3463.070068,3370.0,3427.610107,7421641000.0,15.0,50.0


# Graphs

In [7]:
# sns.set()
# plt.figure(figsize=(6,6))
# sns.countplot(data['Open'])
# plt.title('Open Prices Distribution')
# plt.show()

In [8]:
# plt.figure(figsize=(5,5))
# sns.distplot(x='Open',d=data)
# plt.title('Distribution of Open')
# plt.show()

In [9]:
# plt.figure(figsize=(5,5))
# sns.lineplot(x='High',d=data)
# plt.title('Distribution of High')
# plt.show()

In [10]:
# plt.figure(figsize=(5,5))
# sns.countplot(x='Low',d=data)
# plt.title('Distribution of Low')
# # plt.show()

In [11]:
# plt.figure(figsize=(5,5))
# sns.distplot(x='Volume',x=data)
# # plt.title('Distribution of Volume')
# plt.show()

In [12]:
# plt.figure(figsize=(5,5))
# sns.countplot(x='Dividends',d=data)
# plt.title('Distribution of Dividends')
# plt.show()

# Splitting Features(X) and Target(Y)

In [13]:
# Target: the closing price. Features: every other column of the frame.
# NOTE(review): the features include the same row's Open/High/Low, which bracket
# Close very tightly — confirm this matches the intended prediction setting.
y = data['Close']
X = data.drop(columns=['Close'])

In [14]:
# Preview the feature matrix (all columns except Close)
X

Unnamed: 0,Date,Open,High,Low,Volume,Dividends,Stock Splits,Brand_Name,Ticker,Industry_Tag,Country
0,2024-10-04 00:00:00-04:00,4.790000,4.885000,4.600000,13553417.0,0.0,0.0,peloton,PTON,fitness,usa
1,2024-10-04 00:00:00-04:00,140.860001,142.740005,140.279999,813600.0,0.0,0.0,crocs,CROX,footwear,usa
2,2024-10-04 00:00:00-04:00,69.959999,70.279999,69.720001,12684800.0,0.0,0.0,the coca-cola company,KO,food & beverage,usa
3,2024-10-04 00:00:00-04:00,713.320007,721.010010,708.820007,2229000.0,0.0,0.0,netflix,NFLX,entertainment,usa
4,2024-10-04 00:00:00-04:00,24.400000,24.900000,23.990000,2774500.0,0.0,0.0,foot locker,FL,footwear,usa
...,...,...,...,...,...,...,...,...,...,...,...
296750,2000-01-03 00:00:00-05:00,34.729481,34.729481,32.934947,6471267.0,0.0,0.0,american express,AXP,finance,usa
296751,2000-01-03 00:00:00-05:00,22.361069,22.431057,21.801168,4520600.0,0.0,0.0,mcdonald's,MCD,food,usa
296752,2000-01-03 00:00:00-05:00,11.826695,11.850113,11.405149,2506717.0,0.0,0.0,marriott,MAR,hospitality,usa
296753,2000-01-03 00:00:00-05:00,4.075000,4.478125,3.952344,322352000.0,0.0,0.0,amazon,AMZN,e-commerce,usa


In [15]:
# Preview the target series (Close)
y

0           4.700000
1         141.929993
2          70.169998
3         719.700012
4          24.110001
             ...    
296750     33.198849
296751     22.186100
296752     11.428568
296753      4.468750
296754      5.482471
Name: Close, Length: 296755, dtype: float64

In [16]:
# List the feature column names
print(X.columns)

Index(['Date', 'Open', 'High', 'Low', 'Volume', 'Dividends', 'Stock Splits',
       'Brand_Name', 'Ticker', 'Industry_Tag', 'Country'],
      dtype='object')


# Encoding the Categorical Features Numerically Using OrdinalEncoder

In [17]:
# Names of the object-dtype (string) feature columns; Date is among them
# because it was loaded as text
cat_col = X.select_dtypes(include=object).columns

In [18]:
# Inspect the string columns before encoding
X[cat_col]

Unnamed: 0,Date,Brand_Name,Ticker,Industry_Tag,Country
0,2024-10-04 00:00:00-04:00,peloton,PTON,fitness,usa
1,2024-10-04 00:00:00-04:00,crocs,CROX,footwear,usa
2,2024-10-04 00:00:00-04:00,the coca-cola company,KO,food & beverage,usa
3,2024-10-04 00:00:00-04:00,netflix,NFLX,entertainment,usa
4,2024-10-04 00:00:00-04:00,foot locker,FL,footwear,usa
...,...,...,...,...,...
296750,2000-01-03 00:00:00-05:00,american express,AXP,finance,usa
296751,2000-01-03 00:00:00-05:00,mcdonald's,MCD,food,usa
296752,2000-01-03 00:00:00-05:00,marriott,MAR,hospitality,usa
296753,2000-01-03 00:00:00-05:00,amazon,AMZN,e-commerce,usa


In [19]:
from sklearn.preprocessing import OrdinalEncoder

# Replace every string column with its ordinal category code
# NOTE(review): the encoder is fitted on the full feature matrix, before the
# train/test split that happens later in the notebook — confirm this is intended.
oe = OrdinalEncoder()
encoded_values = oe.fit_transform(X[cat_col])
X[cat_col] = encoded_values

In [20]:
# Inspect the same columns after ordinal encoding
X[cat_col]

Unnamed: 0,Date,Brand_Name,Ticker,Industry_Tag,Country
0,6227.0,37.0,46.0,9.0,6.0
1,6227.0,16.0,14.0,12.0,6.0
2,6227.0,51.0,28.0,11.0,6.0
3,6227.0,32.0,37.0,6.0,6.0
4,6227.0,19.0,19.0,12.0,6.0
...,...,...,...,...,...
296750,0.0,7.0,7.0,7.0,6.0
296751,0.0,30.0,34.0,10.0,6.0
296752,0.0,28.0,33.0,15.0,6.0
296753,0.0,4.0,6.0,5.0,6.0


In [21]:
# First rows of the fully numeric feature matrix
X.head()

Unnamed: 0,Date,Open,High,Low,Volume,Dividends,Stock Splits,Brand_Name,Ticker,Industry_Tag,Country
0,6227.0,4.79,4.885,4.6,13553417.0,0.0,0.0,37.0,46.0,9.0,6.0
1,6227.0,140.860001,142.740005,140.279999,813600.0,0.0,0.0,16.0,14.0,12.0,6.0
2,6227.0,69.959999,70.279999,69.720001,12684800.0,0.0,0.0,51.0,28.0,11.0,6.0
3,6227.0,713.320007,721.01001,708.820007,2229000.0,0.0,0.0,32.0,37.0,6.0,6.0
4,6227.0,24.4,24.9,23.99,2774500.0,0.0,0.0,19.0,19.0,12.0,6.0


In [22]:
# (rows, columns) of the feature matrix
print(X.shape)

(296755, 11)


# Splitting the Data into Training & Testing data

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable
# NOTE(review): rows are split at random, so train and test mix dates and
# tickers — confirm a random split is acceptable for this dated data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

In [24]:
# Confirm the sizes of the four pieces produced by the split
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(267079, 11) (29676, 11) (267079,) (29676,)


In [25]:
# First rows of the training features (the index shows the original row positions)
X_train.head()

Unnamed: 0,Date,Open,High,Low,Volume,Dividends,Stock Splits,Brand_Name,Ticker,Industry_Tag,Country
42526,5550.0,148.555953,149.452393,143.77168,233700.0,0.0,0.0,27.0,31.0,17.0,1.0
289330,218.0,12.876318,12.876318,12.682604,29600.0,0.0,0.0,23.0,23.0,1.0,3.0
43936,5527.0,206.323601,206.994612,204.12311,9434400.0,0.0,0.0,58.0,58.0,7.0,6.0
166199,3230.0,24.855862,25.049168,24.823644,260600.0,0.0,0.0,23.0,23.0,1.0,3.0
295974,22.0,5.872227,5.965437,5.856692,1426500.0,0.0,0.0,57.0,57.0,3.0,4.0


# Model Training

In [26]:
# Linear-regression baseline: import, fit on the training split, predict the test split
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained
lr = LinearRegression().fit(X_train, y_train)

# Predicted Close values for the held-out test features
ypred = lr.predict(X_test)

# Evaluate the Model

In [27]:
# R^2 (coefficient of determination) on the test set — a regression score,
# not a classification accuracy
from sklearn.metrics import r2_score
r2_score(y_test,ypred)

0.9999446743407918

In [28]:
# Errors
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [29]:
# Mean absolute error (MAE) between the test targets and the predictions
mean_absolute_error(y_test,ypred)

0.36424641789396106

In [30]:
# Mean squared error (MSE) between the test targets and the predictions
mse=mean_squared_error(y_test,ypred)
mse

1.1208694197903712

In [31]:
# Root mean squared error (RMSE): the square root of the MSE
np.sqrt(mse)

1.0587112069825138

# Evaluating Linear Regression (R² Score)

In [32]:
from sklearn.linear_model import LinearRegression

# Same steps as the Model Training section: fit a fresh linear regression,
# predict the test split and show its R^2
lr = LinearRegression().fit(X_train, y_train)
ypred = lr.predict(X_test)
r2_score(y_test, ypred)

0.9999446743407918

In [33]:
# Inspect the linear-regression predictions for the test set
ypred

array([293.09540236,  36.88068949,   7.00088743, ...,  14.04831223,
        18.50986842, 184.260821  ])

# Evaluating a Decision Tree (R² Score)

In [34]:
from sklearn.tree import DecisionTreeRegressor

In [35]:
from sklearn import tree
# Pin random_state so the tree (and its score) is the same on every run;
# unseeded fits in this notebook return different scores for the same settings.
dt = DecisionTreeRegressor(random_state=42)

In [36]:
def mymodel(model):
    """Train ``model`` on the training split and print its test-set R^2.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``,
    prints the value returned by ``r2_score`` and returns the same ``model``
    object after ``fit`` has been called on it.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = r2_score(y_test, predictions)
    print(score)
    return model

In [37]:
# Fit the default decision tree and print its test-set R^2
mymodel(dt)

0.999807547188782


In [38]:
# Max-depth sweep: fit one tree per depth (10..24) and report its test-set R^2.
# random_state is pinned because unseeded trees do not reproduce — two fits at
# the same depth can print different scores, which makes depths incomparable.
# NOTE(review): depths are compared on the test set; a validation split or
# cross-validation would avoid tuning against the final evaluation data.
for i in range(10, 25):
    dt1 = DecisionTreeRegressor(max_depth=i, random_state=42)
    dt1.fit(X_train, y_train)
    ypred = dt1.predict(X_test)
    ac = r2_score(y_test, ypred)
    # r2_score is a regression R^2, so the label says so instead of "accuracy"
    print(f"max_depth = {i} R2 score : {ac}")

max_depth = 10 accuracy : 0.9998514533716221
max_depth = 11 accuracy : 0.9998173833979714
max_depth = 12 accuracy : 0.9998272395524634
max_depth = 13 accuracy : 0.9998259648794705
max_depth = 14 accuracy : 0.9998240828055446
max_depth = 15 accuracy : 0.9998210841393235
max_depth = 16 accuracy : 0.9998601223864335
max_depth = 17 accuracy : 0.9998157836837103
max_depth = 18 accuracy : 0.9998071434464306
max_depth = 19 accuracy : 0.9998542612137801
max_depth = 20 accuracy : 0.9998123188631475
max_depth = 21 accuracy : 0.9998535261298813
max_depth = 22 accuracy : 0.999808836529629
max_depth = 23 accuracy : 0.9998494602900486
max_depth = 24 accuracy : 0.9998486012538783


In [39]:
# Refit a depth-24 tree through the shared helper. random_state is pinned so
# the printed R^2 is reproducible instead of changing from run to run.
dt2 = DecisionTreeRegressor(max_depth=24, random_state=42)
mymodel(dt2)

0.9998111313315903


# Evaluating a Random Forest (R² Score)

In [40]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters. random_state is pinned because
# bootstrap sampling makes an unseeded forest score differently on every run.
rc = RandomForestRegressor(random_state=42)
rc.fit(X_train, y_train)
ypred = rc.predict(X_test)
# Test-set R^2
print(r2_score(y_test, ypred))

0.9998930790911981


# Predicting the data using Gradient Boosting Regressor

In [41]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyperparameters; random_state is pinned so
# repeated runs print the same score, like the other seeded models.
gbc = GradientBoostingRegressor(random_state=42)
gbc.fit(X_train, y_train)
ypred = gbc.predict(X_test)
# Test-set R^2
print(r2_score(y_test, ypred))

0.999705197373301


# Predicting the data using XGBRegressor

In [42]:
from xgboost import XGBRegressor

# XGBoost regressor with default hyperparameters; fit() returns the estimator,
# so construction and training are chained
xgb = XGBRegressor().fit(X_train, y_train)
ypred = xgb.predict(X_test)
# Test-set R^2
print(r2_score(y_test, ypred))

0.9989124454970262


# Conclusion 

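# Based on the R² scores above, every model scores above 0.998 on the test set: Linear Regression is highest (0.99994), followed by Random Forest (0.99989), Decision Tree (0.99981), Gradient Boosting (0.99971) and XGBoost (0.99891). The original recommendation was to go ahead with Decision Tree and Random Forest, but these scores do not favour them over Linear Regression.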