In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error
import matplotlib.pyplot as plt
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_regression

In [15]:
cola_stock = pd.read_csv("/Users/julius/Personal/Personal_Project/Coca_cola_Project/Analysis_of_Coca_Cola_Stock_Market/Dataset/archive/KO_1919-09-06_2025-04-06.csv")

In [16]:
cola_stock

Unnamed: 0,date,open,high,low,close,adj_close,volume
0,1962-01-02 00:00:00-05:00,0.263021,0.270182,0.263021,0.263021,0.046041,806400
1,1962-01-03 00:00:00-05:00,0.259115,0.259115,0.253255,0.257161,0.045016,1574400
2,1962-01-04 00:00:00-05:00,0.257813,0.261068,0.257813,0.259115,0.045358,844800
3,1962-01-05 00:00:00-05:00,0.259115,0.262370,0.252604,0.253255,0.044332,1420800
4,1962-01-08 00:00:00-05:00,0.251302,0.251302,0.245768,0.250651,0.043876,2035200
...,...,...,...,...,...,...,...
15917,2025-03-31 00:00:00-04:00,70.730003,71.940002,70.489998,71.620003,71.620003,24016900
15918,2025-04-01 00:00:00-04:00,71.680000,71.910004,71.190002,71.870003,71.870003,15309100
15919,2025-04-02 00:00:00-04:00,71.959999,72.360001,71.050003,71.330002,71.330002,14606500
15920,2025-04-03 00:00:00-04:00,72.480003,73.949997,72.320000,73.180000,73.180000,24959600


## Dataset Information

Row: 15930 rows

columns: 7

Columns:
 - **date:** Date of trading
 - **open:** Opening price of the day
 - **high:** Highest price of the day
 - **low:** Lowest price of the day
 - **close:** Closing price of the day
 - **adj_close:** Adjusted closing price (accounts for splits/dividends)
 - **volume:** Total shares traded on the day 

# EDA Exploratory Data Analysis

### Checking the content of the dataset and looking for missing values,outlier and skewness

In [23]:
cola_stock.shape

(15922, 6)

In [24]:
cola_stock.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15922 entries, 1962-01-02 00:00:00-05:00 to 2025-04-04 00:00:00-04:00
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   open       15922 non-null  float64
 1   high       15922 non-null  float64
 2   low        15922 non-null  float64
 3   close      15922 non-null  float64
 4   adj_close  15922 non-null  float64
 5   volume     15922 non-null  int64  
dtypes: float64(5), int64(1)
memory usage: 870.7+ KB


In [25]:
cola_stock.isnull().sum()

open         0
high         0
low          0
close        0
adj_close    0
volume       0
dtype: int64

In [26]:
cola_stock.describe()

Unnamed: 0,open,high,low,close,adj_close,volume
count,15922.0,15922.0,15922.0,15922.0,15922.0,15922.0
mean,18.450788,18.598529,18.299819,18.455364,12.831326,9334420.0
std,19.432024,19.570869,19.28891,19.434456,16.645812,7954781.0
min,0.192708,0.193359,0.182292,0.192057,0.034086,76800.0
25%,0.888021,0.895833,0.880208,0.888021,0.223808,3111600.0
50%,10.53125,10.625,10.46875,10.53125,4.824283,8087650.0
75%,31.946876,32.355938,31.591562,31.96875,17.404522,13295480.0
max,73.300003,73.949997,72.32,73.18,73.18,124169000.0


In [27]:
#changing the datatype of datet
cola_stock['date'] = pd.to_datetime(cola_stock['date'])
cola_stock.set_index('date', inplace=True)

KeyError: 'date'

In [28]:
cola_stock

Unnamed: 0_level_0,open,high,low,close,adj_close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962-01-02 00:00:00-05:00,0.263021,0.270182,0.263021,0.263021,0.046041,806400
1962-01-03 00:00:00-05:00,0.259115,0.259115,0.253255,0.257161,0.045016,1574400
1962-01-04 00:00:00-05:00,0.257813,0.261068,0.257813,0.259115,0.045358,844800
1962-01-05 00:00:00-05:00,0.259115,0.262370,0.252604,0.253255,0.044332,1420800
1962-01-08 00:00:00-05:00,0.251302,0.251302,0.245768,0.250651,0.043876,2035200
...,...,...,...,...,...,...
2025-03-31 00:00:00-04:00,70.730003,71.940002,70.489998,71.620003,71.620003,24016900
2025-04-01 00:00:00-04:00,71.680000,71.910004,71.190002,71.870003,71.870003,15309100
2025-04-02 00:00:00-04:00,71.959999,72.360001,71.050003,71.330002,71.330002,14606500
2025-04-03 00:00:00-04:00,72.480003,73.949997,72.320000,73.180000,73.180000,24959600
