In [1]:
import pandas as pd
import numpy as np

## Handling Outliers

In [2]:
# Path to the dataset
file_path = "/home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/eda.csv"

# Load the dataset
df = pd.read_csv(file_path)

# Display initial rows to confirm structure
print("Initial Dataset:")
print(df.head())

Initial Dataset:
         Date    Adj Close        Close         High          Low  \
0  2000-01-03  1455.219971  1455.219971  1478.000000  1438.359985   
1  2000-01-04  1399.420044  1399.420044  1455.219971  1397.430054   
2  2000-01-05  1402.109985  1402.109985  1413.270020  1377.680054   
3  2000-01-06  1403.449951  1403.449951  1411.900024  1392.099976   
4  2000-01-07  1441.469971  1441.469971  1441.469971  1400.729980   

          Open      Volume  
0  1469.250000   931800000  
1  1455.219971  1009000000  
2  1399.420044  1085500000  
3  1402.109985  1092300000  
4  1403.449951  1225200000  


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5996 entries, 0 to 5995
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       5996 non-null   object 
 1   Adj Close  5996 non-null   float64
 2   Close      5996 non-null   float64
 3   High       5996 non-null   float64
 4   Low        5996 non-null   float64
 5   Open       5996 non-null   float64
 6   Volume     5996 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 328.0+ KB


In [4]:
# Log transformation
columns_to_transform = ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']

# Adding a small constant to avoid log(0) and handle zero or negative values
constant = 1e-5
df[columns_to_transform] = df[columns_to_transform].apply(lambda x: np.log(x + constant))

# Display transformed dataset
print("\nTransformed Dataset:")
print(df.head())


Transformed Dataset:
         Date  Adj Close     Close      High       Low      Open     Volume
0  2000-01-03   7.282912  7.282912  7.298445  7.271259  7.292507  20.652629
1  2000-01-04   7.243813  7.243813  7.282912  7.242390  7.282912  20.732226
2  2000-01-05   7.245734  7.245734  7.253661  7.228156  7.243813  20.805307
3  2000-01-06   7.246689  7.246689  7.252692  7.238569  7.245734  20.811551
4  2000-01-07   7.273419  7.273419  7.273419  7.244749  7.246689  20.926370


In [5]:
# Save the transformed dataset back to a file (optional)
output_path = "/home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/eda_transformed.csv"
df.to_csv(output_path, index=False)
print(f"\nTransformed dataset saved to: {output_path}")


Transformed dataset saved to: /home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/eda_transformed.csv


## Feature Engineering

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5996 entries, 0 to 5995
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       5996 non-null   object 
 1   Adj Close  5996 non-null   float64
 2   Close      5996 non-null   float64
 3   High       5996 non-null   float64
 4   Low        5996 non-null   float64
 5   Open       5996 non-null   float64
 6   Volume     5996 non-null   float64
dtypes: float64(6), object(1)
memory usage: 328.0+ KB


In [7]:
# Ensure the Date column is in datetime format
df['Date'] = pd.to_datetime(df['Date'])

## Daily Returns

In [8]:
# Daily Returns
df['Close_return'] = np.log(df['Close']) - np.log(df['Close'].shift(1))
df['AdjClose_return'] = np.log(df['Adj Close']) - np.log(df['Adj Close'].shift(1))
df['High_return'] = np.log(df['High']) - np.log(df['High'].shift(1))
df['Low_return'] = np.log(df['Low']) - np.log(df['Low'].shift(1))
df['Return'] = np.log(df['Close']) - np.log(df['Close'].shift(1))
df.head()


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Close_return,AdjClose_return,High_return,Low_return,Return
0,2000-01-03,7.282912,7.282912,7.298445,7.271259,7.292507,20.652629,,,,,
1,2000-01-04,7.243813,7.243813,7.282912,7.24239,7.282912,20.732226,-0.005383,-0.005383,-0.00213,-0.003978,-0.005383
2,2000-01-05,7.245734,7.245734,7.253661,7.228156,7.243813,20.805307,0.000265,0.000265,-0.004024,-0.001967,0.000265
3,2000-01-06,7.246689,7.246689,7.252692,7.238569,7.245734,20.811551,0.000132,0.000132,-0.000134,0.001439,0.000132
4,2000-01-07,7.273419,7.273419,7.273419,7.244749,7.246689,20.92637,0.003682,0.003682,0.002854,0.000853,0.003682


In [9]:
# Percentage Changes
df['Close_pct_change'] = df['Close'].pct_change() * 100
df['AdjClose_pct_change'] = df['Adj Close'].pct_change() * 100
df['High_pct_change'] = df['High'].pct_change() * 100
df['Low_pct_change'] = df['Low'].pct_change() * 100
df.head()

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Close_return,AdjClose_return,High_return,Low_return,Return,Close_pct_change,AdjClose_pct_change,High_pct_change,Low_pct_change
0,2000-01-03,7.282912,7.282912,7.298445,7.271259,7.292507,20.652629,,,,,,,,,
1,2000-01-04,7.243813,7.243813,7.282912,7.24239,7.282912,20.732226,-0.005383,-0.005383,-0.00213,-0.003978,-0.005383,-0.536862,-0.536862,-0.212823,-0.397025
2,2000-01-05,7.245734,7.245734,7.253661,7.228156,7.243813,20.805307,0.000265,0.000265,-0.004024,-0.001967,0.000265,0.02651,0.02651,-0.401637,-0.196536
3,2000-01-06,7.246689,7.246689,7.252692,7.238569,7.245734,20.811551,0.000132,0.000132,-0.000134,0.001439,0.000132,0.013183,0.013183,-0.01337,0.144054
4,2000-01-07,7.273419,7.273419,7.273419,7.244749,7.246689,20.92637,0.003682,0.003682,0.002854,0.000853,0.003682,0.368857,0.368857,0.285785,0.085378


In [10]:
df.shape

(5996, 16)

In [11]:
# Drop NaN rows introduced by lag calculations
df = df.dropna()
df.head()

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Close_return,AdjClose_return,High_return,Low_return,Return,Close_pct_change,AdjClose_pct_change,High_pct_change,Low_pct_change
1,2000-01-04,7.243813,7.243813,7.282912,7.24239,7.282912,20.732226,-0.005383,-0.005383,-0.00213,-0.003978,-0.005383,-0.536862,-0.536862,-0.212823,-0.397025
2,2000-01-05,7.245734,7.245734,7.253661,7.228156,7.243813,20.805307,0.000265,0.000265,-0.004024,-0.001967,0.000265,0.02651,0.02651,-0.401637,-0.196536
3,2000-01-06,7.246689,7.246689,7.252692,7.238569,7.245734,20.811551,0.000132,0.000132,-0.000134,0.001439,0.000132,0.013183,0.013183,-0.01337,0.144054
4,2000-01-07,7.273419,7.273419,7.273419,7.244749,7.246689,20.92637,0.003682,0.003682,0.002854,0.000853,0.003682,0.368857,0.368857,0.285785,0.085378
5,2000-01-10,7.284547,7.284547,7.289174,7.273419,7.273419,20.786053,0.001529,0.001529,0.002164,0.00395,0.001529,0.152993,0.152993,0.216609,0.395733


In [12]:
df.shape

(5995, 16)

In [13]:
import os

# Define the path to the directory
features_dir = "/home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features"

# Ensure the directory exists before saving the file
os.makedirs(features_dir, exist_ok=True)

# Set the output path for the returns and percentage changes dataset
output_path_returns_pct_change = os.path.join(features_dir, "01_eda_with_returns_pct_changes.csv")

# Save the dataframe to the new location
df.to_csv(output_path_returns_pct_change, index=False)

print(f"Dataset with returns and percentage changes saved to: {output_path_returns_pct_change}")


Dataset with returns and percentage changes saved to: /home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/01_eda_with_returns_pct_changes.csv


## Price Ratios

In [14]:
# Price Ratios
df['High_Low_ratio'] = df['High'] / df['Low']
df['Close_Open_ratio'] = df['Close'] / df['Open']
df['High_Close_ratio'] = df['High'] / df['Close']
df['Low_Open_ratio'] = df['Low'] / df['Open']
df.head()

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Close_return,AdjClose_return,High_return,Low_return,Return,Close_pct_change,AdjClose_pct_change,High_pct_change,Low_pct_change,High_Low_ratio,Close_Open_ratio,High_Close_ratio,Low_Open_ratio
1,2000-01-04,7.243813,7.243813,7.282912,7.24239,7.282912,20.732226,-0.005383,-0.005383,-0.00213,-0.003978,-0.005383,-0.536862,-0.536862,-0.212823,-0.397025,1.005595,0.994631,1.005398,0.994436
2,2000-01-05,7.245734,7.245734,7.253661,7.228156,7.243813,20.805307,0.000265,0.000265,-0.004024,-0.001967,0.000265,0.02651,0.02651,-0.401637,-0.196536,1.003529,1.000265,1.001094,0.997839
3,2000-01-06,7.246689,7.246689,7.252692,7.238569,7.245734,20.811551,0.000132,0.000132,-0.000134,0.001439,0.000132,0.013183,0.013183,-0.01337,0.144054,1.001951,1.000132,1.000828,0.999011
4,2000-01-07,7.273419,7.273419,7.273419,7.244749,7.246689,20.92637,0.003682,0.003682,0.002854,0.000853,0.003682,0.368857,0.368857,0.285785,0.085378,1.003957,1.003689,1.0,0.999732
5,2000-01-10,7.284547,7.284547,7.289174,7.273419,7.273419,20.786053,0.001529,0.001529,0.002164,0.00395,0.001529,0.152993,0.152993,0.216609,0.395733,1.002166,1.00153,1.000635,1.0


In [15]:
df.isnull().sum()

Date                   0
Adj Close              0
Close                  0
High                   0
Low                    0
Open                   0
Volume                 0
Close_return           0
AdjClose_return        0
High_return            0
Low_return             0
Return                 0
Close_pct_change       0
AdjClose_pct_change    0
High_pct_change        0
Low_pct_change         0
High_Low_ratio         0
Close_Open_ratio       0
High_Close_ratio       0
Low_Open_ratio         0
dtype: int64

In [16]:
# Set the output path for the price ratios dataset
output_path_price_ratios = os.path.join(features_dir, "02_eda_with_price_ratios.csv")

# Save the dataframe to the new location
df.to_csv(output_path_price_ratios, index=False)

print(f"Dataset with price ratios saved to: {output_path_price_ratios}")


Dataset with price ratios saved to: /home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/02_eda_with_price_ratios.csv


# Calculate the moving averages for Close

In [17]:
# Calculate the moving averages for Close
df['MA_5'] = df['Close'].rolling(window=5).mean()
df['MA_10'] = df['Close'].rolling(window=10).mean()
df['MA_20'] = df['Close'].rolling(window=20).mean()
df.head()

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Close_return,AdjClose_return,High_return,...,AdjClose_pct_change,High_pct_change,Low_pct_change,High_Low_ratio,Close_Open_ratio,High_Close_ratio,Low_Open_ratio,MA_5,MA_10,MA_20
1,2000-01-04,7.243813,7.243813,7.282912,7.24239,7.282912,20.732226,-0.005383,-0.005383,-0.00213,...,-0.536862,-0.212823,-0.397025,1.005595,0.994631,1.005398,0.994436,,,
2,2000-01-05,7.245734,7.245734,7.253661,7.228156,7.243813,20.805307,0.000265,0.000265,-0.004024,...,0.02651,-0.401637,-0.196536,1.003529,1.000265,1.001094,0.997839,,,
3,2000-01-06,7.246689,7.246689,7.252692,7.238569,7.245734,20.811551,0.000132,0.000132,-0.000134,...,0.013183,-0.01337,0.144054,1.001951,1.000132,1.000828,0.999011,,,
4,2000-01-07,7.273419,7.273419,7.273419,7.244749,7.246689,20.92637,0.003682,0.003682,0.002854,...,0.368857,0.285785,0.085378,1.003957,1.003689,1.0,0.999732,,,
5,2000-01-10,7.284547,7.284547,7.289174,7.273419,7.273419,20.786053,0.001529,0.001529,0.002164,...,0.152993,0.216609,0.395733,1.002166,1.00153,1.000635,1.0,7.25884,,


In [18]:
# Check the first few rows to verify
print(df[['Date', 'Close', 'MA_5', 'MA_10', 'MA_20']].head())


        Date     Close     MA_5  MA_10  MA_20
1 2000-01-04  7.243813      NaN    NaN    NaN
2 2000-01-05  7.245734      NaN    NaN    NaN
3 2000-01-06  7.246689      NaN    NaN    NaN
4 2000-01-07  7.273419      NaN    NaN    NaN
5 2000-01-10  7.284547  7.25884    NaN    NaN


In [19]:
df[['Date', 'Close', 'MA_5', 'MA_10', 'MA_20']].isnull().sum()

Date      0
Close     0
MA_5      4
MA_10     9
MA_20    19
dtype: int64

In [20]:
# Drop rows with NaN values in any of the moving averages
df = df.dropna(subset=['MA_5', 'MA_10', 'MA_20'])
df.isnull().sum()

Date                   0
Adj Close              0
Close                  0
High                   0
Low                    0
Open                   0
Volume                 0
Close_return           0
AdjClose_return        0
High_return            0
Low_return             0
Return                 0
Close_pct_change       0
AdjClose_pct_change    0
High_pct_change        0
Low_pct_change         0
High_Low_ratio         0
Close_Open_ratio       0
High_Close_ratio       0
Low_Open_ratio         0
MA_5                   0
MA_10                  0
MA_20                  0
dtype: int64

In [21]:
df.shape

(5976, 23)

In [22]:
# Set the output path for the dataset with moving averages
output_path_moving_averages = os.path.join(features_dir, "03_eda_with_moving_averages.csv")

# Save the dataframe to the new location
df.to_csv(output_path_moving_averages, index=False)

print(f"Dataset with moving averages saved to: {output_path_moving_averages}")


Dataset with moving averages saved to: /home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/03_eda_with_moving_averages.csv


## Volatility Indicators

In [23]:
# Volatility Indicators (Rolling Standard Deviation)
df.loc[:, 'Volatility_5'] = df['Close'].rolling(window=5).std()
df.loc[:, 'Volatility_10'] = df['Close'].rolling(window=10).std()

# High-Low Spread
df.loc[:, 'High_Low_spread'] = df['High'] - df['Low']
df.head()

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Close_return,AdjClose_return,High_return,...,High_Low_ratio,Close_Open_ratio,High_Close_ratio,Low_Open_ratio,MA_5,MA_10,MA_20,Volatility_5,Volatility_10,High_Low_spread
20,2000-02-01,7.250834,7.250834,7.253109,7.233304,7.240263,20.704083,0.001459,0.001459,0.001771,...,1.002738,1.00146,1.000314,0.999039,7.23936,7.252646,7.260537,,,0.019806
21,2000-02-02,7.250721,7.250721,7.258842,7.246717,7.250834,20.761139,-1.6e-05,-1.6e-05,0.00079,...,1.001673,0.999984,1.00112,0.999432,7.240075,7.249381,7.260882,,,0.012124
22,2000-02-03,7.261906,7.261906,7.262474,7.24317,7.250721,20.85998,0.001541,0.001541,0.0005,...,1.002665,1.001543,1.000078,0.998959,7.243816,7.247945,7.261691,,,0.019305
23,2000-02-04,7.261485,7.261485,7.269554,7.258856,7.261906,20.767378,-5.8e-05,-5.8e-05,0.000974,...,1.001474,0.999942,1.001111,0.99958,7.253042,7.24676,7.26243,,,0.010698
24,2000-02-07,7.261394,7.261394,7.263435,7.253704,7.261485,20.637817,-1.3e-05,-1.3e-05,-0.000842,...,1.001342,0.999987,1.000281,0.998928,7.257268,7.248367,7.261829,0.005928,,0.009731


In [24]:
df[['Volatility_5', 'Volatility_10', 'High_Low_spread']].isnull().sum()

Volatility_5       4
Volatility_10      9
High_Low_spread    0
dtype: int64

In [25]:
# Drop rows with NaN values in the new volatility columns
df = df.dropna(subset=['Volatility_5', 'Volatility_10', 'High_Low_spread'])


In [26]:
df.shape

(5967, 26)

In [27]:
# Set the output path for the dataset with volatility indicators
output_path_volatility = os.path.join(features_dir, "04_eda_with_volatility_indicators.csv")

# Save the dataframe to the new location
df.to_csv(output_path_volatility, index=False)

print(f"Dataset with volatility indicators saved to: {output_path_volatility}")


Dataset with volatility indicators saved to: /home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/04_eda_with_volatility_indicators.csv


## Momentum Indicators

In [28]:
# Rate of Change (ROC) - 10-day period
df['ROC_10'] = ((df['Close'] - df['Close'].shift(10)) / df['Close'].shift(10)) * 100

# Cumulative Return - Cumulative product of daily returns
df['Cumulative_return'] = (1 + df['Return']).cumprod() - 1


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5967 entries, 29 to 5995
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date                 5967 non-null   datetime64[ns]
 1   Adj Close            5967 non-null   float64       
 2   Close                5967 non-null   float64       
 3   High                 5967 non-null   float64       
 4   Low                  5967 non-null   float64       
 5   Open                 5967 non-null   float64       
 6   Volume               5967 non-null   float64       
 7   Close_return         5967 non-null   float64       
 8   AdjClose_return      5967 non-null   float64       
 9   High_return          5967 non-null   float64       
 10  Low_return           5967 non-null   float64       
 11  Return               5967 non-null   float64       
 12  Close_pct_change     5967 non-null   float64       
 13  AdjClose_pct_change  5967 non-null   

In [30]:
df.head()

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Close_return,AdjClose_return,High_return,...,High_Close_ratio,Low_Open_ratio,MA_5,MA_10,MA_20,Volatility_5,Volatility_10,High_Low_spread,ROC_10,Cumulative_return
29,2000-02-14,7.237016,7.237016,7.2406,7.230223,7.234985,20.647788,0.000281,0.000281,-0.002149,...,1.000495,0.999342,7.250865,7.254067,7.254958,0.015745,0.011712,0.010377,,0.000281
30,2000-02-15,7.245691,7.245691,7.249727,7.227118,7.237016,20.811368,0.001198,0.001198,0.00126,...,1.000557,0.998632,7.245285,7.253552,7.253099,0.009303,0.01198,0.022609,,0.001479
31,2000-02-16,7.235381,7.235381,7.247472,7.233874,7.245691,20.741891,-0.001424,-0.001424,-0.000311,...,1.001671,0.998369,7.24185,7.252018,7.250699,0.009115,0.013293,0.013598,,5.3e-05
32,2000-02-17,7.235806,7.235806,7.244142,7.229889,7.235381,20.757474,5.9e-05,5.9e-05,-0.00046,...,1.001152,0.999241,7.237776,7.249408,7.248677,0.00449,0.013692,0.014252,,0.000112
33,2000-02-18,7.204959,7.204959,7.236044,7.204387,7.235806,20.764696,-0.004272,-0.004272,-0.001118,...,1.004314,0.995658,7.231771,7.243756,7.245258,0.015568,0.018849,0.031657,,-0.004161


In [31]:
df[['ROC_10', 'Cumulative_return']].isnull().sum()

ROC_10               10
Cumulative_return     0
dtype: int64

In [32]:
df = df.dropna()

In [33]:
# Set the output path for the dataset with momentum indicators
output_path_momentum = os.path.join(features_dir, "05_eda_with_momentum_indicators.csv")

# Save the dataframe to the new location
df.to_csv(output_path_momentum, index=False)

print(f"Dataset with momentum indicators saved to: {output_path_momentum}")


Dataset with momentum indicators saved to: /home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/05_eda_with_momentum_indicators.csv


## Volume-Based Features

In [34]:
# Volume Percentage Change (Volume_pct_change)
df['Volume_pct_change'] = df['Volume'].pct_change() * 100

# Volume Moving Averages (Volume_MA_5)
df['Volume_MA_5'] = df['Volume'].rolling(window=5).mean()

# Volume Price Trend (VPT)
# Initialize the VPT with the first value as 0
df['VPT'] = 0.0
df['VPT'] = df['Volume'] * (df['Close_return'])  # Volume * Daily return
df['VPT'] = df['VPT'].cumsum()  # Cumulative sum of the Volume Price Trend

# Check the first few rows to ensure the features are calculated
df[['Date', 'Volume', 'Volume_pct_change', 'Volume_MA_5', 'VPT']].head()


Unnamed: 0,Date,Volume,Volume_pct_change,Volume_MA_5,VPT
39,2000-02-29,20.909164,,,0.039235
40,2000-03-01,20.965506,0.269459,,0.066229
41,2000-03-02,20.90442,-0.291364,,0.071612
42,2000-03-03,20.863289,-0.196759,,0.128209
43,2000-03-06,20.751853,-0.534122,20.878846,0.091609


In [35]:
df[['Volume_pct_change', 'Volume_MA_5', 'VPT']].isnull().sum()

Volume_pct_change    1
Volume_MA_5          4
VPT                  0
dtype: int64

In [36]:
df = df.dropna()

In [37]:
# Save the dataframe with the new features to the specified directory
output_path_volume_features = "/home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/06_eda_with_volume_features.csv"

# Save the dataframe to CSV
df.to_csv(output_path_volume_features, index=False)

print(f"Dataset with volume features saved to: {output_path_volume_features}")


Dataset with volume features saved to: /home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/06_eda_with_volume_features.csv


## Lag Features

In [38]:
# Lag Features
df['Close_lag_1'] = df['Close'].shift(1)  # Close price from the previous day
df['Volume_lag_1'] = df['Volume'].shift(1)  # Volume from the previous day

# You can add more lag features as needed (e.g., lag_2, lag_3, etc.)
df['Close_lag_2'] = df['Close'].shift(2)  # Close price from two days ago
df['Volume_lag_2'] = df['Volume'].shift(2)  # Volume from two days ago

df['Close_lag_3'] = df['Close'].shift(3)  # Close price from three days ago
df['Volume_lag_3'] = df['Volume'].shift(3)  # Volume from three days ago

# Check the first few rows
df.head()


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Close_return,AdjClose_return,High_return,...,Cumulative_return,Volume_pct_change,Volume_MA_5,VPT,Close_lag_1,Volume_lag_1,Close_lag_2,Volume_lag_2,Close_lag_3,Volume_lag_3
43,2000-03-06,7.237979,7.237979,7.251161,7.233275,7.250756,20.751853,-0.001764,-0.001764,-0.000111,...,0.000391,-0.534122,20.878846,0.091609,,,,,,
44,2000-03-07,7.212014,7.212014,7.243663,7.207852,7.237979,20.996418,-0.003594,-0.003594,-0.001035,...,-0.003204,1.178519,20.896297,0.016152,7.237979,20.751853,,,,
45,2000-03-08,7.220154,7.220154,7.225329,7.205353,7.212014,20.908084,0.001128,0.001128,-0.002534,...,-0.00208,-0.420708,20.884813,0.039737,7.212014,20.996418,7.237979,20.751853,,
46,2000-03-09,7.245434,7.245434,7.245527,7.21368,7.220154,20.83927,0.003495,0.003495,0.002792,...,0.001408,-0.32913,20.871783,0.112573,7.220154,20.908084,7.212014,20.996418,7.237979,20.751853
47,2000-03-10,7.2407,7.2407,7.253796,7.238547,7.245434,20.853241,-0.000654,-0.000654,0.001141,...,0.000754,0.067044,20.869773,0.098944,7.245434,20.83927,7.220154,20.908084,7.212014,20.996418


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5953 entries, 43 to 5995
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date                 5953 non-null   datetime64[ns]
 1   Adj Close            5953 non-null   float64       
 2   Close                5953 non-null   float64       
 3   High                 5953 non-null   float64       
 4   Low                  5953 non-null   float64       
 5   Open                 5953 non-null   float64       
 6   Volume               5953 non-null   float64       
 7   Close_return         5953 non-null   float64       
 8   AdjClose_return      5953 non-null   float64       
 9   High_return          5953 non-null   float64       
 10  Low_return           5953 non-null   float64       
 11  Return               5953 non-null   float64       
 12  Close_pct_change     5953 non-null   float64       
 13  AdjClose_pct_change  5953 non-null   

In [40]:
# Optionally, save the updated dataframe with lag features to a file
output_path_lag_features = "/home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/07_eda_with_lag_features.csv"

# Save the dataframe with lag features
df.to_csv(output_path_lag_features, index=False)

print(f"Dataset with lag features saved to: {output_path_lag_features}")

Dataset with lag features saved to: /home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/07_eda_with_lag_features.csv


## Technical Indicators

In [41]:
import talib  

# 1. Relative Strength Index (RSI)
# RSI using Close prices and a window of 14 (common default)
df['RSI_14'] = talib.RSI(df['Close'], timeperiod=14)

# 2. Moving Average Convergence Divergence (MACD)
# MACD line and Signal line
df['MACD'], df['MACD_signal'], _ = talib.MACD(
    df['Close'],
    fastperiod=12,  # Fast EMA period
    slowperiod=26,  # Slow EMA period
    signalperiod=9,  # Signal line period
)

# 3. Bollinger Bands
# Upper, Middle (moving average), and Lower bands
df['BB_upper'], df['BB_middle'], df['BB_lower'] = talib.BBANDS(
    df['Close'],
    timeperiod=20,  # Window for moving average
    nbdevup=2,      # Number of std devs above the moving average
    nbdevdn=2,      # Number of std devs below the moving average
    matype=0,       # Moving average type (0 = simple moving average)
)



In [42]:
# Check the first few rows
df.head()


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Close_return,AdjClose_return,High_return,...,Close_lag_2,Volume_lag_2,Close_lag_3,Volume_lag_3,RSI_14,MACD,MACD_signal,BB_upper,BB_middle,BB_lower
43,2000-03-06,7.237979,7.237979,7.251161,7.233275,7.250756,20.751853,-0.001764,-0.001764,-0.000111,...,,,,,,,,,,
44,2000-03-07,7.212014,7.212014,7.243663,7.207852,7.237979,20.996418,-0.003594,-0.003594,-0.001035,...,,,,,,,,,,
45,2000-03-08,7.220154,7.220154,7.225329,7.205353,7.212014,20.908084,0.001128,0.001128,-0.002534,...,7.237979,20.751853,,,,,,,,
46,2000-03-09,7.245434,7.245434,7.245527,7.21368,7.220154,20.83927,0.003495,0.003495,0.002792,...,7.212014,20.996418,7.237979,20.751853,,,,,,
47,2000-03-10,7.2407,7.2407,7.253796,7.238547,7.245434,20.853241,-0.000654,-0.000654,0.001141,...,7.220154,20.908084,7.212014,20.996418,,,,,,


In [43]:
df.isnull().sum()

Date                    0
Adj Close               0
Close                   0
High                    0
Low                     0
Open                    0
Volume                  0
Close_return            0
AdjClose_return         0
High_return             0
Low_return              0
Return                  0
Close_pct_change        0
AdjClose_pct_change     0
High_pct_change         0
Low_pct_change          0
High_Low_ratio          0
Close_Open_ratio        0
High_Close_ratio        0
Low_Open_ratio          0
MA_5                    0
MA_10                   0
MA_20                   0
Volatility_5            0
Volatility_10           0
High_Low_spread         0
ROC_10                  0
Cumulative_return       0
Volume_pct_change       0
Volume_MA_5             0
VPT                     0
Close_lag_1             1
Volume_lag_1            1
Close_lag_2             2
Volume_lag_2            2
Close_lag_3             3
Volume_lag_3            3
RSI_14                 14
MACD        

In [44]:
# Drop any rows with NaN values due to calculation windows
df = df.dropna()


In [47]:
# Save the updated dataframe with technical indicators to a file
output_path_technical_indicators = "/home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/08_eda_with_technical_indicators.csv"

# Save the dataframe with technical indicators
df.to_csv(output_path_technical_indicators, index=False)

print(f"Dataset with technical indicators saved to: {output_path_technical_indicators}")

Dataset with technical indicators saved to: /home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/08_eda_with_technical_indicators.csv


In [46]:
df.shape

(5920, 43)

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5920 entries, 76 to 5995
Data columns (total 43 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date                 5920 non-null   datetime64[ns]
 1   Adj Close            5920 non-null   float64       
 2   Close                5920 non-null   float64       
 3   High                 5920 non-null   float64       
 4   Low                  5920 non-null   float64       
 5   Open                 5920 non-null   float64       
 6   Volume               5920 non-null   float64       
 7   Close_return         5920 non-null   float64       
 8   AdjClose_return      5920 non-null   float64       
 9   High_return          5920 non-null   float64       
 10  Low_return           5920 non-null   float64       
 11  Return               5920 non-null   float64       
 12  Close_pct_change     5920 non-null   float64       
 13  AdjClose_pct_change  5920 non-null   

In [49]:
# Ensure 'Date' is in datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Extract year, month, and day of the week
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day_of_Week'] = df['Date'].dt.dayofweek  # Monday=0, Sunday=6

# Add flags for month-end and quarter-end
df['Is_Month_End'] = df['Date'].dt.is_month_end
df['Is_Quarter_End'] = df['Date'].dt.is_quarter_end

# Display the updated DataFrame
df.head()


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Close_return,AdjClose_return,High_return,...,MACD,MACD_signal,BB_upper,BB_middle,BB_lower,Year,Month,Day_of_Week,Is_Month_End,Is_Quarter_End
76,2000-04-20,7.2686,7.2686,7.269262,7.259876,7.263659,20.613674,0.00068,0.00068,-0.001164,...,0.001102,0.01355,7.358125,7.297112,7.236098,2000,4,3,False,False
77,2000-04-24,7.265332,7.265332,7.2686,7.249307,7.2686,20.582508,-0.00045,-0.00045,-9.1e-05,...,-1.2e-05,0.010838,7.354196,7.29381,7.233424,2000,4,0,False,False
78,2000-04-25,7.298066,7.298066,7.298222,7.265332,7.265332,20.791952,0.004495,0.004495,0.004067,...,0.001727,0.009016,7.350511,7.292263,7.234016,2000,4,1,False,False
79,2000-04-26,7.28687,7.28687,7.301782,7.284121,7.298066,20.722866,-0.001535,-0.001535,0.000488,...,0.002177,0.007648,7.347719,7.290689,7.233658,2000,4,2,False,False
80,2000-04-27,7.289556,7.289556,7.29248,7.268788,7.28687,20.828526,0.000369,0.000369,-0.001275,...,0.002719,0.006662,7.344766,7.289222,7.233678,2000,4,3,False,False


In [50]:
df.isnull().sum()

Date                   0
Adj Close              0
Close                  0
High                   0
Low                    0
Open                   0
Volume                 0
Close_return           0
AdjClose_return        0
High_return            0
Low_return             0
Return                 0
Close_pct_change       0
AdjClose_pct_change    0
High_pct_change        0
Low_pct_change         0
High_Low_ratio         0
Close_Open_ratio       0
High_Close_ratio       0
Low_Open_ratio         0
MA_5                   0
MA_10                  0
MA_20                  0
Volatility_5           0
Volatility_10          0
High_Low_spread        0
ROC_10                 0
Cumulative_return      0
Volume_pct_change      0
Volume_MA_5            0
VPT                    0
Close_lag_1            0
Volume_lag_1           0
Close_lag_2            0
Volume_lag_2           0
Close_lag_3            0
Volume_lag_3           0
RSI_14                 0
MACD                   0
MACD_signal            0


In [51]:
# Save the updated dataframe with technical indicators to a file
output_path_technical_indicators = "/home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/09_stock_data_with_datetime_features.csv"

# Save the dataframe with technical indicators
df.to_csv(output_path_technical_indicators, index=False)

print(f"Dataset with technical indicators saved to: {output_path_technical_indicators}")

Dataset with technical indicators saved to: /home/moraa-ontita/Documents/Machine-learning/Stock_Price_Prediction/artifacts/notebooks/features/09_stock_data_with_datetime_features.csv
