In [10]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("DASH_A1.csv")

# Display basic info and preview
df.info(), df.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1184 entries, 0 to 1183
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    1184 non-null   object 
 1   Close   1159 non-null   float64
 2   High    1160 non-null   float64
 3   Low     1168 non-null   float64
 4   Open    1167 non-null   float64
 5   Volume  1158 non-null   float64
dtypes: float64(5), object(1)
memory usage: 55.6+ KB


(None,
          Date       Close        High         Low        Open     Volume
 0  24-02-2022  100.419998  100.919998   85.177002   86.879997  6639000.0
 1  01-08-2024  108.199997  112.769997  105.905998  108.620003  7965400.0
 2  11-02-2025  193.089996  194.000000  189.500000  190.919998  6771900.0
 3  13-04-2021  149.460007  150.360001  143.550003  146.839996  2823500.0
 4  17-09-2024  129.880005  131.369995  126.900002  131.350006  2825500.0)

In [11]:
# Convert Date column to datetime and sort
df['Date'] = pd.to_datetime(df['Date'], format="%d-%m-%Y")
df.sort_values('Date', inplace=True)
df.reset_index(drop=True, inplace=True)

# Fill missing Open values with previous day's Close
df['Open'] = df['Open'].fillna(method='ffill')

# Fill missing Close values using forward fill
df['Close'] = df['Close'].fillna(method='ffill')

# Fill missing High/Low with monthly mean
df['Month'] = df['Date'].dt.to_period('M')
df['High'] = df.groupby('Month')['High'].transform(lambda x: x.fillna(x.mean()))
df['Low'] = df.groupby('Month')['Low'].transform(lambda x: x.fillna(x.mean()))

# Fill missing Volume values using conditional rules
volume_median = df['Volume'].median()

def fill_volume(row):
    if pd.notnull(row['Volume']):
        return row['Volume']
    elif row['Open'] == row['Close']:
        return 0
    else:
        return volume_median

df['Volume'] = df.apply(fill_volume, axis=1)

# Drop the helper 'Month' column
df.drop(columns=['Month'], inplace=True)

# Summary of missing values after cleaning
missing_summary = df.isnull().sum()
missing_summary



  df['Open'] = df['Open'].fillna(method='ffill')
  df['Close'] = df['Close'].fillna(method='ffill')


Date      0
Close     0
High      0
Low       0
Open      0
Volume    0
dtype: int64

In [14]:
# --- Feature Engineering ---

# 1. Simple Daily Returns
df['Simple Return'] = df['Close'].pct_change()

# 2. Logarithmic Returns
df['Log Return'] = np.log(df['Close'] / df['Close'].shift(1))

# 3. 20-Day Momentum
df['20D Momentum'] = df['Close'] - df['Close'].shift(20)

# 4. 20-Day Simple Moving Average
df['20D SMA'] = df['Close'].rolling(window=20).mean()

# 5. 20-Day Rolling Volatility
df['20D Volatility'] = df['Simple Return'].rolling(window=20).std()

# 6. Day of the Week
df['Weekday'] = df['Date'].dt.day_name()

# 7. Price Surge Identification
mean_return = df['Simple Return'].mean()
std_return = df['Simple Return'].std()
df['Price Surge'] = df['Simple Return'] > (mean_return + 4 * std_return)

# 8. Volume Spike Identification
mean_volume = df['Volume'].mean()
std_volume = df['Volume'].std()
df['Volume Spike'] = df['Volume'] > (mean_volume + 6 * std_volume)

# 9. Bollinger Bands
df['Upper Band'] = df['20D SMA'] + 2 * df['20D Volatility']
df['Lower Band'] = df['20D SMA'] - 2 * df['20D Volatility']

df


Unnamed: 0,Date,Close,High,Low,Open,Volume,Simple Return,Log Return,20D Momentum,20D SMA,20D Volatility,Weekday,Price Surge,Volume Spike,Upper Band,Lower Band
0,2020-12-09,189.509995,195.500000,163.800003,182.000000,25373700.0,,,,,,Wednesday,False,False,,
1,2020-12-10,186.000000,187.695007,172.636002,179.710007,3506800.0,-0.018521,-0.018695,,,,Thursday,False,False,,
2,2020-12-11,175.000000,182.000000,168.250000,176.520004,4760600.0,-0.059140,-0.060961,,,,Friday,False,False,,
3,2020-12-14,160.000000,170.000000,151.199997,169.100006,7859600.0,-0.085714,-0.089612,,,,Monday,False,False,,
4,2020-12-15,158.889999,161.419998,153.759995,157.100006,5017000.0,-0.006938,-0.006962,,,,Tuesday,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1179,2025-06-09,217.490005,219.830002,216.955002,218.029999,2710300.0,-0.004395,-0.004404,33.970001,206.409998,0.017753,Monday,False,False,206.445504,206.374493
1180,2025-06-10,214.970001,219.210007,210.927002,216.589996,3916700.0,-0.011587,-0.011654,22.869995,207.553498,0.015859,Tuesday,False,False,207.585216,207.521780
1181,2025-06-11,217.800003,219.529999,212.240005,214.184998,3091500.0,0.013165,0.013079,20.000000,208.553498,0.014953,Wednesday,False,False,208.583403,208.523593
1182,2025-06-12,216.600006,219.419998,215.675003,218.080002,2510400.0,-0.005510,-0.005525,19.760010,209.541499,0.014976,Thursday,False,False,209.571450,209.511547
