# 1. Cleaning data

In [139]:
import pandas as pd
import numpy as np
df=pd.read_csv("DASH_A1.csv")
df

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,24-02-2022,100.419998,100.919998,85.177002,86.879997,6639000.0
1,01-08-2024,108.199997,112.769997,105.905998,108.620003,7965400.0
2,11-02-2025,193.089996,194.0,189.5,190.919998,6771900.0
3,13-04-2021,149.460007,150.360001,143.550003,146.839996,2823500.0
4,17-09-2024,129.880005,131.369995,126.900002,131.350006,2825500.0
5,06-09-2022,58.23,58.860001,56.950001,58.259998,5699000.0
6,15-11-2024,169.429993,173.509995,169.100006,173.509995,3601600.0
7,18-07-2024,100.279999,104.330002,99.889999,103.059998,6227000.0
8,02-03-2022,103.620003,104.32,100.0,103.599998,3297800.0
9,22-04-2022,89.0,95.5,88.690002,91.489998,5625000.0


In [140]:
# Convert date to datetime
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df.sort_values('Date', inplace=True)

In [141]:
#Clean data
df['Close'] = df['Close'].ffill()
df['Open'] = df['Open'].fillna(df['Close'].shift(1))
df['Month'] = df['Date'].dt.month
monthly_high_mean = df.groupby('Month')['High'].transform('mean')
df['High'] = df['High'].fillna(monthly_high_mean)
monthly_low_mean = df.groupby('Month')['Low'].transform('mean')
df['Low'] = df['Low'].fillna(monthly_low_mean)
df.drop('Month', axis=1, inplace=True)
df.isnull().sum()

Date       0
Close      0
High       0
Low        0
Open       0
Volume    26
dtype: int64

In [142]:
# Identify missing volume rows
missing_volume = df['Volume'].isna()
# Rule 4: Close == Open -> fill missing Volume with 0
zero_vol_mask = missing_volume & (df['Close'] == df['Open'])
df.loc[zero_vol_mask, 'Volume'] = 0

# Rule 5: Close != Open -> fill missing Volume with median of existing volumes
non_zero_vol_mask = missing_volume & (df['Close'] != df['Open'])
volume_median = df['Volume'].median()
df.loc[non_zero_vol_mask, 'Volume'] = volume_median

In [143]:
df.isnull().sum()

Date      0
Close     0
High      0
Low       0
Open      0
Volume    0
dtype: int64

In [144]:
df.to_csv('cleaned_data.csv')

## 2. Feature engineering

In [145]:
import pandas as pd
#Set date as index
df.set_index('Date', inplace=True)

# 1. Simple Daily Returns
df['Daily_Return'] = df['Close'].pct_change()

# 2. Logarithmic Returns
df["PrevClose"] = df["Close"].shift(1)
df['LogReturn'] = np.log(df['Close'] / df["PrevClose"])

# 3. 20-Day Momentum
df['20D_Momentum'] = df['Close'] - df['Close'].shift(20)

# 4. 20-Day Simple Moving Average (SMA)
df['20D_SMA'] = df['Close'].rolling(window=20).mean()

# 5. 20-Day Rolling Volatility
df['20D_Volatility'] = df['Daily_Return'].rolling(window=20).std()

# 6. Day of the Week
df['Day_of_the_week'] = df.index.strftime('%a')

# 7. Price Surge Identification
mean_return = df["Daily_Return"].mean()
std_return = df["Daily_Return"].std()
return_threshold = mean_return + (4 * std_return)

condition = df["Daily_Return"] > return_threshold

df["Price_Surge"] = condition

# 8. Volume Spike Identification
mean_volume = df["Volume"].mean()
std_volume = df["Volume"].std()
volume_threshold = mean_volume + (6 * std_volume)

condition = df["Volume"] > volume_threshold

df["Volume_Spike"] = condition

# 9. Bollinger Bands (20-day SMA ± 2 std)
df['Dev'] = df['Close'].rolling(window=20).std()
df['BB_High'] = df['20D_SMA'] + 2 * df['Dev']
df['BB_Low'] = df['20D_SMA'] - 2 * df['Dev']


In [146]:
pd.set_option('display.max_rows', None)
df

Unnamed: 0_level_0,Close,High,Low,Open,Volume,Daily_Return,PrevClose,LogReturn,20D_Momentum,20D_SMA,20D_Volatility,Day_of_the_week,Price_Surge,Volume_Spike,Dev,BB_High,BB_Low
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-12-09,189.509995,195.5,163.800003,182.0,25373700.0,,,,,,,Wed,False,False,,,
2020-12-10,186.0,187.695007,172.636002,179.710007,3506600.0,-0.018521,189.509995,-0.018695,,,,Thu,False,False,,,
2020-12-11,175.0,182.0,168.25,176.520004,4760600.0,-0.05914,186.0,-0.060961,,,,Fri,False,False,,,
2020-12-14,160.0,170.0,151.199997,169.100006,7859600.0,-0.085714,175.0,-0.089612,,,,Mon,False,False,,,
2020-12-15,158.889999,161.419998,153.759995,157.100006,5017000.0,-0.006938,160.0,-0.006962,,,,Tue,False,False,,,
2020-12-15,158.889999,161.419998,153.759995,157.100006,5017000.0,0.0,158.889999,0.0,,,,Tue,False,False,,,
2020-12-16,158.050003,167.470001,157.199997,161.179993,3061300.0,-0.005287,158.889999,-0.005301,,,,Wed,False,False,,,
2020-12-17,154.210007,161.660004,123.524465,159.100006,6369800.0,-0.024296,158.050003,-0.024596,,,,Thu,False,False,,,
2020-12-18,166.350006,169.949997,147.320007,154.210007,8157800.0,0.078724,154.210007,0.075779,,,,Fri,False,False,,,
2020-12-21,160.229996,173.460007,155.0,169.070007,3884200.0,-0.03679,166.350006,-0.037484,,,,Mon,False,False,,,
