# Data Cleaning

In [7]:
import pandas as pd
import numpy as np

df = pd.read_csv("/workspaces/python-for-finance-jume8jume7/data/DASH_A1.csv")

df.drop_duplicates(inplace=True)
df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%Y")
df.set_index("Date", inplace=True)

# Fill missing Close with linear interpolation
df["Close"] = df["Close"].interpolate(method="linear")

# Fill missing Open with previous day Close
df["Open"] = df["Open"].fillna(df["Close"].shift(1))

# Fill missing Volume with forward-fill
df["Volume"] = df["Volume"].fillna(method="ffill")

# Fill High with max(Close, Open)
df["High"] = df["High"].fillna(df[["Close", "Open"]].max(axis=1))

# Fill Low with 3% less than High
df["Low"] = df["Low"].fillna(df["High"] * 0.97)

# Add monthly label
df["Month"] = df.index.to_period("M")

# Check if any missing values remain
print(df.isnull().sum())

Close     0
High      0
Low       0
Open      0
Volume    0
Month     0
dtype: int64


  df["Volume"] = df["Volume"].fillna(method="ffill")


In [9]:
df

Unnamed: 0_level_0,Close,High,Low,Open,Volume,Month
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-02-24,100.419998,100.919998,85.177002,86.879997,6639000.0,2022-02
2024-08-01,108.199997,112.769997,105.905998,108.620003,7965400.0,2024-08
2025-02-11,193.089996,194.000000,189.500000,190.919998,6771900.0,2025-02
2021-04-13,149.460007,150.360001,143.550003,146.839996,2823500.0,2021-04
2024-09-17,129.880005,131.369995,126.900002,131.350006,2825500.0,2024-09
...,...,...,...,...,...,...
2024-11-27,178.440002,180.179993,177.699997,179.990005,2031100.0,2024-11
2025-02-12,200.889999,201.169998,195.197998,198.000000,9989400.0,2025-02
2025-04-01,182.419998,183.014999,178.259995,182.050003,3740700.0,2025-04
2024-03-25,137.820007,138.899994,136.740005,137.050003,2162800.0,2024-03
