# Cleaning the Stock Market Dataset

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

- Load the dataset

In [None]:
# Location of the raw S&P 500 export, relative to this notebook.
# NOTE(review): "orignial" looks like a typo but must match the file name on disk — confirm before renaming.
Original_FILE_PATH = Path("../data") / "raw" / "orignial_sp500.csv"

# The first two lines of the file are skipped; the line after them is read as the header row.
df = pd.read_csv(filepath_or_buffer=Original_FILE_PATH, skiprows=2)


- Rename the columns

In [None]:
# Expected header, in file order.
columns_names = ["Date", "Open", "High", "Low", "Close", "Volume"]

# Relabel only when the current header differs from the expected one.
if df.columns.tolist() == columns_names:
    print("Names are correct")
else:
    print("Renaming done")
    df.columns = columns_names

- Convert the Date column's data type

In [None]:
# Parse the Date column only if it is not already stored as datetime64[ns].
date_is_datetime = df["Date"].dtype == np.dtype("datetime64[ns]")

if date_is_datetime:
    print("Correct data type")
else:
    print("Converting datatype...")
    df["Date"] = pd.to_datetime(df["Date"])

- Ensure all other columns are numeric

In [None]:
# Price and volume columns that must hold numbers.
numeric_columns = ["Open", "High", "Low", "Close", "Volume"]

for col in numeric_columns:
    # Columns that are already numeric are reported and left untouched.
    if pd.api.types.is_numeric_dtype(df[col]):
        print(f"{col} has correct data type")
        continue
    print(f"Converted {col} to Numeric..")
    # to_numeric raises on values it cannot parse, so bad data surfaces here.
    df[col] = pd.to_numeric(df[col])
print("\n ......  Perfect!  ....")

- Sort & Set index

In [None]:
# Sort chronologically.
# sort_values is a DataFrame method (pandas has no top-level pd.sort_values) and it
# returns a new frame, so the result has to be assigned back to df.
if not df["Date"].is_monotonic_increasing:
    print("Sort..")
    df = df.sort_values("Date")
else:
    print("Already sorted")

# Use Date as the index. drop=False keeps Date as a regular column as well,
# because later cells still read df["Date"].
if df.index.name != "Date":
    print("\n Index set")
    df = df.set_index("Date", drop=False)
else:
    print("Already set")

- Save version 2

In [None]:
# Output folder for the cleaned snapshots.
PROCESSED_DIRE = Path("../data/processed")
# parents=True also creates missing intermediate folders, so the cell does not
# fail with FileNotFoundError when ../data does not exist yet.
PROCESSED_DIRE.mkdir(parents=True, exist_ok=True)

version_2_dire = PROCESSED_DIRE / "version_2_sp500.csv"

# Write the snapshot only once; an existing file is left untouched.
if not version_2_dire.exists():
    print("Verion 2 added")
    df.to_csv(version_2_dire)
else:
    print("Version 2 already exists")


- Check for missing data

In [None]:
# Number of NaN cells per column.
missing_data = df.isna().sum()

# Any non-zero total means at least one cell is missing somewhere in the frame.
if missing_data.sum() != 0:
    print("Data should be cleaned")
else:
    print("No missing data foumd")

- Check for duplicated rows

In [None]:
# Count of rows that repeat an earlier row in every column.
duplicated = df.duplicated().sum()

# duplicated is already a scalar count, so it is compared directly.
if duplicated != 0:
    print("Duplication found and should be handled")
else:
    print("No duplication")

- Index column: duplicate values are prohibited
    * Check whether the Date column contains duplicate values

In [None]:
# Count of dates that repeat an earlier date.
index_dup_checker = df["Date"].duplicated().sum()

# index_dup_checker is already a scalar count, so it is compared directly.
if index_dup_checker != 0:
    print("Duplication in index column found and should be handled")
else:
    print("PERFECT!, no duplication in index column")

- Check valid prices
    * Low <= Open and Low <= Close
    * High >= Open, High >= Close and High >= Low

In [None]:
# One boolean mask per OHLC consistency rule; a row is invalid if it breaks any of them.
open_below_low = df["Open"] < df["Low"]
high_below_close = df["High"] < df["Close"]
high_below_low = df["High"] < df["Low"]
high_below_open = df["High"] < df["Open"]
low_above_close = df["Low"] > df["Close"]

invalid_prices = df[
    open_below_low
    | high_below_close
    | high_below_low
    | high_below_open
    | low_above_close
]

if not invalid_prices.empty:
    print("Prices are Invalid")
else:
    print("Prices are Valid")

- Check valid Volume
   * Not negative

In [None]:

# Rows with a negative traded volume.
invalid_volume = df.loc[df["Volume"].lt(0)]

if not invalid_volume.empty:
    print("Volumes are Invalid")
else:
    print("Volumes are Valid")

- Check Impossible prices
  * Not Negative
  * Not Zero

In [None]:
# Price columns that must be strictly positive.
columns = ["Open", "High", "Low", "Close"]

# A row is flagged when at least one of its price columns is zero or negative.
has_non_positive_price = df[columns].le(0).any(axis="columns")
impos_prices = df.loc[has_non_positive_price]

if not impos_prices.empty:
    print("Column contain Invalid prices")
else:
    print(" Columns Valid prices")


- Save Version 3

In [None]:
# Snapshot taken after the validity checks; written only if it does not exist yet.
version_3_dire = PROCESSED_DIRE.joinpath("version_3_sp500.csv")

if version_3_dire.exists():
    print("Version 3 already exists")
else:
    print("Verion 3 added")
    df.to_csv(version_3_dire)

- Check missing trading dates

In [None]:
# Every Monday-Friday date between the first and last index value.
# freq="B" is a plain business-day calendar and knows nothing about exchange
# holidays, so market holidays will show up in missing_dates as well.
# Assumes df is indexed by datetime values (set in the "Sort & Set index" cell).
trading_dates = pd.date_range(start=df.index.min(), end=df.index.max(), freq="B")
missing_dates = trading_dates.difference(df.index)

# Test missing_dates here: the earlier missing_data Series holds per-column NaN
# counts and is never empty, so checking it always reported gaps.
if len(missing_dates) == 0:
    print("No Gaps between trading dates")
else:
    print("There are missing dates")
    display(missing_dates[:20])

- Check extreme daily price changes
    * The S&P 500 close rarely moves more than 10% in a single day, so flag any day whose close-to-close change exceeds 10%

In [None]:
# Fractional change of Close relative to the previous row.
daily_price = df["Close"].pct_change()

# Rows whose absolute change is larger than 10%.
is_extreme = daily_price.abs().gt(0.10)
extreme = df.loc[is_extreme]

if not extreme.empty:
    print("There are huge gabs in some days")
    display(extreme)
else:
    print("Not much difference")

- Check abnormal high/low

In [None]:
# Intraday High-Low spread expressed as a fraction of Close.
# Named price_range rather than range so the builtin range() is not shadowed
# for every cell that runs afterwards.
price_range = (df["High"] - df["Low"]) / df["Close"]

# Rows whose spread exceeds 15% of the closing price.
abnormal = df[price_range > 0.15]

if abnormal.empty:
    print("Normal")
else:
    print("Abnormal")
    display(abnormal)

In [None]:
# Helper columns: 30-row rolling mean of Volume and each row's Volume relative to it.
# The first 29 rows have no full window, so both helpers are NaN there and never match below.
df["volume30"] = df["Volume"].rolling(window=30).mean()
df["v_ratio"] = df["Volume"].div(df["volume30"])

# Outlier: volume above 5x or below 0.2x its rolling mean.
ratio_too_high = df["v_ratio"].gt(5)
ratio_too_low = df["v_ratio"].lt(0.2)
outlier = df.loc[ratio_too_high | ratio_too_low]

if not outlier.empty:
    print("Outliers exist")
else:
    print("No volume outliers")

In [None]:
# Preview the first rows. When cells run in order, the frame still carries the
# helper columns volume30 and v_ratio at this point.
df.head()

In [None]:
# Remove the temporary volume-outlier helper columns. errors="ignore" keeps the
# cell re-runnable when the columns are already gone.
df = df.drop(columns=["volume30", "v_ratio"], errors="ignore")

- Save version 4

In [None]:
# Final snapshot after the outlier checks; written only if it does not exist yet.
version_4_dire = PROCESSED_DIRE.joinpath("version_4_sp500.csv")

if version_4_dire.exists():
    print("Version 4 already exists")
else:
    print("Version 4 added")
    df.to_csv(version_4_dire)

In [None]:
# Final summary of the frame: index, column dtypes, non-null counts and memory usage.
df.info()