**Cleaning Stock Market dataset**

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

- Load the dataset

In [16]:
# Raw S&P 500 export, addressed relative to the notebook's working directory.
Original_FILE_PATH = Path("../data/raw/orignial_sp500.csv")

# skiprows=2 discards the first two lines of the file; read_csv then treats
# the next remaining line as the header row.
df = pd.read_csv(
    filepath_or_buffer=Original_FILE_PATH,
    skiprows=2,
)


- Rename the columns

In [17]:
# Expected column labels; they are applied positionally when a rename is needed.
columns_names = ["Date", "Open", "High", "Low", "Close", "Volume"]

# Relabel only when the current labels differ, so re-running the cell is a no-op.
if list(df.columns) == columns_names:
    print("Names are correct")
else:
    print("Renaming done")
    # Assigning to .columns raises if the number of labels does not match
    # the number of columns in the frame.
    df.columns = columns_names

Renaming done


- Convert the Date column's data type

In [18]:
# Parse the Date column into datetimes unless it already holds them.
# is_datetime64_any_dtype accepts every datetime64 resolution (and tz-aware
# columns), whereas comparing against np.dtype("datetime64[ns]") only matches
# nanosecond resolution and reports a needless conversion for anything else.
if pd.api.types.is_datetime64_any_dtype(df["Date"]):
    print("Correct data type")
else:
    print("Converting datatype...")
    df["Date"] = pd.to_datetime(df["Date"])

Converting datatype...


- Ensure all other columns are numeric

In [19]:
# Every column except Date is expected to hold numbers.
numeric_columns = ["Open", "High", "Low", "Close", "Volume"]

for col in numeric_columns:
    # Columns that are already numeric are reported and left untouched.
    if pd.api.types.is_numeric_dtype(df[col]):
        print(f"{col} has correct data type")
        continue

    # The message is printed before the conversion; pd.to_numeric raises on
    # values it cannot parse (default errors="raise").
    print(f"Converted {col} to Numeric..")
    df[col] = pd.to_numeric(df[col])

print("\n ......  Perfect!  ....")

Open has correct data type
High has correct data type
Low has correct data type
Close has correct data type
Volume has correct data type

 ......  Perfect!  ....


- Sort & Set index

In [20]:
# Sort chronologically. A stable sort keeps the original relative order of any
# rows that share the same date.
if df["Date"].is_monotonic_increasing:
    print("Already sorted")
else:
    print("Sort..")
    # sort_values is a DataFrame method (there is no pd.sort_values) and it
    # returns a new frame, so the result has to be assigned back to df.
    df = df.sort_values("Date", kind="stable")

# Use Date as the index. drop=False keeps the Date column as well, because the
# duplicate-date check further down still reads df["Date"].
# NOTE(review): with both an index and a column named Date, to_csv presumably
# writes the dates twice — confirm whether that is intended.
if df.index.name == "Date":
    print("Already set")
else:
    print("\n Index set")
    df = df.set_index("Date", drop=False)

Already sorted

 Index set


- Save version 2

In [21]:
# Output folder for cleaned snapshots. mkdir is called without parents=True,
# so the parent directory must already exist.
PROCESSED_DIRE = Path("../data/processed")
PROCESSED_DIRE.mkdir(exist_ok=True)

version_2_dire = PROCESSED_DIRE / "version_2_sp500.csv"

# Write the snapshot only once: an existing file is left untouched, so it is
# not refreshed if df has changed since it was first saved.
if version_2_dire.exists():
    print("Version 2 already exists")
else:
    print("Verion 2 added")
    df.to_csv(version_2_dire)


Version 2 already exists


- Check for the existence of missing data

In [22]:
# Per-column count of missing cells.
missing_data = df.isna().sum()

# Any non-zero total means at least one cell in the frame is missing.
if missing_data.sum() != 0:
    print("Data should be cleaned")
else:
    print("No missing data foumd")

No missing data foumd


- Check for the existence of duplicated rows

In [23]:
# Number of rows whose column values repeat an earlier row.
# DataFrame.duplicated().sum() is already a scalar count, so it is compared
# directly instead of calling .sum() on it a second time.
duplicated = df.duplicated().sum()

if duplicated == 0:
    print("No duplication")
else:
    print("Duplication found and should be handled")

No duplication


- Index column: duplicate values are prohibited
    * Check if there are duplicate values in the Date column

In [24]:
# Number of dates that repeat an earlier date in the Date column.
# Series.duplicated().sum() is already a scalar count, so it is compared
# directly instead of calling .sum() on it a second time.
index_dup_checker = df["Date"].duplicated().sum()

if index_dup_checker == 0:
    print("PERFECT!, no duplication in index column")
else:
    print("Duplication in index column found and should be handled")

PERFECT!, no duplication in index column


- Check valid prices
    * low <= open and low <= close
    * high >= open and high >= close
    * high >= low

In [25]:
# A row is inconsistent when Open or Close falls outside [Low, High], or when
# High is below Low. All comparisons are strict, so equal values are accepted.
price_violations = (
    (df["Low"] > df["Open"])
    | (df["Close"] > df["High"])
    | (df["Low"] > df["High"])
    | (df["Open"] > df["High"])
    | (df["Close"] < df["Low"])
)
invalid_prices = df.loc[price_violations]

if not invalid_prices.empty:
    print("Prices are Invalid")
else:
    print("Prices are Valid")

Prices are Valid


- Check valid Volume
   * Not negative

In [26]:

# Rows whose traded volume is below zero; a volume of exactly zero is accepted.
negative_volume = df["Volume"].lt(0)
invalid_volume = df.loc[negative_volume]

if not invalid_volume.empty:
    print("Volumes are Invalid")
else:
    print("Volumes are Valid")

Volumes are Valid


- Check Impossible prices
  * Not Negative
  * Not Zero

In [27]:
# Price columns that must be strictly positive.
columns = ["Open", "High", "Low", "Close"]

# Keep every row in which at least one of those prices is zero or negative.
has_non_positive_price = df[columns].le(0).any(axis=1)
impos_prices = df.loc[has_non_positive_price]

if not impos_prices.empty:
    print("Column contain Invalid prices")
else:
    print(" Columns Valid prices")


 Columns Valid prices


- Save Version 3

In [28]:
# Snapshot path inside the processed-data folder (PROCESSED_DIRE is defined in
# the cell that saves version 2).
version_3_dire = PROCESSED_DIRE / "version_3_sp500.csv"

# Write the snapshot only once: an existing file is left untouched, so it is
# not refreshed if df has changed since it was first saved.
if version_3_dire.exists():
    print("Version 3 already exists")
else:
    print("Verion 3 added")
    df.to_csv(version_3_dire)

Verion 3 added
