# Import Libraries

In [1]:
from glob import glob
from pathlib import Path, PureWindowsPath
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.io import arff

# Data Loading

In [2]:
# Collect every raw ARFF file. glob returns matches in a filesystem-dependent
# order, so sort them to keep the processing order reproducible across machines.
paths = sorted(glob("../data/raw/*.arff"))

In [3]:
# Dataset name = file name without its final extension.
# PureWindowsPath accepts both "/" and "\\" as separators on every OS, and
# .stem strips only the last suffix, so "KC1.v2.arff" stays "KC1.v2" instead of
# being cut at the first dot (which could make two datasets share one name).
dataset_names = [PureWindowsPath(path).stem for path in paths]

In [4]:
# Parse every ARFF file into a DataFrame, in the same order as `paths`.
# loadarff returns a (records, metadata) pair; only the records are used, and
# the comprehension keeps loop temporaries out of the notebook namespace.
dfs: List[pd.DataFrame] = [pd.DataFrame(arff.loadarff(path)[0]) for path in paths]

# Data Cleaning

From EDA we found that:
- The `Defective` column needs to be encoded as a binary label (`Y` → 1, `N` → 0)
- There are missing values that need to be imputed or dropped
- There are duplicate rows that need to be dropped


In [5]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of one raw dataset.

    Steps, in order:
      1. Map the ``Defective`` label from ``b"Y"`` / ``b"N"`` to 1 / 0.
         Any other value (including a missing label) becomes NaN.
      2. Drop every row that contains a missing value in any column.
      3. Drop exact duplicate rows, keeping the first occurrence.
      4. Cast ``Defective`` to ``bool``.

    The cast happens last on purpose: ``NaN`` converts to ``True`` under
    ``astype(bool)``, so casting before ``dropna`` would silently turn rows
    with a missing or unrecognised label into "defective" rows.

    Args:
        df: Raw dataset with a ``Defective`` column holding ``b"Y"`` / ``b"N"``.

    Returns:
        A new DataFrame; ``df`` itself is not modified. The original row index
        of the surviving rows is preserved.

    Raises:
        KeyError: If ``df`` has no ``Defective`` column.
    """
    label_codes = {b"Y": 1, b"N": 0}

    return (
        # assign() works on a copy, so the caller's frame is left untouched.
        df.assign(Defective=df["Defective"].map(label_codes))
        .dropna()
        .drop_duplicates()
        .astype({"Defective": bool})
    )

In [6]:
# Clean each dataset and write it to ../data/processed/<dataset_name>.csv.
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
Path("../data/processed").mkdir(parents=True, exist_ok=True)

for df, dataset_name in zip(dfs, dataset_names):
    clean_df(df).to_csv(f"../data/processed/{dataset_name}.csv", index=False)