In [5]:
import pandas as pd

# Load the raw CSV and discard any row that is missing one of its key
# fields (product, customer, purchase date); these are dropped, not imputed.
required_cols = ["Product_ID", "Customer_ID", "Date_of_Purchase"]
df = pd.read_csv("clothing_store_dataset.csv").dropna(subset=required_cols)

# Column groups, split by how their missing values are imputed below
numeric_cols = [
    "Customer_Age",
    "Quantity",
    "Total_Purchase_Amount",
    "Profit_or_Loss",
    "Ratings",
    "Year_of_Purchase",
]
categorical_cols = ["Product_Name", "Country", "Product_Type"]
categorical_cols2 = [
    "Customer_Name",
    "Gender",
    "Mode_of_Purchase",
    "Month_of_Purchase",
]

# Numeric gaps take the median of their own column
for col in numeric_cols:
    column_values = df[col]
    df[col] = column_values.fillna(column_values.median())

# Categorical gaps take a fixed placeholder label (not the mode):
# 'Others' for the first group, 'Unknown' for the second
placeholder_groups = (
    (categorical_cols, 'Others'),
    (categorical_cols2, 'Unknown'),
)
for group_cols, placeholder in placeholder_groups:
    for col in group_cols:
        df[col] = df[col].fillna(placeholder)

# Cast identifier/count columns to int and monetary/score columns to float
# in a single pass; a value that cannot be cast raises, as it did before.
df = df.astype({
    "Customer_ID": int,
    "Customer_Age": int,
    "Quantity": int,
    "Year_of_Purchase": int,
    "Total_Purchase_Amount": float,
    "Profit_or_Loss": float,
    "Ratings": float,
})

# Parse the purchase date. With errors="coerce", values that cannot be parsed
# become NaT instead of raising.
# NOTE(review): no explicit format/dayfirst is passed, so ambiguous dates
# (e.g. 03-04-2023) follow pandas' default inference — confirm against the raw file.
df["Date_of_Purchase"] = pd.to_datetime(df["Date_of_Purchase"], errors="coerce")

# A NaT would be written back as a missing value by strftime, silently
# reintroducing rows without a purchase date. Drop them here, consistent with
# the earlier removal of rows whose Date_of_Purchase was absent.
df = df.dropna(subset=["Date_of_Purchase"])

# Store the date as a DD-MM-YYYY string
df["Date_of_Purchase"] = df["Date_of_Purchase"].dt.strftime("%d-%m-%Y")

# Normalise Gender: title-case every value, then expand the single-letter
# codes to their full labels. Values not in the map are left as title-cased.
gender_aliases = {"M": "Male", "F": "Female", "O": "Other"}
gender_titled = df["Gender"].str.title()
df["Gender"] = gender_titled.replace(gender_aliases)

# Normalise Mode_of_Purchase: title-case every value, then fold the known
# spellings of an in-store purchase into the single label "In-Person".
# The keys must be written in their title-cased form because the lookup runs
# after .str.title(): "in person" arrives here as "In Person" (capital P),
# so the previous key "In person" could never match.
df["Mode_of_Purchase"] = df["Mode_of_Purchase"].str.title().replace({
    "In Person": "In-Person", "Inperson": "In-Person", "Offline": "In-Person"
})

# Drop fully identical rows (done after standardisation, so rows that only
# differed in spelling/case now collapse), then renumber the index from 0.
df = (
    df
    .drop_duplicates()
    .reset_index(drop=True)
)



In [6]:
# Per-column count of remaining missing values after cleaning
df.isna().sum()

Product_ID               0
Product_Name             0
Customer_ID              0
Customer_Name            0
Customer_Age             0
Gender                   0
Country                  0
Quantity                 0
Total_Purchase_Amount    0
Profit_or_Loss           0
Date_of_Purchase         0
Mode_of_Purchase         0
Ratings                  0
Product_Type             0
Month_of_Purchase        0
Year_of_Purchase         0
dtype: int64

In [7]:
# Write the cleaned frame to disk without the index column, then confirm
clean_csv_path = "clothing_store_dataset_clean.csv"
df.to_csv(clean_csv_path, index=False)
print("Cleaned and standardized dataset saved as clothing_store_dataset_clean.csv")


Cleaned and standardized dataset saved as clothing_store_dataset_clean.csv
