In [1]:
import pandas as pd

def clean_customers(
    raw: pd.DataFrame,
    numeric_columns=('annual_income_(k$)', 'spending_score_(1-100)'),
) -> pd.DataFrame:
    """Return a cleaned copy of a customer DataFrame.

    Steps, in order:
      1. Drop fully duplicated rows.
      2. Standardize column names: strip surrounding whitespace, lowercase,
         and replace spaces with underscores.
      3. Convert each column in ``numeric_columns`` to a numeric dtype;
         values that cannot be parsed become NaN.
      4. Drop every row that contains a null, including the NaNs
         introduced by step 3.

    Args:
        raw: Input frame. It is not modified.
        numeric_columns: Standardized (post-step-2) names of the columns
            to convert to numeric.

    Returns:
        A new DataFrame with the cleaning steps applied.

    Raises:
        KeyError: If a name in ``numeric_columns`` is not present after
            the column names have been standardized.
    """
    # Explicit copy so the column assignments below operate on an
    # independent frame rather than on the result of a row selection.
    cleaned = raw.drop_duplicates().copy()

    cleaned.columns = (
        cleaned.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)
    )

    for column in numeric_columns:
        cleaned[column] = pd.to_numeric(cleaned[column], errors='coerce')

    # Nulls are dropped last so that values coerced to NaN above are removed
    # as well; previously nulls were only counted and never removed.
    return cleaned.dropna()


if __name__ == "__main__":
    # Load the dataset
    df = pd.read_csv('Mall_Customers.csv')

    # Show first few rows
    print("Original Dataset:")
    print(df.head())

    # Report missing values per column before cleaning
    print("\nMissing Values:")
    print(df.isnull().sum())

    # Remove duplicates, standardize column names, fix dtypes, drop nulls
    df = clean_customers(df)

    # Final summary
    print("\nCleaned Dataset:")
    print(df.head())

    # Save cleaned file
    df.to_csv("Cleaned_Mall_Customers.csv", index=False)

    # Summary of changes (each entry corresponds to a step that was applied)
    summary = {
        "Removed Nulls": True,
        "Removed Duplicates": True,
        "Standardized Column Names": True,
        "Fixed Data Types": True
    }

    print("\nSummary of Changes:")
    print(summary)

Original Dataset:
   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40

Missing Values:
CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

Cleaned Dataset:
   customerid  gender  age  annual_income_(k$)  spending_score_(1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5