In [6]:
# 1. Import libraries
import pandas as pd
from google.colab import files

# 2. Load dataset
# Single source of truth for the input filename used by every step below.
DATA_FILE = "Mall_Customers.csv"

try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    # The CSV is not in the working directory yet: fall back to the Colab
    # upload widget and load whatever the user provides.
    print("Mall_Customers.csv not found. Please upload the file.")
    uploaded = files.upload()
    if DATA_FILE not in uploaded:
        # Raise instead of calling exit(): inside a notebook kernel exit()
        # requests a kernel shutdown rather than raising, so the remaining
        # lines of the cell would still run with `df` undefined. An exception
        # stops the cell immediately and keeps the kernel alive for a retry.
        raise FileNotFoundError("Failed to upload Mall_Customers.csv.") from None
    # Persist the uploaded bytes under the expected name, then load them.
    with open(DATA_FILE, "wb") as f:
        f.write(uploaded[DATA_FILE])
    df = pd.read_csv(DATA_FILE)
    print("File uploaded and loaded successfully.")


# 3. Quick view
# Show the frame's dimensions followed by its first rows.
print(df.shape, df.head(), sep="\n")

# 4. Standardize column names
# The assignment is positional: the i-th existing column receives the i-th
# name below, so it relies on the CSV having exactly these five columns in
# this order.
clean_names = [
    "customer_id",
    "gender",
    "age",
    "annual_income_k",
    "spending_score",
]
df.columns = clean_names

# 5. Check duplicates
# Count rows that repeat an earlier row across every column, report the
# total, then drop the repeats.
n_duplicates = df.duplicated().sum()
print("Duplicates:", n_duplicates)
df = df.drop_duplicates()

# 6. Coerce numeric columns
# errors="coerce" turns any value that cannot be parsed as a number into NaN,
# so this step can itself create missing values.
for col in ("age", "annual_income_k", "spending_score"):
    df[col] = pd.to_numeric(df[col], errors="coerce")

# 7. Check missing values
# Reported AFTER coercion so that NaNs introduced by the step above are
# counted, and BEFORE the strict integer cast below so the report is still
# printed if that cast fails.
print("Missing values:\n", df.isnull().sum())

# customer_id uses a strict cast: astype(int) raises on values that are not
# valid integers instead of silently converting them to NaN.
df["customer_id"] = df["customer_id"].astype(int)

# 8. Standardize categorical values
# Trim surrounding whitespace, then title-case each label so spelling
# variants such as " male" and "MALE" collapse to "Male".
trimmed_gender = df["gender"].str.strip()
df["gender"] = trimmed_gender.str.title()

# 9. Outlier check
print(df.describe())

# Optional: remove extreme ages if found
# Inclusive bounds for ages that are kept.
MIN_AGE, MAX_AGE = 10, 90
age_in_range = df["age"].between(MIN_AGE, MAX_AGE)

# between() evaluates to False for NaN, so rows whose age is missing are
# dropped as well. Report any removal so data loss is never silent; nothing
# is printed when every row passes.
n_removed = int((~age_in_range).sum())
if n_removed:
    print(
        f"Removed {n_removed} row(s) with age missing or outside "
        f"[{MIN_AGE}, {MAX_AGE}]."
    )
df = df[age_in_range]

# 10. Save cleaned dataset
# index=False leaves the row index out of the file, so only the five data
# columns are written.
cleaned_path = "mall_customers_cleaned.csv"
df.to_csv(cleaned_path, index=False)
print("Cleaned dataset saved!")

(200, 5)
   CustomerID   Genre  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40
Duplicates: 0
Missing values:
 customer_id        0
gender             0
age                0
annual_income_k    0
spending_score     0
dtype: int64
       customer_id         age  annual_income_k  spending_score
count   200.000000  200.000000       200.000000      200.000000
mean    100.500000   38.850000        60.560000       50.200000
std      57.879185   13.969007        26.264721       25.823522
min       1.000000   18.000000        15.000000        1.000000
25%      50.750000   28.750000        41.500000       34.750000
50%     100.500000   36.000000  

In [None]:
import os

# List the entries of the current working directory, e.g. to confirm that
# the input and cleaned CSV files are present.
os.listdir(".")

['.config', 'Mall_Customers.csv', 'mall_customers_cleaned.csv', 'sample_data']

In [None]:
# Preview the first five rows of the current frame.
df.head(n=5)

Unnamed: 0,customer_id,gender,age,annual_income_k,spending_score
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [None]:
# Dimensions of the current frame as a (rows, columns) tuple.
df.shape

(200, 5)