## Import Required Libraries

In [1]:
import pandas as pd

## Load dataset

In [2]:
# Read the raw customer records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')

In [3]:
# Preview the first ten rows to sanity-check the load.
df.head(n=10)

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
5,6,Female,22,17,76
6,7,Female,35,18,6
7,8,Female,23,18,94
8,9,Male,64,19,3
9,10,Female,30,19,72


## Shows number of nulls per column

In [4]:
# Count missing values in each column (isna is the alias of isnull).
print(df.isna().sum())

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64


## Check for Duplicate Rows

In [11]:
# Flag each row that is an exact repeat of an earlier row. The mask is built
# here so this cell does not depend on a later cell having been run first
# (otherwise a fresh-kernel Run All raises NameError on `duplicates`).
duplicates = df.duplicated()
duplicates

0      False
1      False
2      False
3      False
4      False
       ...  
195    False
196    False
197    False
198    False
199    False
Length: 200, dtype: bool

In [12]:
# Boolean mask: True for every row that exactly repeats an earlier row.
duplicates = df.duplicated()

# Select only the flagged rows by reusing the mask, then print them.
duplicate_rows = df[duplicates]
print(duplicate_rows)


Empty DataFrame
Columns: [CustomerID, Gender, Age, Annual Income (k$), Spending Score (1-100)]
Index: []


## 5. Standardize Text Columns

In [7]:
# Normalise Gender labels: strip surrounding whitespace, then capitalize()
# upper-cases the first character and lower-cases the rest.
df['Gender'] = (
    df['Gender']
    .str.strip()
    .str.capitalize()
)

In [8]:
# Preview the first five rows after normalising the Gender column.
df.head(n=5)

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


## Data Types Check

In [10]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

CustomerID                 int64
Gender                    object
Age                        int64
Annual Income (k$)         int64
Spending Score (1-100)     int64
dtype: object

##  Save cleaned dataset

In [14]:
# Write the cleaned frame to a new CSV; index=False leaves the row index out.
df.to_csv(path_or_buf='Mall_Customers_Cleaned.csv', index=False)

In [17]:
# Print a caption followed by the first ten rows of the cleaned frame,
# each on its own line.
print("\nCleaned Data Preview:", df.head(10), sep="\n")


Cleaned Data Preview:
   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40
5           6  Female   22                  17                      76
6           7  Female   35                  18                       6
7           8  Female   23                  18                      94
8           9    Male   64                  19                       3
9          10  Female   30                  19                      72


# Objective:

To clean and preprocess a raw dataset by handling missing values, duplicates, and inconsistent formats, preparing it for analysis or visualization.

### Key Steps Performed:

* Imported Data using pandas
* Handled Missing Values:
  * Used .isnull() to count nulls per column; none were found in this dataset, so .fillna() imputation was not required
* Checked for Duplicate Rows:
  * Used .duplicated(); no duplicate rows were found, so .drop_duplicates() was not required
* Standardized Text Formats:
  * Cleaned inconsistent entries in the Gender column (surrounding whitespace and capitalization)
* Exported Cleaned Dataset:
  * Saved as a cleaned CSV for next steps

### Tools Used:

* Python
* Pandas
* Jupyter Notebook