In [13]:
import pandas as pd

# Load the Customers dataset with a different encoding
customers_df = pd.read_csv('Customers.csv', encoding='ISO-8859-1')

# Display the first few rows of the dataset to inspect
print(customers_df.head())


   CustomerKey  Gender               Name            City State Code  \
0          301  Female      Lilly Harding  WANDEARAH EAST         SA   
1          325  Female       Madison Hull      MOUNT BUDD         WA   
2          554  Female      Claire Ferres       WINJALLOK        VIC   
3          786    Male  Jai Poltpalingada    MIDDLE RIVER         SA   
4         1042    Male    Aidan Pankhurst   TAWONGA SOUTH        VIC   

               State Zip Code    Country  Continent    Birthday  
0    South Australia     5523  Australia  Australia    7/3/1939  
1  Western Australia     6522  Australia  Australia   9/27/1979  
2           Victoria     3380  Australia  Australia   5/26/1947  
3    South Australia     5223  Australia  Australia   9/17/1957  
4           Victoria     3698  Australia  Australia  11/19/1965  


In [14]:
# Check for missing values
missing_values = customers_df.isnull().sum()
print("Missing values in each column:")
print(missing_values)


Missing values in each column:
CustomerKey     0
Gender          0
Name            0
City            0
State Code     10
State           0
Zip Code        0
Country         0
Continent       0
Birthday        0
dtype: int64


In [15]:
# Fill missing values in 'State Code' with 'Unknown'
customers_df['State Code'] = customers_df['State Code'].fillna('Unknown')
# Verify the changes
print(customers_df.head())
print(customers_df.isnull().sum())


   CustomerKey  Gender               Name            City State Code  \
0          301  Female      Lilly Harding  WANDEARAH EAST         SA   
1          325  Female       Madison Hull      MOUNT BUDD         WA   
2          554  Female      Claire Ferres       WINJALLOK        VIC   
3          786    Male  Jai Poltpalingada    MIDDLE RIVER         SA   
4         1042    Male    Aidan Pankhurst   TAWONGA SOUTH        VIC   

               State Zip Code    Country  Continent    Birthday  
0    South Australia     5523  Australia  Australia    7/3/1939  
1  Western Australia     6522  Australia  Australia   9/27/1979  
2           Victoria     3380  Australia  Australia   5/26/1947  
3    South Australia     5223  Australia  Australia   9/17/1957  
4           Victoria     3698  Australia  Australia  11/19/1965  
CustomerKey    0
Gender         0
Name           0
City           0
State Code     0
State          0
Zip Code       0
Country        0
Continent      0
Birthday       0
d

In [16]:
# Convert 'Birthday' to datetime format
customers_df['Birthday'] = pd.to_datetime(customers_df['Birthday'], errors='coerce', format='%m/%d/%Y')

# Display the data types of each column to verify
print(customers_df.dtypes)

CustomerKey             int64
Gender                 object
Name                   object
City                   object
State Code             object
State                  object
Zip Code               object
Country                object
Continent              object
Birthday       datetime64[ns]
dtype: object


In [17]:
# Check for duplicates in the key columns
duplicates = customers_df.duplicated(subset=['CustomerKey'])
print(f"Number of duplicate CustomerKeys: {duplicates.sum()}")

# Check for any missing values in key columns (though we've handled it before)
missing_keys = customers_df[['CustomerKey']].isnull().sum()
print(f"Missing values in CustomerKey: {missing_keys}")

# Display unique values for key columns to ensure consistency
unique_customers = customers_df['CustomerKey'].nunique()
total_customers = len(customers_df)
print(f"Unique CustomerKeys: {unique_customers}")
print(f"Total number of customers: {total_customers}")


Number of duplicate CustomerKeys: 0
Missing values in CustomerKey: CustomerKey    0
dtype: int64
Unique CustomerKeys: 15266
Total number of customers: 15266


In [18]:
# Save the cleaned Customers dataset
customers_df.to_csv('cleaned_Customers.csv', index=False)
