In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime

In [9]:
# Load the 'CustomerAddress' sheet of the workbook into a DataFrame.
# header=1 uses the second spreadsheet row (0-based index 1) as the column names.
cust_address = pd.read_excel(
    'KPMG_VI_New_raw_data_update_final.xlsx',
    sheet_name='CustomerAddress',
    header=1,
)
# Peek at one randomly chosen row as a quick sanity check of the load.
cust_address.sample(n=1)

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
905,910,60086 Summit Lane,2116,NSW,Australia,10,,,,,...,,,,,,,,,,


In [10]:
# Table dimensions as a (number of rows, number of columns) tuple
cust_address.shape

(3999, 26)

In [11]:
# Column labels of the table (returned as a pandas Index, not a plain list)
cust_address.columns

Index(['customer_id', 'address', 'postcode', 'state', 'country',
       'property_valuation', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8',
       'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12',
       'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16',
       'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20',
       'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24',
       'Unnamed: 25'],
      dtype='object')

In [13]:
# Remove the placeholder columns that pandas auto-names 'Unnamed: <n>' when a
# header cell is empty.
# - str(column) guards against non-string labels (e.g. numeric header cells),
#   for which a bare `'Unnamed' in column` test raises TypeError.
# - startswith('Unnamed:') only matches the auto-generated prefix, so a real
#   column that merely contains the word "Unnamed" is kept.
unnanmed_columns = [column for column in cust_address.columns if str(column).startswith('Unnamed:')]
# Rebind instead of using inplace=True so the cell stays safe to re-run and
# never mutates a frame that an earlier cell already displayed.
cust_address = cust_address.drop(columns=unnanmed_columns)
cust_address.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         3999 non-null   int64 
 1   address             3999 non-null   object
 2   postcode            3999 non-null   int64 
 3   state               3999 non-null   object
 4   country             3999 non-null   object
 5   property_valuation  3999 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 187.6+ KB


### Null values

In [14]:
# Count the missing entries in every column (summing the boolean mask down the rows)
cust_address.isna().sum(axis=0)

customer_id           0
address               0
postcode              0
state                 0
country               0
property_valuation    0
dtype: int64

*No null values are present in any column.*

### Numerical Features

In [15]:
# List the numeric columns.
# Selecting on the 'number' dtype family (ints/floats) instead of "anything that
# is not object" keeps datetime, category, boolean and dedicated string dtypes
# from being misreported as numerical.
numerical_features = cust_address.select_dtypes(include='number').columns.tolist()
numerical_features

['customer_id', 'postcode', 'property_valuation']

In [16]:
# Check for invalid postcodes: show a random sample, then every row whose
# postcode is not exactly four digits.
# A length-only test would accept any 4-character value (e.g. -123 or 'abcd'),
# so the whole string is matched against the pattern \d{4} instead.
print(cust_address.postcode.sample(5), '\n')
print(cust_address[~cust_address.postcode.astype(str).str.fullmatch(r'\d{4}')])

3234    3793
1760    4567
1595    4220
1897    3750
384     2047
Name: postcode, dtype: int64 

Empty DataFrame
Columns: [customer_id, address, postcode, state, country, property_valuation]
Index: []


*No invalid postcodes were found in the dataset.*

### Categorical Features

In [17]:
# List the categorical (text-like) columns.
# Besides plain object columns this also recognises dedicated string dtypes and
# the pandas 'category' dtype; comparing the dtype to 'O' alone silently returns
# an empty list when text columns are not stored as object.
categorical_features = [
    feature
    for feature, dtype in cust_address.dtypes.items()
    if pd.api.types.is_object_dtype(dtype)
    or pd.api.types.is_string_dtype(dtype)
    or isinstance(dtype, pd.CategoricalDtype)
]
categorical_features

['address', 'state', 'country']

In [18]:
# Distinct values that appear in the 'state' column
cust_address['state'].unique()

array(['New South Wales', 'QLD', 'VIC', 'NSW', 'Victoria'], dtype=object)

In [19]:
# Normalise the spelled-out state names to their abbreviations
# ('New South Wales' -> 'NSW', 'Victoria' -> 'VIC') in a single pass.
cust_address.state = cust_address.state.replace({
    'New South Wales': 'NSW',
    'Victoria': 'VIC',
})

In [20]:
# Confirm that only the abbreviated state codes remain after the replacement
cust_address['state'].unique()

array(['NSW', 'QLD', 'VIC'], dtype=object)

## Exploratory Data Analysis

In [21]:
# Preview the first five rows of the cleaned table (five is head()'s default)
cust_address.head()

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation
0,1,060 Morning Avenue,2016,NSW,Australia,10
1,2,6 Meadow Vale Court,2153,NSW,Australia,10
2,4,0 Holy Cross Court,4211,QLD,Australia,9
3,5,17979 Del Mar Point,2448,NSW,Australia,4
4,6,9 Oakridge Court,3216,VIC,Australia,9


In [23]:
# Save the cleaned table as an Excel workbook, without the DataFrame index.
# The file is written relative to the notebook's working directory (the same
# place the input workbook is read from) rather than to an absolute path inside
# one user's profile, so the notebook also runs on other machines.
cust_address.to_excel('Customer Address.xlsx', index=False)