In [1]:
# To access python data science libraries and visualise data and render plots in the Jupyter Notebook
import math
import random
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw workbook sheet into a pandas DataFrame.
# The context manager closes the workbook handle as soon as the sheet is read
# (the bare pd.ExcelFile(...) call left the file open for the kernel's lifetime).
# sheet_name=4 is a zero-based position (the fifth sheet); header=1 takes the
# column names from the second row of that sheet.
with pd.ExcelFile('KPMG_VI_New_raw_data_update_final_formatted.xlsx') as xls:
    pd4 = pd.read_excel(xls, sheet_name=4, header=1)

In [3]:
# pd.read_excel already returned a DataFrame, so wrapping it in pd.DataFrame()
# adds nothing and - depending on the pandas version - may share data with pd4.
# Work on an explicit copy so the in-place cleaning below cannot alter the raw frame.
df4 = pd4.copy()

In [4]:
# Summarise the frame: column names, non-null counts per column and dtypes
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   first_name                           1000 non-null   object        
 1   last_name                            971 non-null    object        
 2   gender                               1000 non-null   object        
 3   past_3_years_bike_related_purchases  1000 non-null   int64         
 4   DOB                                  983 non-null    datetime64[ns]
 5   job_title                            894 non-null    object        
 6   job_industry_category                835 non-null    object        
 7   wealth_segment                       1000 non-null   object        
 8   deceased_indicator                   1000 non-null   object        
 9   owns_car                             1000 non-null   object        
 10  tenure       

In [5]:
# Preview the top five rows of the frame
df4.head(n=5)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,...,state,country,property_valuation,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,...,QLD,Australia,6,0.75,0.9375,1.171875,0.996094,1,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,...,NSW,Australia,11,0.82,0.82,1.025,0.87125,1,1,1.71875
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,...,VIC,Australia,5,0.43,0.43,0.43,0.43,1,1,1.71875
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,...,QLD,Australia,1,0.48,0.6,0.6,0.6,4,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,...,NSW,Australia,9,0.71,0.71,0.8875,0.8875,4,4,1.703125


In [6]:
# Preview the bottom five rows of the frame
df4.tail(n=5)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,...,state,country,property_valuation,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Rank,Value
995,Ferdinand,Romanetti,Male,60,1959-10-07,Paralegal,Financial Services,Affluent Customer,N,No,...,NSW,Australia,7,0.46,0.46,0.46,0.46,996,996,0.374
996,Burk,Wortley,Male,22,2001-10-17,Senior Sales Associate,Health,Mass Customer,N,No,...,NSW,Australia,10,0.86,0.86,1.075,0.91375,997,997,0.357
997,Melloney,Temby,Female,17,1954-10-05,Budget/Accounting Analyst IV,Financial Services,Affluent Customer,N,Yes,...,QLD,Australia,2,0.98,1.225,1.225,1.225,997,997,0.357
998,Dickie,Cubbini,Male,30,1952-12-17,Financial Advisor,Financial Services,Mass Customer,N,Yes,...,QLD,Australia,2,1.05,1.3125,1.3125,1.115625,997,997,0.357
999,Sylas,Duffill,Male,56,1955-10-02,Staff Accountant IV,Property,Mass Customer,N,Yes,...,NSW,Australia,9,0.96,1.2,1.5,1.275,1000,1000,0.34


In [7]:
# Count the missing entries in every column
df4.isna().sum()

first_name                               0
last_name                               29
gender                                   0
past_3_years_bike_related_purchases      0
DOB                                     17
job_title                              106
job_industry_category                  165
wealth_segment                           0
deceased_indicator                       0
owns_car                                 0
tenure                                   0
address                                  0
postcode                                 0
state                                    0
country                                  0
property_valuation                       0
Unnamed: 16                              0
Unnamed: 17                              0
Unnamed: 18                              0
Unnamed: 19                              0
Unnamed: 20                              0
Rank                                     0
Value                                    0
dtype: int64

In [8]:
# Remove the five header-less helper columns ('Unnamed: 16' to 'Unnamed: 20'),
# deleting each one from the frame in place
for unnamed_label in (
    'Unnamed: 16',
    'Unnamed: 17',
    'Unnamed: 18',
    'Unnamed: 19',
    'Unnamed: 20',
):
    del df4[unnamed_label]

In [9]:
# Impute missing last names in two passes: back-fill inside each group of rows
# sharing a first name, then forward-fill down the whole column for anything
# that is still empty.
# NOTE(review): nothing is sorted here, and the final forward-fill copies the
# surname of the preceding row regardless of first name - confirm this is intended.
df4['last_name'] = (
    df4.groupby('first_name')['last_name']
    .bfill()
    .ffill()
)

In [10]:
# Tally the gender labels to spot unexpected codes
df4.loc[:, 'gender'].value_counts()

Female    513
Male      470
U          17
Name: gender, dtype: int64

In [11]:
# Tally the state labels to confirm they use one consistent spelling
df4.loc[:, 'state'].value_counts()

NSW    506
VIC    266
QLD    228
Name: state, dtype: int64

In [12]:
# Tally the country labels to confirm they use one consistent spelling
df4.loc[:, 'country'].value_counts()

Australia    1000
Name: country, dtype: int64

In [13]:
# Count the distinct addresses (compare against the number of rows to spot duplicates)
df4.loc[:, 'address'].nunique()

1000

In [14]:
# Keep only the calendar date of each birthday; the column then holds python
# date objects (object dtype), as the output below shows.
# pd.to_datetime() is a no-op on the freshly loaded datetime64 column, but it
# makes the cell safe to re-run: once the column holds date objects, the bare
# .dt accessor raises AttributeError.
df4['DOB'] = pd.to_datetime(df4['DOB']).dt.date
df4['DOB']

0      1957-07-12
1      1970-03-22
2      1974-08-28
3      1979-01-28
4      1965-09-21
          ...    
995    1959-10-07
996    2001-10-17
997    1954-10-05
998    1952-12-17
999    1955-10-02
Name: DOB, Length: 1000, dtype: object

In [15]:
# Fill each missing date of birth with a date drawn at random from the observed ones.
# fillna() never calls a callable it is given: the previous fillna(lambda ...) wrote
# the function object itself into every empty cell, which then ended up in the CSV.
# Draw one observed date per missing row and assign through .loc so the frame itself
# is updated (no chained inplace on a column selection).
dob_missing = df4['DOB'].isna()
dob_known = df4.loc[~dob_missing, 'DOB'].to_numpy()
# Skip when there is nothing to fill, or nothing to sample from.
if dob_missing.any() and len(dob_known) > 0:
    # Fixed seed so Restart & Run All reproduces the same imputed dates.
    dob_rng = np.random.default_rng(42)
    df4.loc[dob_missing, 'DOB'] = dob_rng.choice(dob_known, size=int(dob_missing.sum()))

In [16]:
# Frequency table of the job titles
df4.loc[:, 'job_title'].value_counts()

Associate Professor             15
Environmental Tech              14
Software Consultant             14
Chief Design Engineer           13
Assistant Media Planner         12
                                ..
Safety Technician II             1
Administrative Assistant I       1
Human Resources Assistant IV     1
Computer Systems Analyst III     1
Statistician III                 1
Name: job_title, Length: 184, dtype: int64

In [17]:
# Fill each missing job title with the next non-null title further down the column.
# Series.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in recent pandas; it also matches the bfill()/ffill() calls used for
# last_name above.
# NOTE(review): a missing title in the final row(s) has no later value to copy
# and would stay null - confirm none remain.
df4['job_title'] = df4['job_title'].bfill()

In [18]:
# Frequency table of the job industry categories
# NOTE(review): the counts list 'Argiculture', which looks like a misspelling of
# 'Agriculture' in the source data - confirm before correcting it.
df4.loc[:, 'job_industry_category'].value_counts()

Financial Services    203
Manufacturing         199
Health                152
Retail                 78
Property               64
IT                     51
Entertainment          37
Argiculture            26
Telecommunications     25
Name: job_industry_category, dtype: int64

In [19]:
# Fill each missing industry with the closest non-null value above it in the column.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas; it also matches the bfill()/ffill() calls used for
# last_name above.
# NOTE(review): a missing industry in the first row(s) has no earlier value to
# copy and would stay null - confirm none remain.
df4['job_industry_category'] = df4['job_industry_category'].ffill()

In [20]:
# Normalise the column labels in one pass: lower-case each label, then trim
# any surrounding whitespace
df4.columns = [label.lower().strip() for label in df4.columns]

In [21]:
# Limit the 'value' column to three decimal places, then display the frame
df4['value'] = df4['value'].round(decimals=3)
df4

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,dob,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,rank,value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,1,1.719
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11,1,1.719
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,Australia,5,1,1.719
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,Australia,1,4,1.703
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,Australia,9,4,1.703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Ferdinand,Romanetti,Male,60,1959-10-07,Paralegal,Financial Services,Affluent Customer,N,No,9,2 Sloan Way,2200,NSW,Australia,7,996,0.374
996,Burk,Wortley,Male,22,2001-10-17,Senior Sales Associate,Health,Mass Customer,N,No,6,04 Union Crossing,2196,NSW,Australia,10,997,0.357
997,Melloney,Temby,Female,17,1954-10-05,Budget/Accounting Analyst IV,Financial Services,Affluent Customer,N,Yes,15,33475 Fair Oaks Junction,4702,QLD,Australia,2,997,0.357
998,Dickie,Cubbini,Male,30,1952-12-17,Financial Advisor,Financial Services,Mass Customer,N,Yes,19,57666 Victoria Way,4215,QLD,Australia,2,997,0.357


In [22]:
# Re-check column names, non-null counts and dtypes after the cleaning steps
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   first_name                           1000 non-null   object 
 1   last_name                            1000 non-null   object 
 2   gender                               1000 non-null   object 
 3   past_3_years_bike_related_purchases  1000 non-null   int64  
 4   dob                                  1000 non-null   object 
 5   job_title                            1000 non-null   object 
 6   job_industry_category                1000 non-null   object 
 7   wealth_segment                       1000 non-null   object 
 8   deceased_indicator                   1000 non-null   object 
 9   owns_car                             1000 non-null   object 
 10  tenure                               1000 non-null   int64  
 11  address                        

In [23]:
# Persist the cleaned frame as a CSV file; the row index is left out of the file
df4.to_csv(path_or_buf='./data4_cleaned.csv', index=False)