In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime

In [2]:
# Load the 'CustomerDemographic' sheet of the raw workbook.
# header=1 tells pandas to take the column names from the second row of the sheet.
cust_dem = pd.read_excel(
    io='KPMG_VI_New_raw_data_update_final.xlsx',
    sheet_name='CustomerDemographic',
    header=1,
)
# Peek at one randomly chosen row (the row shown changes between runs).
cust_dem.sample(n=1)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
441,442,Linc,Vedyasov,Male,2,2001-10-06,,Financial Services,Mass Customer,N,...,,,,,,,,,,


In [3]:
# getting table shape as a (rows, columns) tuple, before any column has been dropped
cust_dem.shape

(4000, 26)

In [4]:
# getting list of all the columns, to spot the placeholder 'Unnamed: N' columns
# that came in with the sheet
cust_dem.columns

Index(['customer_id', 'first_name', 'last_name', 'gender',
       'past_3_years_bike_related_purchases', 'DOB', 'job_title',
       'job_industry_category', 'wealth_segment', 'deceased_indicator',
       'default', 'owns_car', 'tenure', 'Unnamed: 13', 'Unnamed: 14',
       'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18',
       'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22',
       'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25'],
      dtype='object')

In [5]:
# Remove every column whose label contains 'Unnamed', then summarise what is left.
unnanmed_columns = cust_dem.filter(like='Unnamed').columns.tolist()
cust_dem.drop(unnanmed_columns, axis=1, inplace=True)
cust_dem.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   customer_id                          4000 non-null   int64         
 1   first_name                           4000 non-null   object        
 2   last_name                            3875 non-null   object        
 3   gender                               4000 non-null   object        
 4   past_3_years_bike_related_purchases  4000 non-null   int64         
 5   DOB                                  3913 non-null   datetime64[ns]
 6   job_title                            3494 non-null   object        
 7   job_industry_category                3344 non-null   object        
 8   wealth_segment                       4000 non-null   object        
 9   deceased_indicator                   4000 non-null   object        
 10  default     

In [6]:
# The 'default' column is treated as carrying no usable information, so remove it.
cust_dem.drop('default', axis=1, inplace=True)

## Data Preprocessing

### Null Values and Incorrect values

In [7]:
# Report, for every column that has at least one null, the percentage of rows affected.
total_rows = len(cust_dem)
missing_counts = cust_dem.isna().sum()  # nulls per column, computed once
for column in cust_dem.columns:
    n_missing = missing_counts[column]
    if n_missing > 0:
        print({column: n_missing * 100 / total_rows})

{'last_name': 3.125}
{'DOB': 2.175}
{'job_title': 12.65}
{'job_industry_category': 16.4}
{'tenure': 2.175}


*A high proportion of null values is present in this table, especially in 'job_title' (12.65%) and 'job_industry_category' (16.4%).*

In [8]:
# The analysis works with age rather than date of birth.
# Age is the difference between the current calendar year and the birth year
# (month and day are not taken into account); rows without a DOB get NaN.
dob_parsed = pd.to_datetime(cust_dem['DOB'])
cust_dem['DOB'] = dob_parsed
reference_year = datetime.datetime.today().year
# position 6 places the new column right after DOB
cust_dem.insert(6, 'age', reference_year - dob_parsed.dt.year)

In [9]:
# DOB is superseded by the derived 'age' column, so remove it and preview five random rows.
cust_dem.drop('DOB', axis=1, inplace=True)
cust_dem.sample(n=5)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,age,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure
3852,3853,Kerri,Marrington,Female,91,47.0,Accounting Assistant IV,,Mass Customer,N,Yes,19.0
3406,3407,Marven,Ditts,Male,53,61.0,Cost Accountant,Financial Services,High Net Worth,N,Yes,19.0
712,713,Adena,Northedge,Female,34,28.0,Help Desk Technician,,Mass Customer,N,Yes,4.0
2964,2965,Lynna,Greenrod,Female,24,50.0,,,High Net Worth,N,No,8.0
2301,2302,Loraine,Beamish,Female,43,23.0,Mechanical Systems Engineer,Retail,Mass Customer,N,Yes,4.0


In [10]:
# Checking for unrealistic values: list ages from largest to smallest
# (sort_values places missing ages at the end by default).
cust_dem['age'].sort_values(ascending=False)

33      179.0
719      91.0
1091     87.0
3409     82.0
2412     79.0
        ...  
3778      NaN
3882      NaN
3930      NaN
3934      NaN
3997      NaN
Name: age, Length: 4000, dtype: float64

In [11]:
# Handling the 'age' column: implausible and missing ages are both imputed with the median.
#
# Age is derived from the current year, so a hard-coded outlier value (179) only
# matches in the year the notebook was first run; a plausibility threshold keeps
# the cell correct whenever it is re-executed.
MAX_PLAUSIBLE_AGE = 120

# Keep ages up to the threshold; anything larger becomes NaN (existing NaN stay NaN
# because the comparison is False for them).
age_valid = cust_dem['age'].where(cust_dem['age'] <= MAX_PLAUSIBLE_AGE)

# Assign the result back instead of calling fillna(..., inplace=True) on
# `cust_dem.age`: an in-place call on a selected column is not guaranteed to write
# through to the frame (it does not under pandas Copy-on-Write).
# The median is taken over plausible ages only, so the outlier cannot skew it.
cust_dem['age'] = age_valid.fillna(age_valid.median())

In [12]:
# Sanity check after cleaning: the largest remaining age and the number of ages still missing.
age_max = cust_dem['age'].max()
age_null_count = cust_dem['age'].isna().sum()
print('Largest value for age: {}'.format(age_max))
print('Number of Null values for age column: {}'.format(age_null_count))

Largest value for age: 91.0
Number of Null values for age column: 0


In [13]:
# Handling Null values
#
# A single frame-level fillna with a per-column mapping replaces the separate
# `cust_dem.<column>.fillna(..., inplace=True)` calls: an in-place fill on a
# selected column is chained assignment, which is not guaranteed to modify
# `cust_dem` itself (it does not under pandas Copy-on-Write).
null_replacements = {
    # No need to add anything to Last Name if a customer doesn't have one,
    # so a single blank is used as the placeholder
    'last_name': ' ',
    # unknown job information is kept as its own explicit category
    'job_title': 'Missing',
    'job_industry_category': 'Missing',
    # missing tenure is recorded as 0
    'tenure': 0,
}
cust_dem = cust_dem.fillna(value=null_replacements)

In [14]:
# Confirm the imputation: count the remaining nulls in each column (all should be 0).
cust_dem.isnull().sum()

customer_id                            0
first_name                             0
last_name                              0
gender                                 0
past_3_years_bike_related_purchases    0
age                                    0
job_title                              0
job_industry_category                  0
wealth_segment                         0
deceased_indicator                     0
owns_car                               0
tenure                                 0
dtype: int64

In [15]:
# Preview five random rows of the table after null handling.
cust_dem.sample(n=5)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,age,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure
2732,2733,Jordanna,Skyme,Female,89,37.0,Analog Circuit Design manager,Manufacturing,Mass Customer,N,Yes,17.0
3213,3214,Cindelyn,Balas,Female,2,29.0,Software Engineer III,Financial Services,Mass Customer,N,Yes,9.0
2997,2998,Quintana,Housley,Female,34,22.0,Nurse Practicioner,Manufacturing,High Net Worth,N,Yes,2.0
3210,3211,Betsy,Cosh,Female,52,29.0,Automation Specialist II,Health,High Net Worth,N,Yes,7.0
3508,3509,Janine,Hesey,Female,85,44.0,Pharmacist,Health,Mass Customer,N,Yes,15.0


### Categorical Features

In [16]:
# Listing all the categorical features: object-dtype columns, minus the
# name columns (and 'default'), which are not treated as categories.
non_categorical_text = ['first_name', 'last_name', 'default']
object_columns = [name for name in cust_dem.columns if cust_dem[name].dtype == 'O']
categorical_features = [name for name in object_columns if name not in non_categorical_text]
categorical_features

['gender',
 'job_title',
 'job_industry_category',
 'wealth_segment',
 'deceased_indicator',
 'owns_car']

In [17]:
# Print each categorical feature's name followed by its distinct values,
# to spot inconsistent spellings of the same category.
for feature in categorical_features:
    distinct_values = cust_dem[feature].unique()
    print(feature)
    print(distinct_values, '\n')

gender
['F' 'Male' 'Female' 'U' 'Femal' 'M'] 

job_title
['Executive Secretary' 'Administrative Officer' 'Recruiting Manager'
 'Missing' 'Senior Editor' 'Media Manager I'
 'Business Systems Development Analyst' 'Senior Quality Engineer'
 'Nuclear Power Engineer' 'Developer I' 'Account Executive'
 'Junior Executive' 'Media Manager IV' 'Sales Associate' 'Professor'
 'Geological Engineer' 'Project Manager' 'Safety Technician I'
 'Research Assistant I' 'Accounting Assistant III' 'Editor'
 'Research Nurse' 'Safety Technician III' 'Staff Accountant III'
 'Legal Assistant' 'Product Engineer' 'Information Systems Manager'
 'VP Quality Control' 'Social Worker' 'Senior Cost Accountant'
 'Assistant Media Planner' 'Payment Adjustment Coordinator' 'Food Chemist'
 'Accountant III' 'Director of Sales' 'Senior Financial Analyst'
 'Registered Nurse' 'Biostatistician II' 'Computer Systems Analyst II'
 'Software Test Engineer II' 'Paralegal' 'VP Sales'
 'Chief Design Engineer' 'Office Assistant III'
 'Ph

*Gender categories can be consolidated from {'F', 'Male', 'Female', 'U', 'Femal', 'M'} to {'Female', 'Male', 'U'}.*

In [18]:
# Cleaning the gender column: collapse the spelling variants onto
# 'Female' / 'Male' / 'U'.
gender_map = {'F': 'Female', 'Femal': 'Female', 'Female': 'Female', 'Male': 'Male', 'M': 'Male', 'U': 'U'}
# Series.map returns NaN for any label that is not a key of the dict; falling back
# to the original value keeps an unexpected label visible instead of silently
# turning it into a null.
cust_dem['gender'] = cust_dem['gender'].map(gender_map).fillna(cust_dem['gender'])

In [19]:
# Verify the cleanup: only the canonical gender labels should remain.
cust_dem['gender'].unique()

array(['Female', 'Male', 'U'], dtype=object)

## Exploratory Data Analysis

In [20]:
# First five rows of the cleaned table.
cust_dem.head(n=5)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,age,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure
0,1,Laraine,Medendorp,Female,93,69.0,Executive Secretary,Health,Mass Customer,N,Yes,11.0
1,2,Eli,Bockman,Male,81,42.0,Administrative Officer,Financial Services,Mass Customer,N,Yes,16.0
2,3,Arlin,Dearle,Male,61,68.0,Recruiting Manager,Property,Mass Customer,N,Yes,15.0
3,4,Talbot,,Male,33,61.0,Missing,IT,Mass Customer,N,No,7.0
4,5,Sheila-kathryn,Calton,Female,56,45.0,Senior Editor,Missing,Affluent Customer,N,Yes,8.0


In [21]:
# Saving the cleaned table.
# A path relative to the working directory keeps the notebook runnable on other
# machines (the previous target was an absolute path inside one user's Downloads
# folder). index=False stops the RangeIndex from being written as an extra unnamed
# column; customer_id already identifies each row.
cust_dem.to_excel('Customer Demographic.xlsx', index=False)

In [22]:
# Final look at the first five rows of the table that was saved.
cust_dem.head(5)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,age,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure
0,1,Laraine,Medendorp,Female,93,69.0,Executive Secretary,Health,Mass Customer,N,Yes,11.0
1,2,Eli,Bockman,Male,81,42.0,Administrative Officer,Financial Services,Mass Customer,N,Yes,16.0
2,3,Arlin,Dearle,Male,61,68.0,Recruiting Manager,Property,Mass Customer,N,Yes,15.0
3,4,Talbot,,Male,33,61.0,Missing,IT,Mass Customer,N,No,7.0
4,5,Sheila-kathryn,Calton,Female,56,45.0,Senior Editor,Missing,Affluent Customer,N,Yes,8.0
