# TASK: 1 - Data Quality Assessment
### Assessment of data quality and completeness in preparation for analysis.
(**New Customer List data**)

In [2]:
# importing required libraries

import pandas as pd
import numpy as np
import datetime as dt

### Reading Data

In [3]:
import os
import warnings

# Silence every warning for the rest of the session so cell outputs stay uncluttered.
warnings.filterwarnings("ignore")

# The workbook path is resolved against the kernel's current working directory.
file_name = "KPMG_VI_New_raw_data_update_final.xlsx"
current_directory = os.getcwd()
file_path = os.path.join(current_directory, file_name)

# Only the "NewCustomerList" sheet of the workbook is read.
new_customer_list = pd.read_excel(file_path, sheet_name="NewCustomerList")

### Exploring New customer list

In [4]:
# Preview the first five rows (five is the default for head()).
new_customer_list.head()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,...,state,country,property_valuation,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,...,QLD,Australia,6,0.73,0.9125,1.140625,0.969531,1,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,...,NSW,Australia,11,1.01,1.01,1.2625,1.073125,1,1,1.71875
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,...,VIC,Australia,5,0.43,0.43,0.43,0.43,1,1,1.71875
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,...,QLD,Australia,1,0.77,0.9625,0.9625,0.9625,4,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,...,NSW,Australia,9,0.63,0.63,0.7875,0.7875,4,4,1.703125


#### Checking the Dimensions

In [9]:
# Dimensions of the frame as a (rows, columns) tuple.
print(new_customer_list.shape)

# Per-column non-null counts and dtypes.
new_customer_list.info()

(1000, 23)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   first_name                           1000 non-null   object        
 1   last_name                            971 non-null    object        
 2   gender                               1000 non-null   object        
 3   past_3_years_bike_related_purchases  1000 non-null   int64         
 4   DOB                                  983 non-null    datetime64[ns]
 5   job_title                            894 non-null    object        
 6   job_industry_category                835 non-null    object        
 7   wealth_segment                       1000 non-null   object        
 8   deceased_indicator                   1000 non-null   object        
 9   owns_car                             1000 non-null   object        
 10  te

#### Initially, the data set has 1000 records (rows) and 23 columns.
*We can use the new customer list to support data-driven decisions.*

#### Checking and eliminating the null values

In [10]:
# Number of missing values in each column.
new_customer_list.isna().sum()

first_name                               0
last_name                               29
gender                                   0
past_3_years_bike_related_purchases      0
DOB                                     17
job_title                              106
job_industry_category                  165
wealth_segment                           0
deceased_indicator                       0
owns_car                                 0
tenure                                   0
address                                  0
postcode                                 0
state                                    0
country                                  0
property_valuation                       0
Unnamed: 16                              0
Unnamed: 17                              0
Unnamed: 18                              0
Unnamed: 19                              0
Unnamed: 20                              0
Rank                                     0
Value                                    0
dtype: int64

#### Four columns (`last_name`, `DOB`, `job_title`, `job_industry_category`) contain null values. We can drop or replace them, depending on the nature of the analysis.

#### We can convert all the records to a single letter case
*This maintains consistency across the records.*

#### Checking for duplicate values

In [16]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
new_customer_list.duplicated().sum()

0

#### There are no duplicate records in the data set.

### Exploring Columns

#### Checking Outliers

In [22]:
def check_outliers(dataframe, columns, whisker=1.5):
    """Return the rows of ``dataframe`` flagged as outliers by the IQR rule.

    A row is flagged when its value in ``columns`` lies strictly below
    ``Q1 - whisker * IQR`` or strictly above ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of that column and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect; it is not modified.
    columns : label
        Label of the column to test. The notebook passes one numeric
        column name per call.
    whisker : float, default 1.5
        Multiplier applied to the IQR to place the lower and upper bands.

    Returns
    -------
    pandas.DataFrame
        The subset of rows (all columns kept) whose value falls outside
        the bands. Missing values are never flagged because comparisons
        with NaN evaluate to False.
    """
    values = dataframe[columns]

    # First and third quartiles of the inspected column.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Band limits beyond which a value counts as an outlier.
    lower_band = q1 - (whisker * iqr)
    upper_band = q3 + (whisker * iqr)

    return dataframe[(values < lower_band) | (values > upper_band)]



# Report, for every int64/float64 column, how many rows the IQR check flags.
for columns in new_customer_list:
    column_dtype = new_customer_list[columns].dtype
    is_numeric = column_dtype == "int64" or column_dtype == "float64"
    if not is_numeric:
        continue
    outlier_count = check_outliers(new_customer_list, columns)[columns].count()
    print(f"The outliers in {columns} is", outlier_count)

The outliers in past_3_years_bike_related_purchases is 0
The outliers in tenure is 0
The outliers in postcode is 0
The outliers in property_valuation is 30
The outliers in Unnamed: 16 is 0
The outliers in Unnamed: 17 is 0
The outliers in Unnamed: 18 is 0
The outliers in Unnamed: 19 is 2
The outliers in Unnamed: 20 is 0
The outliers in Rank is 0
The outliers in Value is 3


#### The outliers can be treated as required by the nature of the analysis

In [31]:
# Distinct values in `gender`, to spot inconsistent or placeholder labels.
new_customer_list["gender"].unique()

array(['Male', 'Female', 'U'], dtype=object)

In [32]:
# Distinct values in `wealth_segment`.
new_customer_list["wealth_segment"].unique()

array(['Mass Customer', 'Affluent Customer', 'High Net Worth'],
      dtype=object)

In [33]:
# Distinct values in `deceased_indicator`.
new_customer_list["deceased_indicator"].unique()

array(['N'], dtype=object)

In [34]:
# Distinct values in `owns_car`.
new_customer_list["owns_car"].unique()

array(['Yes', 'No'], dtype=object)

In [35]:
# Distinct values in `state`, to check that one spelling is used per state.
new_customer_list["state"].unique()

array(['QLD', 'NSW', 'VIC'], dtype=object)

In [36]:
# Distinct values in `country`.
new_customer_list["country"].unique()

array(['Australia'], dtype=object)

In [40]:
# Derive each customer's age in completed years as of today.
# `dt` and `pd` come from the notebook's import cell.
# Casting a timedelta Series to '<m8[Y]' is rejected by pandas >= 2.0 and, on
# older versions, only approximates age with a mean year length, so the age is
# computed from calendar fields instead.
current_date = pd.to_datetime(dt.date.today())
new_customer_list["DOB"] = pd.to_datetime(new_customer_list["DOB"])

dob = new_customer_list["DOB"]

# True when this year's birthday is still ahead, i.e. one fewer completed year.
# Comparisons against a missing DOB are False, and the missing year then
# propagates NaN into the age.
birthday_pending = (dob.dt.month > current_date.month) | (
    (dob.dt.month == current_date.month) & (dob.dt.day > current_date.day)
)

age_in_years = current_date.year - dob.dt.year - birthday_pending.astype("int64")

# Stored as float so rows without a DOB can hold NaN.
new_customer_list["age"] = age_in_years.astype("float64")

In [44]:
# Show the date of birth next to the derived age column for a visual sanity check.
new_customer_list[["DOB","age"]]

Unnamed: 0,DOB,age
0,1957-07-12,66.0
1,1970-03-22,53.0
2,1974-08-28,49.0
3,1979-01-28,44.0
4,1965-09-21,58.0
...,...,...
995,1959-10-07,64.0
996,2001-10-17,22.0
997,1954-10-05,69.0
998,1952-12-17,70.0


#### We can restrict the age range to 20–60 years, depending on the nature of the analysis and the specific product.

#### There are 5 unnamed (calculated) columns in the data set (`Unnamed: 16` to `Unnamed: 20`); we can drop them for data analysis.