# TASK: 1 - Data Quality Assessment
### Assessment of data quality and completeness in preparation for analysis.
(**Customer Demographic and Customer Address data in the past 3 months**)

### Customer Demographic

In [2]:
# importing required libraries
import pandas as pd
import numpy as np

### Reading data

In [3]:
import os
import warnings

# Silence every warning so library notices do not clutter the cell outputs.
warnings.filterwarnings("ignore")

# The workbook is looked up in the directory the notebook is executed from.
file_name = 'KPMG_VI_New_raw_data_update_final.xlsx'
current_directory = os.getcwd()
file_path = os.path.join(current_directory, file_name)

# Load the "CustomerDemographic" sheet of the workbook into a DataFrame.
Customer_Demographic = pd.read_excel(file_path, sheet_name="CustomerDemographic")

### Exploring customer demographic data

In [4]:
# Preview the first five rows (five is the default for head()).
Customer_Demographic.head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure
0,1,Laraine,Medendorp,F,93,1953-10-12,Executive Secretary,Health,Mass Customer,N,"""'",Yes,11.0
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0
2,3,Arlin,Dearle,Male,61,1954-01-20,Recruiting Manager,Property,Mass Customer,N,2018-02-01 00:00:00,Yes,15.0
3,4,Talbot,,Male,33,1961-10-03,,IT,Mass Customer,N,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,No,7.0
4,5,Sheila-kathryn,Calton,Female,56,1977-05-13,Senior Editor,,Affluent Customer,N,NIL,Yes,8.0


In [5]:
# Report the (rows, columns) shape followed by a blank line, then the
# per-column non-null counts and dtypes.
print("The shape of data->", Customer_Demographic.shape, end="\n\n")
Customer_Demographic.info()

The shape of data-> (4000, 13)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   customer_id                          4000 non-null   int64         
 1   first_name                           4000 non-null   object        
 2   last_name                            3875 non-null   object        
 3   gender                               4000 non-null   object        
 4   past_3_years_bike_related_purchases  4000 non-null   int64         
 5   DOB                                  3913 non-null   datetime64[ns]
 6   job_title                            3494 non-null   object        
 7   job_industry_category                3344 non-null   object        
 8   wealth_segment                       4000 non-null   object        
 9   deceased_indicator                   4000 non-null   

#### Initially there are 4000 rows and 13 columns

In [6]:
# Number of missing values in each column.
Customer_Demographic.isna().sum()

customer_id                              0
first_name                               0
last_name                              125
gender                                   0
past_3_years_bike_related_purchases      0
DOB                                     87
job_title                              506
job_industry_category                  656
wealth_segment                           0
deceased_indicator                       0
default                                302
owns_car                                 0
tenure                                  87
dtype: int64

#### There are null values in 6 columns. We can drop or replace them according to the nature of the analysis or the data trend.

*We prefer to replace null values in numerical columns only; there is no need to replace the null values of categorical and identifier columns.*

#### We can convert all text values to a single case

In [7]:
def change_upper(df):
    """Upper-case every string value in the object-dtype columns of ``df``.

    Only values that are actual ``str`` instances are changed. Anything else
    stored in an object column (numbers, datetimes, ``None``/NaN) is kept
    as-is; the previous ``.str.upper()`` call replaced such values with NaN
    and raised on object columns that held no strings at all.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is modified in place, as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for column in df.columns:
        if df[column].dtype == 'object':
            # Element-wise so that non-string entries survive untouched.
            df[column] = df[column].map(
                lambda value: value.upper() if isinstance(value, str) else value
            )
    return df

#### We can use the change_upper function to convert the text values to upper case
*so that the records are consistent*

#### Checking the duplicate values

In [8]:
# Count rows that are exact duplicates of an earlier row.
Customer_Demographic.duplicated().sum()

0

#### There are no duplicate records present in the data set.
*So all records are unique.*

### Exploring columns

#### Generate a summary of count of all the outliers column wise

In [10]:
def detect_outliers_iqr(dataframe, column_name, multiplier=1.5):
    """Return the rows of ``dataframe`` whose ``column_name`` value is an IQR outlier.

    A value is an outlier when it lies strictly below
    ``Q1 - multiplier * IQR`` or strictly above ``Q3 + multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of the column and
    ``IQR = Q3 - Q1``. Missing values compare as False on both sides and are
    therefore never reported.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to inspect; it is not modified.
    column_name : str
        Name of a numeric column in ``dataframe``.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default keeps the
        previous hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The subset of rows flagged as outliers (empty if there are none).
    """
    values = dataframe[column_name]

    # Quartiles and inter-quartile range
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    # Fences beyond which a value counts as an outlier
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    outliers = dataframe[(values < lower_bound) | (values > upper_bound)]

    return outliers

In [11]:
# Print the IQR-outlier count for every int64/float64 column of the
# demographic data: a label line, then the count on its own line.
for column in Customer_Demographic.columns:
    if Customer_Demographic[column].dtype in ('int64', 'float64'):
        outliers = detect_outliers_iqr(Customer_Demographic, column)
        print(f"the no. of outlier in {column} is:", outliers[column].count(), sep="\n")

the no. of outlier in customer_id is:
0
the no. of outlier in past_3_years_bike_related_purchases is:
0
the no. of outlier in tenure is:
0


#### No outliers were found in the customer demographic data set

#### Checking the categorical columns: if different entries or values indicate the same category, change them to one category

In [42]:
# List the distinct values stored in the gender column.
Customer_Demographic["gender"].unique()

array(['F', 'Male', 'Female', 'U', 'Femal', 'M'], dtype=object)

#### The gender column should contain only three categories: Male (M), Female (F) and Unspecified (U). So we can map 'F', 'Female' and 'Femal' to F, map 'Male' and 'M' to M, and keep U as it is.

In [43]:
# List the distinct job titles (including NaN) as a plain Python list.
Customer_Demographic["job_title"].unique().tolist()

['Executive Secretary',
 'Administrative Officer',
 'Recruiting Manager',
 nan,
 'Senior Editor',
 'Media Manager I',
 'Business Systems Development Analyst',
 'Senior Quality Engineer',
 'Nuclear Power Engineer',
 'Developer I',
 'Account Executive',
 'Junior Executive',
 'Media Manager IV',
 'Sales Associate',
 'Professor',
 'Geological Engineer',
 'Project Manager',
 'Safety Technician I',
 'Research Assistant I',
 'Accounting Assistant III',
 'Editor',
 'Research Nurse',
 'Safety Technician III',
 'Staff Accountant III',
 'Legal Assistant',
 'Product Engineer',
 'Information Systems Manager',
 'VP Quality Control',
 'Social Worker',
 'Senior Cost Accountant',
 'Assistant Media Planner',
 'Payment Adjustment Coordinator',
 'Food Chemist',
 'Accountant III',
 'Director of Sales',
 'Senior Financial Analyst',
 'Registered Nurse',
 'Biostatistician II',
 'Computer Systems Analyst II',
 'Software Test Engineer II',
 'Paralegal',
 'VP Sales',
 'Chief Design Engineer',
 'Office Assistant 

In [44]:
# List the distinct values stored in the job_industry_category column.
Customer_Demographic["job_industry_category"].unique()

array(['Health', 'Financial Services', 'Property', 'IT', nan, 'Retail',
       'Argiculture', 'Manufacturing', 'Telecommunications',
       'Entertainment'], dtype=object)

In [45]:
# List the distinct values stored in the wealth_segment column.
Customer_Demographic["wealth_segment"].unique()

array(['Mass Customer', 'Affluent Customer', 'High Net Worth'],
      dtype=object)

In [46]:
# List the distinct values stored in the deceased_indicator column.
Customer_Demographic["deceased_indicator"].unique()

array(['N', 'Y'], dtype=object)

#### We can drop the records where deceased indicator denoted as 'Y'

In [47]:
# List the distinct values stored in the owns_car column.
Customer_Demographic["owns_car"].unique()

array(['Yes', 'No'], dtype=object)

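#### Apart from gender, the only inconsistency in the categorical columns is the misspelled `job_industry_category` value 'Argiculture' (it should be 'Agriculture').
*So we only have to standardise the gender column and correct that one spelling.*

*Some of these columns also contain null values, as we saw earlier.*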

#### From the DOB column we can calculate the age of each customer, so that we can filter out customers who are irrelevant to the product or to the nature of the analysis

In [50]:
import datetime as dt

# Reference date for the age calculation: today, as a pandas Timestamp.
# NOTE: this makes the computed ages depend on the day the notebook is run.
current_date = pd.to_datetime(dt.date.today())

# Make sure DOB is a datetime column (missing dates stay NaT).
Customer_Demographic["DOB"] = pd.to_datetime(Customer_Demographic["DOB"])

# Age in completed calendar years: the year difference, minus one when this
# year's birthday has not happened yet. This replaces the timedelta cast
# `.astype('<m8[Y]')`, which pandas 2.x rejects (only s/ms/us/ns resolutions
# are supported) and which only approximated age with a mean year length.
dob = Customer_Demographic["DOB"]
birthday_pending = (dob.dt.month > current_date.month) | (
    (dob.dt.month == current_date.month) & (dob.dt.day > current_date.day)
)
# Rows with a missing DOB yield NaN; the column is kept as float64 as before.
Customer_Demographic["age"] = (
    current_date.year - dob.dt.year - birthday_pending.astype(int)
).astype("float64")

In [54]:
# Youngest and oldest computed age, kept as a (min, max) tuple.
ages = Customer_Demographic['age']
age_range = ages.min(), ages.max()
age_range

(21.0, 179.0)

#### The age range in the records is implausibly wide (a maximum of 179 points to an erroneous DOB) and not relevant for the product, so we can restrict the data to customers whose age is less than 65.

In [58]:
# Preview the first rows of customers whose computed age is below 65.
# The result is only displayed; Customer_Demographic itself is not changed.
Customer_Demographic[Customer_Demographic['age']<65.0].head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure,age
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0,42.0
3,4,Talbot,,Male,33,1961-10-03,,IT,Mass Customer,N,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,No,7.0,62.0
4,5,Sheila-kathryn,Calton,Female,56,1977-05-13,Senior Editor,,Affluent Customer,N,NIL,Yes,8.0,46.0
5,6,Curr,Duckhouse,Male,35,1966-09-16,,Retail,High Net Worth,N,ðµ ð ð ð,Yes,13.0,57.0
6,7,Fina,Merali,Female,6,1976-02-23,,Financial Services,Affluent Customer,N,â°â´âµâââ,Yes,11.0,47.0


#### We can drop the `default` column because it does not give us any meaningful insight

### Customer Address 

### Reading data

In [12]:
# importing data from the "CustomerAddress" sheet of the same workbook
# (file_path is built in the earlier data-loading cell)
Customer_Address=pd.read_excel(file_path,sheet_name="CustomerAddress")

### Exploring customer address data

In [13]:
# Preview the first five rows (five is the default for head()).
Customer_Address.head()

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation
0,1,060 Morning Avenue,2016,New South Wales,Australia,10
1,2,6 Meadow Vale Court,2153,New South Wales,Australia,10
2,4,0 Holy Cross Court,4211,QLD,Australia,9
3,5,17979 Del Mar Point,2448,New South Wales,Australia,4
4,6,9 Oakridge Court,3216,VIC,Australia,9


In [14]:
# Report the (rows, columns) shape followed by a blank line, then the
# per-column non-null counts and dtypes.
print("The shape of data->", Customer_Address.shape, end="\n\n")
Customer_Address.info()

The shape of data-> (3999, 6)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         3999 non-null   int64 
 1   address             3999 non-null   object
 2   postcode            3999 non-null   int64 
 3   state               3999 non-null   object
 4   country             3999 non-null   object
 5   property_valuation  3999 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 187.6+ KB


#### Initially there are 3999 rows and 6 columns

In [15]:
# Number of missing values in each column.
Customer_Address.isna().sum()

customer_id           0
address               0
postcode              0
state                 0
country               0
property_valuation    0
dtype: int64

#### There are no null values in the data set.

#### We can convert all text values to a single case

#### We can use the change_upper function to convert the text values to upper case
*so that the records are consistent*

#### Checking the duplicate values

In [18]:
# Count rows that are exact duplicates of an earlier row.
Customer_Address.duplicated().sum()

0

#### There are no duplicate records present in the data set.
*So all records are unique.*

### Exploring columns

#### Generate a summary of count of all the outliers column wise

In [19]:
# Print the IQR-outlier count for every int64/float64 column of the
# address data: a label line, then the count on its own line.
for column in Customer_Address.columns:
    if Customer_Address[column].dtype in ('int64', 'float64'):
        outliers = detect_outliers_iqr(Customer_Address, column)
        print(f"the no. of outlier in {column} is:", outliers[column].count(), sep="\n")

the no. of outlier in customer_id is:
0
the no. of outlier in postcode is:
0
the no. of outlier in property_valuation is:
0


#### No outliers were found in the customer address data set

#### Checking the categorical columns: if different entries or values indicate the same category, change them to one category

In [27]:
# List the distinct values stored in the state column.
Customer_Address["state"].unique()

array(['New South Wales', 'QLD', 'VIC', 'NSW', 'Victoria'], dtype=object)

#### In the state column, 'New South Wales' and 'NSW' denote the same state, as do 'Victoria' and 'VIC'. So we can change them to 'NSW' and 'VIC'.

In [28]:
# List the distinct values stored in the country column.
Customer_Address["country"].unique()

array(['Australia'], dtype=object)