# 1.0 Preparing the new customer data set for observation/analysis

## Table of Contents
##### 1.0 Preparing the new customer data set for observation/analysis

- Carrying out wrangling checks
- Conducting consistency and data quality checks

##### 2.0 Import orders_products dataset for merge

- Export df_visual as orders_customers_visual in pickle format

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Load the new customer data set for part 1 of this exercise as df_new_cust.
# index_col = False tells pandas not to use the first CSV column as the index.
# NOTE(review): absolute, machine-specific path - consider a configurable base folder.

customers_csv = r'C:\Users\IDONG\Original data\customers_new.csv'
df_new_cust = pd.read_csv(customers_csv, index_col = False)

##### 1.1 Carrying out wrangling checks

In [3]:
# Preview the first five rows to get a feel for the columns and their values

df_new_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [4]:
# Dimensions of the customer dataframe as (rows, columns)
df_new_cust.shape

(206209, 10)

In [5]:
# Inspect dtypes, non-null counts and memory usage.
# This frame does not consume much memory, but a larger data set is imported later,
# so it's best to preserve space where possible.

df_new_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [6]:
# Downcast the integer columns to smaller dtypes to save memory.
# One dict-based astype replaces four near-identical cells.
# astype() wraps around silently when a value does not fit the target dtype,
# so each column's range is verified before the conversion.

downcast_dtypes = {
    'user_id': 'int32',
    'Age': 'int8',
    'n_dependants': 'int8',
    'income': 'int32',
}

for col, dtype in downcast_dtypes.items():
    limits = np.iinfo(np.dtype(dtype))
    assert df_new_cust[col].between(limits.min, limits.max).all(), f'{col} does not fit in {dtype}'

df_new_cust = df_new_cust.astype(downcast_dtypes)

In [10]:
# N.B. - The 'Gender' variable could be stored as an integer code if its only values are Male/Female.
# A quick check of the distinct values (dropna = False also counts missing entries)

df_new_cust['Gender'].value_counts(dropna = False)

Male      104067
Female    102142
Name: Gender, dtype: int64

In [11]:
# Encode gender as an integer code in a new 'sex' column: Male -> 1, Female -> 2.
# A single vectorised map replaces the two separate .loc assignments and stores
# real integers instead of the strings '1' and '2'.
# Any value other than 'Male'/'Female' becomes NaN, so unexpected categories stay visible.

df_new_cust['sex'] = df_new_cust['Gender'].map({'Male': 1, 'Female': 2})

In [13]:
# Confirm every row received a code (dropna = False would list uncoded rows as NaN)
df_new_cust['sex'].value_counts(dropna = False)

1    104067
2    102142
Name: sex, dtype: int64

In [14]:
# Preview the frame with the new 'sex' column alongside 'Gender'
df_new_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income,sex
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,2
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285,2
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568,1
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049,2
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374,2


In [15]:
# Downcast the gender code to int8 to keep the memory footprint small

df_new_cust['sex'] = df_new_cust['sex'].astype('int8')

In [16]:
# 'sex' now carries the same information, so the original 'Gender' column can be removed

df_new_cust = df_new_cust.drop('Gender', axis = 1)

In [17]:
# Confirming the changes made: 'Gender' should be gone and 'sex' should remain

df_new_cust.head()

Unnamed: 0,user_id,First Name,Surnam,STATE,Age,date_joined,n_dependants,fam_status,income,sex
0,26711,Deborah,Esquivel,Missouri,48,1/1/2017,3,married,165665,2
1,33890,Patricia,Hart,New Mexico,36,1/1/2017,0,single,59285,2
2,65803,Kenneth,Farley,Idaho,35,1/1/2017,2,married,99568,1
3,125935,Michelle,Hicks,Iowa,40,1/1/2017,0,single,42049,2
4,130797,Ann,Gilmore,Maryland,26,1/1/2017,1,married,40374,2


In [18]:
# Re-run info() to confirm the smaller dtypes and the memory saved, all in a bid to improve efficiency

df_new_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int32 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   STATE         206209 non-null  object
 4   Age           206209 non-null  int8  
 5   date_joined   206209 non-null  object
 6   n_dependants  206209 non-null  int8  
 7   fam_status    206209 non-null  object
 8   income        206209 non-null  int32 
 9   sex           206209 non-null  int8  
dtypes: int32(2), int8(3), object(5)
memory usage: 10.0+ MB


In [19]:
# The variable names are decent and understandable, but some are modified for the sake of uniformity:
#   - all headers in lower case ('STATE' -> 'state', 'Age' -> 'age')
#   - headers with two words separated by an underscore ('First Name' -> 'first_name')
#   - 'Surnam' (spelled this way in the source file) -> 'last_name', 'fam_status' -> 'status'
# One rename call with a full mapping replaces five separate inplace renames.

column_renames = {
    'First Name': 'first_name',
    'Surnam': 'last_name',
    'STATE': 'state',
    'Age': 'age',
    'fam_status': 'status',
}
df_new_cust = df_new_cust.rename(columns = column_renames)

In [24]:
# No variable is dropped yet, until the client objectives are fully understood.
# Further checks continue with summary statistics of the numeric columns.

df_new_cust.describe()

Unnamed: 0,user_id,age,n_dependants,income,sex
count,206209.0,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548,1.495332
std,59527.555167,18.480962,1.118433,42473.786988,0.499979
min,1.0,18.0,0.0,25903.0,1.0
25%,51553.0,33.0,0.0,59874.0,1.0
50%,103105.0,49.0,1.0,93547.0,1.0
75%,154657.0,66.0,3.0,124244.0,2.0
max,206209.0,81.0,3.0,593901.0,2.0


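**Nothing looks awkward in the statistics above.** Note, however, that the maximum income (593,901) lies far above the 75th percentile (124,244), so it may be worth an outlier check later.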

##### 1.2 Conducting consistency and data quality checks

In [25]:
# Checking for mixed type data: print every column that holds more than one Python type.
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated in recent pandas.

for col in df_new_cust.columns:
    if df_new_cust[col].map(type).nunique() > 1:
        print(col)

first_name


In [26]:
# 'first_name' has missing entries (194950 non-null of 206209 rows); NaN is a float while
# the names are str, which is what the mixed-type check reports.
# A plain astype('str') would turn every NaN into the literal text 'nan' and silently hide the gap,
# so the missing names are labelled explicitly first; every value is then a genuine str.

df_new_cust['first_name'] = df_new_cust['first_name'].fillna('Unknown').astype('str')

In [27]:
# To cross-check: no column name should be printed if every column now holds a single type.
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated in recent pandas.

for col in df_new_cust.columns:
    if df_new_cust[col].map(type).nunique() > 1:
        print(col)

In [28]:
# Now checking for possible missing values (count of nulls per column).
# NOTE: 'first_name' was converted to str in an earlier step, so the entries that were
# missing in the raw file (194950 non-null of 206209) are no longer counted as null here.

df_new_cust.isnull().sum()

user_id         0
first_name      0
last_name       0
state           0
age             0
date_joined     0
n_dependants    0
status          0
income          0
sex             0
dtype: int64

In [29]:
# Collect fully duplicated rows (every column identical to an earlier row)

is_duplicate = df_new_cust.duplicated()
df_new_cust_dup = df_new_cust[is_duplicate]

In [30]:
# Display the duplicated rows; an empty frame means there are none
df_new_cust_dup

Unnamed: 0,user_id,first_name,last_name,state,age,date_joined,n_dependants,status,income,sex


In [31]:
# Final check before merge: dimensions of the cleaned customer frame as (rows, columns)

df_new_cust.shape

(206209, 10)

In [32]:
# Last preview of the cleaned customer frame before the merge
df_new_cust.head()

Unnamed: 0,user_id,first_name,last_name,state,age,date_joined,n_dependants,status,income,sex
0,26711,Deborah,Esquivel,Missouri,48,1/1/2017,3,married,165665,2
1,33890,Patricia,Hart,New Mexico,36,1/1/2017,0,single,59285,2
2,65803,Kenneth,Farley,Idaho,35,1/1/2017,2,married,99568,1
3,125935,Michelle,Hicks,Iowa,40,1/1/2017,0,single,42049,2
4,130797,Ann,Gilmore,Maryland,26,1/1/2017,1,married,40374,2


# 2.0 Import orders_products dataset for merge

In [33]:
# Load the prepared orders_products data set as df_mega.
# NOTE(review): absolute, machine-specific path; pickle files should only be read from trusted sources.

orders_products_pkl = r'C:\Users\IDONG\Prepared data\orders_products_grouped_corrected.pkl'
df_mega = pd.read_pickle(orders_products_pkl)

In [34]:
# Checking the imported dataframe: dimensions as (rows, columns)

df_mega.shape

(32433030, 22)

In [35]:
# A quick review of the columns to identify a common (key) column shared with df_new_cust

df_mega.columns

Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'prices',
       'order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'add_to_cart_order',
       'reordered', 'price_range_loc', 'activity_levels', 'busy_period_levels',
       'max_order', 'loyalty_flag', 'av_price', 'spending_flag', 'ord_freq',
       'frequency_flag'],
      dtype='object')

In [36]:
# Checking the dtypes and memory usage of the dataframe

df_mega.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32433030 entries, 0 to 32433029
Data columns (total 22 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   product_id              int32  
 1   product_name            object 
 2   aisle_id                int16  
 3   department_id           int8   
 4   prices                  float16
 5   order_id                int32  
 6   user_id                 int32  
 7   order_number            int8   
 8   orders_day_of_week      int8   
 9   order_hour_of_day       int8   
 10  days_since_prior_order  float16
 11  add_to_cart_order       int8   
 12  reordered               int8   
 13  price_range_loc         object 
 14  activity_levels         object 
 15  busy_period_levels      object 
 16  max_order               int8   
 17  loyalty_flag            object 
 18  av_price                float16
 19  spending_flag           object 
 20  ord_freq                float16
 21  frequency_flag          object 

In [37]:
# Before the merge, columns that are not needed are removed to save memory and avoid a system crash.
# (N.B. - Some useful columns such as loyalty_flag are removed as well, but can be re-created later.)

unneeded_columns = [
    'activity_levels', 'loyalty_flag', 'spending_flag', 'frequency_flag',
    'busy_period_levels', 'order_id', 'product_id', 'ord_freq', 'aisle_id',
    'av_price', 'max_order', 'reordered', 'add_to_cart_order', 'price_range_loc',
]
df_mega = df_mega.drop(unneeded_columns, axis = 1)

In [38]:
# Confirming the effect of the change: the column count should have fallen.
# The columns needed as the foundation for re-creating the derived variables were kept.

df_mega.shape

(32433030, 8)

In [39]:
# Re-check dtypes and memory usage to see the space recovered by dropping columns
df_mega.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32433030 entries, 0 to 32433029
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   product_name            object 
 1   department_id           int8   
 2   prices                  float16
 3   user_id                 int32  
 4   order_number            int8   
 5   orders_day_of_week      int8   
 6   order_hour_of_day       int8   
 7   days_since_prior_order  float16
dtypes: float16(2), int32(1), int8(4), object(1)
memory usage: 866.1+ MB


In [41]:
# The merge of the datasets on 'user_id' will be represented as 'df_visual'.
#   - how = 'inner' is spelled out (it is the pandas default, so the result is unchanged).
#   - validate = 'one_to_many' makes pandas raise a MergeError if a user_id appears more than
#     once in df_new_cust, which would otherwise silently multiply the order rows.
# N.B. with an inner join the '_merge' indicator can only ever read 'both': order rows without
# a matching customer are simply dropped, so compare df_visual.shape with df_mega.shape to
# confirm that none were lost.

df_visual = df_new_cust.merge(df_mega, on = 'user_id', how = 'inner', validate = 'one_to_many', indicator = True)

In [42]:
# Checking the dimensions of the new dataframe as (rows, columns)

df_visual.shape

(32433030, 18)

In [43]:
# Confirming the merge was effective by counting the '_merge' indicator values.
# NOTE: the merge is an inner join, which keeps matched rows only, so 'left_only' and
# 'right_only' are always 0 here; the row count is the more telling check.

df_visual['_merge'].value_counts()

both          32433030
left_only            0
right_only           0
Name: _merge, dtype: int64

In [44]:
# Checking the list of columns in the merged dataframe.

df_visual.columns

Index(['user_id', 'first_name', 'last_name', 'state', 'age', 'date_joined',
       'n_dependants', 'status', 'income', 'sex', 'product_name',
       'department_id', 'prices', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', '_merge'],
      dtype='object')

In [45]:
# The '_merge' indicator column has served its purpose and is removed

df_visual = df_visual.drop('_merge', axis = 1)

In [46]:
# Final check before exporting the dataframe: dimensions as (rows, columns)

df_visual.shape

(32433030, 17)

##### Export df_visual as orders_customers_visual in pickle format

In [47]:
# Define the base project folder used to build the export path
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere

path = r'C:\Users\IDONG'

In [48]:
# Export the merged dataframe in pickle format.
# NOTE(review): the folder is spelled 'Prepared Data' here but 'Prepared data' in the import above -
# equivalent on a case-insensitive file system; confirm the intended spelling.
export_file = os.path.join(path, 'Prepared Data', 'orders_customers_visual.pkl')
df_visual.to_pickle(export_file)