# Task 4.9 - Python Visualization Part 1

### Table of Contents

1. Importing libraries and data
2. Checking basic data facts
3. Wrangling data
4. Data quality and consistency checks
5. Combining cleaned/checked customer data with other Instacart data
6. Exporting as pickle

#### 1. Importing libraries and data

In [1]:
# Importing libraries

import pandas as pd
import os
import numpy as np

In [2]:
# Project root folder on the local machine (raw string so the backslashes stay literal)

project = r'D:\Adam\Employment\Data Analysis Course\Python Instacart project'

# Reading the customers CSV from the Original Data folder into df

df = pd.read_csv(
    os.path.join(project, '02 Data', 'Original Data', 'customers.csv'),
    index_col = False,
)

#### 2. Checking basic data facts

In [3]:
df.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [4]:
df.shape

(206209, 10)

In [5]:
df.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [6]:
# NOTE: date_joined is object (string) dtype at this point, so max() compares the text
# character by character rather than chronologically; convert the column with
# pd.to_datetime(df['date_joined']) first if the true latest date is needed.
df['date_joined'].max()

'9/9/2019'

In [7]:
# NOTE: date_joined is object (string) dtype at this point, so min() returns the
# lexicographically smallest string, which is not necessarily the earliest date.
df['date_joined'].min()

'1/1/2017'

In [8]:
df.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

#### 3. Wrangling Data

In [9]:
# converting user_id from integer to string (it is an identifier, not a number to do math on)

df['user_id'] = df['user_id'].astype(str)

In [10]:
# removing the first name, surname and date_joined columns, which are not needed for the analysis

df = df.drop(['First Name', 'Surnam', 'date_joined'], axis = 'columns')

In [11]:
df.head()

Unnamed: 0,user_id,Gender,STATE,Age,n_dependants,fam_status,income
0,26711,Female,Missouri,48,3,married,165665
1,33890,Female,New Mexico,36,0,single,59285
2,65803,Male,Idaho,35,2,married,99568
3,125935,Female,Iowa,40,0,single,42049
4,130797,Female,Maryland,26,1,married,40374


In [12]:
# renaming columns to be consistent in naming style (lowercase, like the other columns)
# one rename call with a single mapping replaces three separate in-place calls

df = df.rename(columns = {'Gender' : 'gender', 'STATE' : 'state', 'Age' : 'age'})

In [13]:
df.head()

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income
0,26711,Female,Missouri,48,3,married,165665
1,33890,Female,New Mexico,36,0,single,59285
2,65803,Male,Idaho,35,2,married,99568
3,125935,Female,Iowa,40,0,single,42049
4,130797,Female,Maryland,26,1,married,40374


#### 4. Data quality and consistency checks

In [14]:
# Checking each column for missing values, one column per cell, starting with user_id

df['user_id'].isnull().sum()

0

In [15]:
df['gender'].value_counts(dropna = False)

Male      104067
Female    102142
Name: gender, dtype: int64

In [16]:
df['state'].value_counts(dropna = False)

Florida                 4044
Colorado                4044
Illinois                4044
Alabama                 4044
District of Columbia    4044
Hawaii                  4044
Arizona                 4044
Connecticut             4044
California              4044
Indiana                 4044
Arkansas                4044
Alaska                  4044
Delaware                4044
Iowa                    4044
Idaho                   4044
Georgia                 4044
Wyoming                 4043
Mississippi             4043
Oklahoma                4043
Utah                    4043
New Hampshire           4043
Kentucky                4043
Maryland                4043
Rhode Island            4043
Massachusetts           4043
Michigan                4043
New Jersey              4043
Kansas                  4043
South Dakota            4043
Minnesota               4043
Tennessee               4043
New York                4043
Washington              4043
Louisiana               4043
Montana       

In [17]:
# I find it interesting that there is a (mostly) equal number of customers from each state.
# So clearly this is a subset of the overall Instacart data (or a fabricated set), selected to have equal numbers for each state.

In [18]:
df['age'].value_counts(dropna = False)

19    3329
55    3317
51    3317
56    3306
32    3305
      ... 
65    3145
25    3127
66    3114
50    3102
36    3101
Name: age, Length: 64, dtype: int64

In [19]:
df['age'].isnull().sum()

0

In [20]:
df['n_dependants'].value_counts(dropna = False)

0    51602
3    51594
1    51531
2    51482
Name: n_dependants, dtype: int64

In [21]:
# these numbers above are also remarkably similar... maybe the whole dataset is fabricated

In [22]:
df['fam_status'].value_counts(dropna = False)

married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: fam_status, dtype: int64

In [23]:
df['income'].isnull().sum()

0

In [24]:
# no missing values in any of my columns

In [25]:
# looking for duplicates

df.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
206204    False
206205    False
206206    False
206207    False
206208    False
Length: 206209, dtype: bool

In [26]:
# keeping only the rows flagged as full-row duplicates of an earlier row
df_dups = df.loc[df.duplicated()]

In [27]:
df_dups

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income


In [28]:
# no duplicates

In [29]:
# A quicker way to check for duplicates: sum the boolean flags returned by duplicated(),
# which gives the number of duplicated rows directly (0 means there are none)

df.duplicated().sum()

0

In [30]:
# The data all seems completely clean already

#### 5. Combining cleaned/checked customer data with other Instacart data

In [32]:
# Loading the prepared orders/products dataframe (pickle file) from the Prepared Data folder

df_orders = pd.read_pickle(
    os.path.join(project, '02 Data', 'Prepared Data', 'orders_products_new_vars.pkl')
)

In [33]:
df_orders.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,new_customer,product_id,add_to_cart_order,reordered,...,price_range_loc,busiest day,busiest days,busiest_period_of_day,max_order,loyalty_flag,avg_item_price,spender_type,avg_days_since_last_order,order_frequency
0,2539329,1,1,2,8,,True,196,1,0,...,Mid-range product,Regularly busy,Regularly busy,Average orders,10,Newer customer,6.367797,Low spender,20.259259,Non-frequent customer
1,2398795,1,2,3,7,15.0,False,196,1,1,...,Mid-range product,Regularly busy,Least busy,Average orders,10,Newer customer,6.367797,Low spender,20.259259,Non-frequent customer
2,473747,1,3,3,12,21.0,False,196,1,1,...,Mid-range product,Regularly busy,Least busy,Most orders,10,Newer customer,6.367797,Low spender,20.259259,Non-frequent customer
3,2254736,1,4,4,7,29.0,False,196,1,1,...,Mid-range product,Least busy,Least busy,Average orders,10,Newer customer,6.367797,Low spender,20.259259,Non-frequent customer
4,431534,1,5,4,15,28.0,False,196,1,1,...,Mid-range product,Least busy,Least busy,Most orders,10,Newer customer,6.367797,Low spender,20.259259,Non-frequent customer


In [34]:
df_orders['user_id'].dtype

dtype('int64')

In [35]:
# casting user_id to string so the merge key has the same type as user_id in df

df_orders['user_id'] = df_orders['user_id'].astype(str)

In [36]:
df_orders['user_id'].dtype

dtype('O')

In [37]:
# joining each order row with its customer's attributes on user_id
# (how = 'inner' is the merge default, written out here; indicator adds the _merge column)

df_merged = pd.merge(df_orders, df, how = 'inner', on = 'user_id', indicator = True)

In [38]:
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,new_customer,product_id,add_to_cart_order,reordered,...,spender_type,avg_days_since_last_order,order_frequency,gender,state,age,n_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,,True,196,1,0,...,Low spender,20.259259,Non-frequent customer,Female,Alabama,31,3,married,40423,both
1,2398795,1,2,3,7,15.0,False,196,1,1,...,Low spender,20.259259,Non-frequent customer,Female,Alabama,31,3,married,40423,both
2,473747,1,3,3,12,21.0,False,196,1,1,...,Low spender,20.259259,Non-frequent customer,Female,Alabama,31,3,married,40423,both
3,2254736,1,4,4,7,29.0,False,196,1,1,...,Low spender,20.259259,Non-frequent customer,Female,Alabama,31,3,married,40423,both
4,431534,1,5,4,15,28.0,False,196,1,1,...,Low spender,20.259259,Non-frequent customer,Female,Alabama,31,3,married,40423,both


In [40]:
# checking for consistency in new merged data

df_orders.shape

(32404859, 24)

In [41]:
df_merged.shape

(32404859, 31)

In [42]:
df_merged['_merge'].value_counts()

both          32404859
left_only            0
right_only           0
Name: _merge, dtype: int64

In [43]:
# of course I expected only "both" since this is an inner join, which keeps only rows whose
# user_id appears in both df_orders and df.  But the "shape" above shows that I have the same number of
# records as I started with in df_orders.  So apparently every customer in df_orders was in df

In [44]:
# the _merge indicator column is no longer needed once the merge has been checked

df_merged = df_merged.drop('_merge', axis = 'columns')

In [45]:
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,new_customer,product_id,add_to_cart_order,reordered,...,avg_item_price,spender_type,avg_days_since_last_order,order_frequency,gender,state,age,n_dependants,fam_status,income
0,2539329,1,1,2,8,,True,196,1,0,...,6.367797,Low spender,20.259259,Non-frequent customer,Female,Alabama,31,3,married,40423
1,2398795,1,2,3,7,15.0,False,196,1,1,...,6.367797,Low spender,20.259259,Non-frequent customer,Female,Alabama,31,3,married,40423
2,473747,1,3,3,12,21.0,False,196,1,1,...,6.367797,Low spender,20.259259,Non-frequent customer,Female,Alabama,31,3,married,40423
3,2254736,1,4,4,7,29.0,False,196,1,1,...,6.367797,Low spender,20.259259,Non-frequent customer,Female,Alabama,31,3,married,40423
4,431534,1,5,4,15,28.0,False,196,1,1,...,6.367797,Low spender,20.259259,Non-frequent customer,Female,Alabama,31,3,married,40423


#### 6. Exporting as pickle

In [46]:
# saving the merged dataframe to the Prepared Data folder as a pickle file

df_merged.to_pickle(
    os.path.join(project, '02 Data', 'Prepared Data', 'ic_merged.pkl')
)