## This script contains the following points:

### Importing data & libraries
### 3. Importing data & libraries
###    Exploratory Analysis
### 4. Data Wrangling
### 5. Data Consistency Checks
### 6. Combining Data

## Importing Data & Libraries

In [67]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [68]:
# assigning path
path=r'C:\Users\Gal-E\CF Data Analysis\Achievement 4\03-2024 Instacart Basket Analysis'

# import "customers.csv" dataframe from "Original Data" folder
df_customers=pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))

In [69]:
df_customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [70]:
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [71]:
df_customers.describe().apply(lambda x: x.apply('{0:.5f}'.format))

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.50165,1.49982,94632.85255
std,59527.55517,18.48096,1.11843,42473.78699
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [72]:
# Number of Rows and Columns
df_customers.shape

(206209, 10)

## 4. Data Wrangling

In [73]:
df_customers

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
...,...,...,...,...,...,...,...,...,...,...
206204,168073,Lisa,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,4/1/2020,1,married,99799


In [74]:
# Renaming the columns in a way that we had them in "ords_prods_merge"
df_customers.rename(columns = {'First Name' : 'first_name', 'Surnam' : 'last_name', 'Gender' : 'gender', 'STATE' : 'state', 'Age' : 'age', 'n_dependants' : 'number_of_dependants', 'fam_status' : 'family_status'}, inplace = True)

In [75]:
df_customers

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,number_of_dependants,family_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
...,...,...,...,...,...,...,...,...,...,...
206204,168073,Lisa,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,4/1/2020,1,married,99799


In [76]:
df_customers.dtypes

user_id                  int64
first_name              object
last_name               object
gender                  object
state                   object
age                      int64
date_joined             object
number_of_dependants     int64
family_status           object
income                   int64
dtype: object

In [77]:
# Change Data Type of user_id
df_customers['user_id'] = df_customers['user_id'].astype('int64')

In [78]:
df_customers['user_id'].dtype

dtype('int64')

In [79]:
# changing the data type for 'date joined' to datetime
df_customers['date_joined'] = pd.to_datetime(df_customers['date_joined'])

## 5. Data Consistency Checks

In [80]:
# Checking for Missing Values
df_customers.isnull().sum()

user_id                     0
first_name              11259
last_name                   0
gender                      0
state                       0
age                         0
date_joined                 0
number_of_dependants        0
family_status               0
income                      0
dtype: int64

since we wont be adressing each customers by their names due to privacy we can leave that as it is

In [81]:
# checking for mixed data types 
for col in df_customers.columns.tolist():
  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_customers[weird]) > 0:
    print (col)

  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)


first_name


  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)


In [82]:
# changing data type of 'first_name' column
df_customers['first_name'] = df_customers['first_name'].astype('str')

In [83]:
# checking for duplicated rows
df_customers.duplicated().sum()

0

## 6. Combining Data

In [84]:
# importing "ords_prods_merge_derived_aggregated.pkl" dataframe from "Prepared Data" folder
ords_prods_merge=pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_derived_aggregated.pkl'))

In [85]:
# Checking Data Types
ords_prods_merge.dtypes

product_id                  int64
product_name               object
aisle_id                    int64
department_id               int64
prices                    float64
order_id                    int64
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
add_to_cart_order           int64
reordered                   int64
price_label                object
busiest_day                object
busiest_days               object
busiest_period_of_day      object
max_order                   int64
loyalty_flag               object
average_spend             float64
spender_flag               object
customer_frequency        float64
order frequency flag       object
order_frequency_flag       object
dtype: object

In [86]:
# Checking Data Types for customers.csv
df_customers.dtypes

user_id                          int64
first_name                      object
last_name                       object
gender                          object
state                           object
age                              int64
date_joined             datetime64[ns]
number_of_dependants             int64
family_status                   object
income                           int64
dtype: object

In [87]:
# removing the duplicated column from the ord_prod table
ords_prods_merge.drop(['order frequency flag'], axis = 1, inplace = True)

In [88]:
# merging the two dataframes 'customers' and 'ords_prods_merge'
df_merged = ords_prods_merge.merge(df_customers, on = 'user_id', indicator = True)

In [89]:
# check for full merge of tables
df_merged['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

In [90]:
# dropping '_merge' column from the merged table
df_merged.drop(['_merge'], axis=1, inplace=True)

In [91]:
df_merged.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,order_frequency_flag,first_name,last_name,gender,state,age,date_joined,number_of_dependants,family_status,income
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,Frequent customer,Charles,Cox,Male,Minnesota,81,2019-08-01,1,married,49620
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,Frequent customer,Charles,Cox,Male,Minnesota,81,2019-08-01,1,married,49620
2,907,Premium Sliced Bacon,106,12,20.0,3160996,138,1,5,13,...,Frequent customer,Charles,Cox,Male,Minnesota,81,2019-08-01,1,married,49620
3,907,Premium Sliced Bacon,106,12,20.0,2254091,138,10,5,14,...,Frequent customer,Charles,Cox,Male,Minnesota,81,2019-08-01,1,married,49620
4,1000,Apricots,18,10,12.9,505689,138,9,6,12,...,Frequent customer,Charles,Cox,Male,Minnesota,81,2019-08-01,1,married,49620


In [92]:
# changing data type for 'gender' to category
df_merged['gender'] = df_merged['gender'].astype('category')

In [93]:
# changing data type for 'family_status' to category
df_merged['family_status'] = df_merged['family_status'].astype('category')

In [94]:
# Exporting 'df_merged' into 'Prepared Data' folder as pickle file
df_merged.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'all_merged.pkl'))