### Exercise 01. Concatenating Data

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [None]:
# Concatenating is good for combining data sets that have multiple rows and columns of the same length
# A common use of concatenation is combining two or more data sets that share the same characteristics but refer to different time periods

In [3]:
# January 2020 purchases, keyed by column name (one list entry per customer)
# Customer ids are kept as strings
data1 = dict(
    customer_id=['6732', '767', '890', '635'],
    month=['Jan-20'] * 4,
    purchased_meat=[0, 13, 3, 4],
    purchased_alcohol=[1, 2, 10, 0],
    purchased_snacks=[10, 5, 1, 7],
)

In [4]:
# February 2020 purchases for the same four customers, keyed by column name
# Customer ids are kept as strings
data2 = dict(
    customer_id=['6732', '767', '890', '635'],
    month=['Feb-20'] * 4,
    purchased_meat=[0, 10, 5, 3],
    purchased_alcohol=[2, 4, 14, 0],
    purchased_snacks=[15, 3, 2, 6],
)

In [7]:
# Build one dataframe per month from the dictionaries, labelling the rows 0-3 explicitly
df = pd.DataFrame(data=data1, index=[0, 1, 2, 3])
df_1 = pd.DataFrame(data=data2, index=[0, 1, 2, 3])

In [8]:
# Display the January 2020 dataframe
df

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7


In [9]:
# Display the February 2020 dataframe
df_1

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


In [None]:
# Now that the data dictionaries have been converted to dataframes, they can be concatenated
# To do so, create a list containing the dataframes you want to combine
# Then use the pandas function 'pd.concat()', saving the results in a new dataframe, 'df_concat'

In [10]:
# Collect the two monthly dataframes in a list so they can be passed to pd.concat() together
frames = [df, df_1]

In [11]:
# Stack the frames on top of each other; axis=0 is pd.concat()'s default, spelled out here
df_concat = pd.concat(frames, axis=0)

In [12]:
# Display the stacked result; each frame keeps its own row labels, so 0-3 appear twice
df_concat

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


In [None]:
# The default setting for pd.concat() is to stack any new rows after any existing ones, creating a long-format dataframe (axis = 0)
# However, if you want a wide-format dataframe it must be specified using the argument 'axis = 1'

In [14]:
# Place the frames side by side instead (axis=1); this replaces the long-format df_concat
df_concat = pd.concat(objs=frames, axis=1)

In [15]:
# Display the wide result: the columns of df are followed by the same-named columns of df_1
df_concat

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,customer_id.1,month.1,purchased_meat.1,purchased_alcohol.1,purchased_snacks.1
0,6732,Jan-20,0,1,10,6732,Feb-20,0,2,15
1,767,Jan-20,13,2,5,767,Feb-20,10,4,3
2,890,Jan-20,3,10,1,890,Feb-20,5,14,2
3,635,Jan-20,4,0,7,635,Feb-20,3,0,6


In [None]:
# To recap, the pd.concat() function is suitable for dataframes with rows/columns of the same length, places dataframes on top of each other (or side by side with 'axis = 1'), and requires a list of the dataframes to combine

### Exercise 02. Joining Data

In [None]:
# Combining data using the 'df.join()' function is very similar to 'df.merge()'
# The df.join function is typically only used when the index column carries some sort of information instead of just the number of rows 

### Exercise 03. Merging Data

In [18]:
# The df.merge() function is very versatile and has many options for producing different outcomes
# The best use cases for it are where dataframes you want to combine don't match in shape
# In these cases, you'll need a key or some sort of common identifier column that brings these data sets together.
# There are different ways to join mismatched data sets: inner join, left join, right join, and full outer join
# A 'full match' refers to having 100 percent of both dataframes in the new combined dataframe
# Inner join: Used to keep only information that's present in both data sets
# Left join: Used to keep information from the left dataframe, combining it with any information that can be mapped back to the dataframe on the left
# Right join: Used to keep information from the right dataframe, combining it with any information that can be mapped back to the dataframe on the right
# Full Outer Join: Used to keep all information from both dataframes, regardless of whether they match
# The df.merge() function comes with another important criterion for how data will be combined: the 'on' argument
# The 'on' argument designates a common identifier column on which to merge the data

In [19]:
# January 2020 data with a different set of columns: number of days each customer purchased on
data3 = dict(
    customer_id=['6732', '767', '890', '635'],
    month=['Jan-20'] * 4,
    days_purchased_on=[0, 10, 4, 1],
)

In [20]:
# Turn the third dictionary into a dataframe, again labelling the rows 0-3
df_2 = pd.DataFrame(data=data3, index=[0, 1, 2, 3])

In [None]:
# Now we have two dataframes of different shapes: df (five columns) and df_2 (three columns)

In [21]:
# Merge df_2 into df, matching rows on the shared 'customer_id' column only
df_merged = df.merge(right=df_2, on='customer_id')

In [22]:
# Display the merged dataframe
df_merged

Unnamed: 0,customer_id,month_x,purchased_meat,purchased_alcohol,purchased_snacks,month_y,days_purchased_on
0,6732,Jan-20,0,1,10,Jan-20,0
1,767,Jan-20,13,2,5,Jan-20,10
2,890,Jan-20,3,10,1,Jan-20,4
3,635,Jan-20,4,0,7,Jan-20,1


In [None]:
# The df.merge() function takes one dataframe and combines it with another dataframe, which you include within the parentheses
# The 'on = customer_id' tells pandas that this is the common identifier column between the two
# The result is a wider-format dataframe with all information from both dataframes and no missing values
# There are two month columns (month_x and month_y) because month was not selected as a common identifier and exists in both dataframes

In [23]:
# Merge again, this time matching rows on both shared columns
df_merged = df.merge(right=df_2, on=['customer_id', 'month'])

In [24]:
# Display the merged dataframe; 'month' now appears once because it is one of the merge keys
df_merged

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on
0,6732,Jan-20,0,1,10,0
1,767,Jan-20,13,2,5,10
2,890,Jan-20,3,10,1,4
3,635,Jan-20,4,0,7,1


In [None]:
# In this case we have two common identifier columns, customer_id and month

In [None]:
# An easy way to check for a full match between dataframes is via the 'indicator = True' argument
# This argument creates a new column that reports on the specifics of the merge 
# A column will be generated with a value that indicates the source of the data in that row
# A value of both means the key (or keys) you specified exist in both dataframes, while a value of left_only or right_only shows it exists only in the left/right dataframe

In [25]:
# Same two-key merge, with indicator=True adding a '_merge' column that records each row's source
df_merged = df.merge(right=df_2, on=['customer_id', 'month'], indicator=True)

In [26]:
# Display the merged dataframe, including the new '_merge' column
df_merged

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on,_merge
0,6732,Jan-20,0,1,10,0,both
1,767,Jan-20,13,2,5,10,both
2,890,Jan-20,3,10,1,4,both
3,635,Jan-20,4,0,7,1,both


In [30]:
# Count how many rows fall under each merge flag (both / left_only / right_only)
df_merged['_merge'].value_counts()

_merge
both          4
left_only     0
right_only    0
Name: count, dtype: int64

In [None]:
# All rows in the dataframe returned a value of 'both' so there's a full match

In [32]:
# Trial-run a merge before saving it, so a dataframe that might be needed later isn't overwritten
# Calling pd.merge() without assigning the result just displays the outcome
pd.merge(left=df, right=df_2, on=['customer_id', 'month'], indicator=True)

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on,_merge
0,6732,Jan-20,0,1,10,0,both
1,767,Jan-20,13,2,5,10,both
2,890,Jan-20,3,10,1,4,both
3,635,Jan-20,4,0,7,1,both


In [None]:
# Since pd.merge() is called on pandas itself rather than on an existing dataframe, both dataframes need to be included as arguments within the pd.merge() call

In [33]:
# There's another argument to be aware of when using the df.merge() function: the 'how' argument
# This argument specifies how you want your dataframes to be merged, and it takes the values left, right, inner (default), or outer
# These values specify the type of join you wish to use
# For example: df.merge(df_2, on = ['customer_id', 'month'], how = 'inner') 

### Exercise 04. Merging Your Instacart Data

In [None]:
# Our current df_ords and df_prods dataframes don't contain a common column
# To solve this problem, we're going to combine our df_ords dataframe with the order_products_prior data (loaded below as df_ords_prior)
# This dataframe contains a 'product_id' column (the same as the df_prods dataframe)
# By adding this to the df_ords dataframe we'll have created a common column between df_ords and df_prods

In [42]:
# Set path
# The project folder can be overridden with the INSTACART_PROJECT_PATH environment variable,
# so the notebook can run on another machine without editing this cell
# When the variable is not set, the original local folder is used
path = os.environ.get('INSTACART_PROJECT_PATH', r'C:\Users\lance\Documents\Achievement 4 Project')

In [49]:
# Load the two prepared files: 'orders_cleaned.csv' and 'orders_products_prior.csv'
# index_col=False tells pandas not to use the first column of each file as the row index
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_cleaned.csv'),
    index_col=False,
)
df_ords_prior = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_prior.csv'),
    index_col=False,
)

In [52]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the csv on a previous export)
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe to re-run:
# a second run no longer raises a KeyError once the column is already gone
df_ords = df_ords.drop(columns=['Unnamed: 0'], errors='ignore')

In [53]:
# Preview the first rows of the orders dataframe
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False


In [54]:
# Preview the first rows of the prior order/product dataframe (one row per product within an order)
df_ords_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [None]:
# Check the shapes of both dataframes

In [55]:
# (rows, columns) of the orders dataframe
df_ords.shape

(3421083, 7)

In [56]:
# (rows, columns) of the prior order/product dataframe
df_ords_prior.shape

(32434489, 4)

In [None]:
# Merging these two large data sets should be easy, in theory, because of their shared column 'order_id'
# As such, you should have a full matching 'order_id' column so there's no need to specify a type of join as the default (inner) will suffice

In [57]:
# Default (inner) merge of orders with their products on 'order_id', flagging each row's source in '_merge'
df_merged_large = df_ords.merge(right=df_ords_prior, on='order_id', indicator=True)

In [58]:
# This new dataframe combines both the df_ords and df_ords_prior dataframes and uses the 'order_id' column as its key 
# It also includes the indicator = True argument so that we can check for a full match

In [67]:
# (rows, columns) of the inner-merged dataframe
df_merged_large.shape

(32434489, 11)

In [68]:
# Preview the first rows: the order-level columns repeat for every product in that order
df_merged_large.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,True,196,1,0,both
1,2539329,1,1,2,8,,True,14084,2,0,both
2,2539329,1,1,2,8,,True,12427,3,0,both
3,2539329,1,1,2,8,,True,26088,4,0,both
4,2539329,1,1,2,8,,True,26405,5,0,both


In [60]:
# Count the rows under each merge flag for the inner merge
df_merged_large['_merge'].value_counts()

_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64

In [66]:
# All rows showing the 'both' value would seem to confirm a full match, however this is misleading
# What pandas does here is fill in information about each product for every 'order_id' in the df_ords dataframe
# This is why the resulting dataframe has 32,434,489 rows (the same total as the df_ords_prior dataframe)
# However, this doesn't mean there's a full match
# In this case, we chose the default option of inner join, meaning the resulting table will only contain observations found in both dataframes
# As such, the merge flag here will only show entries that have a value of 'both'
# Merging using an outer join will combine all the observations and show you the real merge rate
# Always double-check your merge rates using the outer join

In [72]:
# Repeat the merge as an outer join so unmatched rows from either side are kept and flagged
# This is what reveals the real merge rate
df_merged_large_outer = df_ords.merge(
    right=df_ords_prior,
    on='order_id',
    how='outer',
    indicator=True,
)

In [73]:
# Count the rows under each merge flag for the outer join
df_merged_large_outer['_merge'].value_counts()

_merge
both          32434489
left_only       206209
right_only           0
Name: count, dtype: int64

In [77]:
# The 'both' count (32,434,489) equals the row count of the inner merge (32,434,489),
# but the outer join also reports 206,209 'left_only' rows
# These are orders in df_ords with no matching 'order_id' in df_ords_prior, so this isn't a full match
# Isolate the 'left_only' rows for inspection
df_left_only = df_merged_large_outer.loc[df_merged_large_outer['_merge'] == 'left_only']

In [78]:
# Display the unmatched orders (the output is truncated in the middle for a dataframe this long)
df_left_only

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,_merge
59,1187899,1,11,4,8,14.0,False,,,,left_only
255,1492625,2,15,1,11,30.0,False,,,,left_only
344,2774568,3,13,5,15,11.0,False,,,,left_only
363,329954,4,6,3,12,30.0,False,,,,left_only
401,2196797,5,5,0,11,6.0,False,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...
32639379,1716008,206205,4,1,16,10.0,False,,,,left_only
32639665,1043943,206206,68,0,20,0.0,False,,,,left_only
32639889,2821651,206207,17,2,13,14.0,False,,,,left_only
32640567,803273,206208,50,5,11,4.0,False,,,,left_only


In [83]:
# According to the results the 'product_id', 'add_to_cart_order', and 'reordered' columns have 206,209 'NaN' values
# These three columns come from df_ords_prior; every row in df_left_only is an order with no match there,
# so the outer join filled them with NaN (a product that wasn't reordered appears to be stored as 0, not NaN)
# Why these 206,209 orders have no products in df_ords_prior is questionable and worth investigating
df_left_only.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order         0
first_order                    0
product_id                206209
add_to_cart_order         206209
reordered                 206209
_merge                         0
dtype: int64

### Exercise 05. Exporting Data in Pickle Format

In [None]:
# Advantages of exporting as a .csv
# Can be opened in multiple tools and programs, can be customized to include certain columns or rows when imported, high compression rate when zipped
# Disadvantages of exporting as a .csv
# Takes more time to import and export when data sets are large, can lead to index column issues when exporting and reimporting

In [None]:
# Advantages of exporting as a .pkl
# Can be imported and exported quickly, saves dataframes exactly as they look in Jupyter, has a high compression rate when zipped
# Disadvantages of exporting as a .pkl
# Are only accessible to python users
# Can't be customized to include certain columns or rows when imported

In [64]:
# The biggest difference when it comes to importing and exporting .csv files and .pkl files is efficiency
# The exporting syntax for both is very similar
# .csv: df_merged_large.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_products_combined.csv'))
# .pkl: df_merged_large.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_combined.pkl'))

In [None]:
# Importing .pkl files doesn't require an 'index_col' argument because they include this information already

### Task 02. Export the merged file in pickle format as 'orders_products_combined.pkl'

In [65]:
# Save the inner-merged orders/products dataframe to the 'Prepared Data' folder in pickle format
# NOTE(review): df_merged_large still carries the '_merge' indicator column at this point, so it is
# written to the file as well - confirm whether it should be dropped before exporting
df_merged_large.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
)