## This script contains:

##### Import the pandas, NumPy, and os libraries
##### Import 'df_merged' as pickle
##### Merging 'df_merged' with 'df_prods'
##### Exporting new merged df as pickle

### Import the pandas, NumPy, and os libraries

In [6]:
# Import libraries

import pandas as pd
import numpy as np
import os

### Import 'df_merged' as pickle

In [7]:
# Project root folder, kept as a raw string so the backslashes are read literally

path = r'C:\Users\loren\Desktop\Career Foundry\2. Data Immersion Course\A4 Python Fundamentals for Data Analysts\07-2023 Instacart Basket Analysis'

# Both input files sit in the same 'Prepared Data' folder, so build that folder path once
prepared_dir = os.path.join(path, '02 Data', 'Prepared Data')

# Products come from a CSV file, the combined orders/products data from a pickle
df_prods = pd.read_csv(os.path.join(prepared_dir, 'products_checked.csv'), index_col = False)
df_merged = pd.read_pickle(os.path.join(prepared_dir, 'orders_products_combined.pkl'))


In [8]:
# Display 'df_prods' to check that the CSV import worked

df_prods

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,0,1,Chocolate Sandwich Cookies,61,19,5.8
1,1,2,All-Seasons Salt,104,13,9.3
2,2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...,...
49667,49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49668,49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49669,49690,49686,Artisan Baguette,112,3,7.8
49670,49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [9]:
# 'df_prods' was imported with an extra 'Unnamed: 0' column; remove it in place

df_prods.drop(columns = ['Unnamed: 0'], inplace = True)

In [10]:
# Display 'df_merged' to check that the pickle import worked

df_merged

# Displaying 'df_merged' directly is preferred over 'df_merged.shape' here because
# it shows the first rows, the last rows and the shape at once

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,0.0,196,1,0,both
1,2539329,1,1,2,8,0.0,14084,2,0,both
2,2539329,1,1,2,8,0.0,12427,3,0,both
3,2539329,1,1,2,8,0.0,26088,4,0,both
4,2539329,1,1,2,8,0.0,26405,5,0,both
...,...,...,...,...,...,...,...,...,...,...
32434484,2977660,206209,13,1,12,7.0,14197,5,1,both
32434485,2977660,206209,13,1,12,7.0,38730,6,0,both
32434486,2977660,206209,13,1,12,7.0,31477,7,0,both
32434487,2977660,206209,13,1,12,7.0,6567,8,0,both


In [11]:
# NOTE(review): this only assigns a variable named 'how'; none of the visible cells read it,
# and the merge further down does not pass how =, so it does not make that merge an outer join.
# TODO confirm whether this cell is still needed.
how = 'outer'

### Merging 'df_merged' with 'df_prods'

In [12]:
# Checking 'df_prods' to identify which column can be used as the merge key

df_prods

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49667,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49668,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49669,49686,Artisan Baguette,112,3,7.8
49670,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [13]:
# 'df_merged' already contains a '_merge' column, and merging with indicator = True
# raises an error when a column with the indicator's name already exists.
# The column is dropped into a new dataframe so that 'df_merged' itself is not overwritten.

ords_prods_combined = df_merged.drop('_merge', axis = 1)

In [14]:
# Join the orders data with the product details, using 'product_id' as the key.
# how = 'inner' is the pandas default and is only written out here: rows whose
# 'product_id' is missing from either dataframe are dropped, so the '_merge'
# indicator can only contain 'both' after this join.

ords_prods_combined = ords_prods_combined.merge(
    df_prods,
    how = 'inner',
    on = 'product_id',
    indicator = True
)

In [15]:
# Show (rows, columns) of the merged dataframe; comparing the row count with the one
# displayed for 'df_merged' shows whether the merge dropped any rows
ords_prods_combined.shape

(32404859, 14)

In [16]:
# Checking the merge flag counts

ords_prods_combined['_merge'].value_counts()

# Every row is flagged 'both'.
# NOTE(review): the merge above does not pass 'how', so pandas uses its default inner join,
# which keeps only rows found in both dataframes. An all-'both' result is therefore expected
# and does not by itself prove a full match; compare the row counts before and after the
# merge (or merge with how = 'outer') to confirm that no rows were left unmatched.

both          32404859
left_only            0
right_only           0
Name: _merge, dtype: int64

In [17]:
# Display the merged dataframe to inspect the columns added from 'df_prods'
ords_prods_combined

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,1320836,202557,17,2,15,1.0,43553,2,1,Orange Energy Shots,64,7,3.7,both
32404855,31526,202557,18,5,11,3.0,43553,2,1,Orange Energy Shots,64,7,3.7,both
32404856,758936,203436,1,2,7,0.0,42338,4,0,"Zucchini Chips, Pesto",50,19,6.9,both
32404857,2745165,203436,2,3,5,15.0,42338,16,1,"Zucchini Chips, Pesto",50,19,6.9,both


### Exporting new merged df as pickle

In [18]:
# Save the merged dataframe as a pickle file in the 'Prepared Data' folder
export_path = os.path.join(path, '02 Data','Prepared Data', 'orders_products_merged.pkl')
ords_prods_combined.to_pickle(export_path)