In [7]:
# Import necessary libraries
import pandas as pd
import os

# Path to the project folder
project_path = r'C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis'

# Paths to the data files
orders_checked_path = os.path.join(project_path, '02 Data', 'Prepared Data', 'orders_checked_clean.csv')
orders_products_prior_path = os.path.join(project_path, '02 Data', 'Original Data', 'orders_products_prior.csv')
products_checked_path = os.path.join(project_path, '02 Data', 'Prepared Data', 'products_checked_clean.csv')

# Function to check if the file exists
def check_file_exists(filepath):
    print(f"Checking if {filepath} exists...")
    if not os.path.exists(filepath):
        raise FileNotFoundError(f'{filepath} not found. Please check the file path or previous export steps.')
    print(f"{filepath} exists.")

# Check if the necessary files exist
check_file_exists(orders_checked_path)
check_file_exists(orders_products_prior_path)
check_file_exists(products_checked_path)

# Load the datasets
df_orders = pd.read_csv(orders_checked_path)
df_orders_prior = pd.read_csv(orders_products_prior_path)
df_products = pd.read_csv(products_checked_path)

# Check the shapes of the datasets
print('df_orders shape:', df_orders.shape)
print('df_orders_prior shape:', df_orders_prior.shape)
print('df_products shape:', df_products.shape)


Checking if C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\orders_checked_clean.csv exists...
C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\orders_checked_clean.csv exists.
Checking if C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Original Data\orders_products_prior.csv exists...
C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Original Data\orders_products_prior.csv exists.
Checking if C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\products_checked_clean.csv exists...
C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\products_checked_clean.csv exists.
df_orders shape: (3421083, 6)
df_orders_prior shape: (32434489, 4)
df_products shape: (49688, 5)


## Step 2: Merge the Orders Data with Orders Products Prior Data

We merge the orders data with the orders products prior data.


In [9]:
# Merge the orders data with the orders products prior data
df_merged_large = df_orders.merge(df_orders_prior, on='order_id', indicator=True)

# Check for full match using the merge flag
merge_counts = df_merged_large['_merge'].value_counts()
print("Merge flag counts:")
print(merge_counts)

# Remove the '_merge' column after confirming the merge is as expected
df_merged_large = df_merged_large.drop(columns=['_merge'])

# Check the shape of the merged dataframe
print("Shape of the merged dataframe:", df_merged_large.shape)


Merge flag counts:
_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64
Shape of the merged dataframe: (32434489, 9)


## Step 3: Export the Merged Data as a Pickle File

We export the merged data as a pickle file.


In [10]:
# Path to save the merged data
merged_data_path = os.path.join(project_path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')

# Export the merged data as a pickle file
df_merged_large.to_pickle(merged_data_path)
print(f"Merged data saved to {merged_data_path}")


Merged data saved to C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\orders_products_combined.pkl


In [12]:
# Import the merged data from the pickle file
df_orders_products_combined = pd.read_pickle(merged_data_path)

# Check the shape of the imported dataframe
print("Shape of the imported dataframe:", df_orders_products_combined.shape)


Shape of the imported dataframe: (32434489, 9)
