In [46]:
import pandas as pd
import os

# Path to the project folder.
# The INSTACART_PROJECT_PATH environment variable, when set to a non-empty
# value, overrides the machine-specific default below so the notebook can be
# run from another checkout without editing this cell.
project_path = os.environ.get('INSTACART_PROJECT_PATH') or (
    r'C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis'
)

# Paths to the data files (all resolved relative to project_path)
orders_checked_path = os.path.join(project_path, '02 Data', 'Prepared Data', 'orders_checked_clean.csv')
products_checked_path = os.path.join(project_path, '02 Data', 'Prepared Data', 'products_checked_clean.csv')
orders_products_prior_path = os.path.join(project_path, '02 Data', 'Original Data', 'orders_products_prior.csv')

# Function to check if the file exists
def check_file_exists(filepath):
    """Verify that ``filepath`` points to an existing regular file.

    A progress message is printed before the check and a confirmation
    message after it succeeds.

    Args:
        filepath: Path of the data file to verify.

    Raises:
        FileNotFoundError: If nothing exists at ``filepath`` or the path is
            not a regular file (for example, it is a directory).
    """
    print(f"Checking if {filepath} exists...")
    # isfile rather than exists: a directory at this path is not a usable data file.
    if not os.path.isfile(filepath):
        raise FileNotFoundError(f'{filepath} not found. Please check the file path or previous export steps.')
    print(f"{filepath} exists.")

# Verify each required input file in turn; the first missing one raises
# FileNotFoundError and stops the cell.
for required_path in (
    orders_checked_path,
    products_checked_path,
    orders_products_prior_path,
):
    check_file_exists(required_path)


Checking if C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\orders_checked_clean.csv exists...
C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\orders_checked_clean.csv exists.
Checking if C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\products_checked_clean.csv exists...
C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\products_checked_clean.csv exists.
Checking if C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Original Data\orders_products_prior.csv exists...
C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Original Data\orders_products_prior.csv exists.


In [48]:
# Read the three source CSV files into dataframes
df_orders = pd.read_csv(orders_checked_path)
df_products = pd.read_csv(products_checked_path)
df_orders_prior = pd.read_csv(orders_products_prior_path)

# Report (rows, columns) for each dataframe as a quick sanity check on the import
for frame_name, frame in (
    ("df_orders", df_orders),
    ("df_products", df_products),
    ("df_orders_prior", df_orders_prior),
):
    print(f"Shape of {frame_name}:", frame.shape)


Shape of df_orders: (3421083, 6)
Shape of df_products: (49688, 5)
Shape of df_orders_prior: (32434489, 4)


In [50]:
# Merge the orders data with the orders products prior data.
# No `how` is given, so this is pandas' default inner join: only order_ids
# present in BOTH dataframes survive into the result.
df_merged_large = df_orders.merge(df_orders_prior, on='order_id', indicator=True)

# Merge flag counts. NOTE: on an inner join every surviving row is flagged
# 'both', so these counts cannot reveal unmatched rows by themselves.
merge_counts = df_merged_large['_merge'].value_counts()
print("Merge flag counts:")
print(merge_counts)

# Real full-match check: count the rows on each side whose order_id has no
# partner on the other side (these are the rows the inner join leaves out).
orders_unmatched = int((~df_orders['order_id'].isin(df_orders_prior['order_id'])).sum())
prior_unmatched = int((~df_orders_prior['order_id'].isin(df_orders['order_id'])).sum())
print("Rows in df_orders with no match in df_orders_prior:", orders_unmatched)
print("Rows in df_orders_prior with no match in df_orders:", prior_unmatched)

# Remove the '_merge' column now that the merge has been inspected
df_merged_large = df_merged_large.drop(columns=['_merge'])

# Check the shape of the merged dataframe
print("Shape of the merged dataframe:", df_merged_large.shape)


Merge flag counts:
_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64
Shape of the merged dataframe: (32434489, 9)


In [52]:
# Destination for the merged dataframe inside the prepared-data folder
prepared_data_dir = os.path.join(project_path, '02 Data', 'Prepared Data')
merged_pickle_path = os.path.join(prepared_data_dir, 'orders_products_combined.pkl')

# Serialise the merged dataframe to disk in pickle format
df_merged_large.to_pickle(merged_pickle_path)

# Confirm where the file was written
print(f"Merged data saved to {merged_pickle_path}")


Merged data saved to C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\orders_products_combined.pkl


In [54]:
# Reload the merged dataframe from the pickle file at merged_pickle_path
df_orders_products_combined = pd.read_pickle(merged_pickle_path)

# Report the reloaded dataframe's (rows, columns)
combined_shape = df_orders_products_combined.shape
print("Shape of the imported dataframe (orders_products_combined):", combined_shape)


Shape of the imported dataframe (orders_products_combined): (32434489, 9)


In [56]:
# Merge the combined orders data with the products data.
# No `how` is given, so this is pandas' default inner join: rows whose
# product_id is missing from either dataframe are dropped from the result.
df_final_merged = df_orders_products_combined.merge(df_products, on='product_id', indicator=True)

# Merge flag counts. NOTE: on an inner join every surviving row is flagged
# 'both', so these counts cannot reveal dropped rows by themselves (the
# recorded run went from 32,434,489 input rows to 32,433,030 output rows
# while still reporting zero 'left_only' / 'right_only').
final_merge_counts = df_final_merged['_merge'].value_counts()
print("Final merge flag counts:")
print(final_merge_counts)

# Real full-match check: count the rows on each side whose product_id has no
# partner on the other side (these are the rows the inner join leaves out).
combined_unmatched = int(
    (~df_orders_products_combined['product_id'].isin(df_products['product_id'])).sum()
)
products_unmatched = int(
    (~df_products['product_id'].isin(df_orders_products_combined['product_id'])).sum()
)
print("Rows in df_orders_products_combined with no match in df_products:", combined_unmatched)
print("Rows in df_products with no match in df_orders_products_combined:", products_unmatched)

# Remove the '_merge' column now that the merge has been inspected
df_final_merged = df_final_merged.drop(columns=['_merge'])

# Check the shape of the final merged dataframe
print("Shape of the final merged dataframe:", df_final_merged.shape)


Final merge flag counts:
_merge
both          32433030
left_only            0
right_only           0
Name: count, dtype: int64
Shape of the final merged dataframe: (32433030, 13)


In [61]:
# Destination for the final merged dataframe inside the prepared-data folder
prepared_data_dir = os.path.join(project_path, '02 Data', 'Prepared Data')
final_merged_pickle_path = os.path.join(prepared_data_dir, 'ords_prods_merge.pkl')

# Serialise the final merged dataframe to disk in pickle format
df_final_merged.to_pickle(final_merged_pickle_path)

# Confirm where the file was written
print(f"Final merged data saved to {final_merged_pickle_path}")


Final merged data saved to C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\ords_prods_merge.pkl


# Task 4.6: Combining & Exporting Data

## Step 1: Import Libraries and Check File Existence
This step includes importing necessary libraries and checking the existence of required data files.

## Step 2: Import the Datasets
In this step, we import the datasets and check their dimensions to ensure they are loaded correctly.

## Step 3: Merge `df_orders` and `df_orders_prior`
We merge the `df_orders` and `df_orders_prior` dataframes on the `order_id` column and check for full matches.

## Step 4: Export the Merged Data in Pickle Format
Export the merged data to a pickle file for efficient storage and retrieval.

## Step 5: Import the Merged Data from Pickle Format
Import the previously saved pickle file to verify its integrity and correctness.

## Step 6: Merge with Products Data
Merge the combined orders dataframe (`df_orders_products_combined`) with the products dataframe (`df_products`) on the `product_id` column and check for full matches.

## Step 7: Export the Final Merged Data
Export the final merged dataframe to a pickle file for efficient storage and retrieval.
