In [78]:
# Import necessary libraries
import pandas as pd
import os

# Set the path to the project directory
path = r'C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis'

# Define column data types to avoid mixed-type data issues
dtypes = {
    'order_id': 'int64',
    'user_id': 'int64',
    'order_number': 'int64',
    'order_day_of_week': 'int64',
    'order_hour_of_day': 'int64',
    'days_since_prior_order': 'float64'
}

# Import datasets with explicit data types
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), dtype=dtypes)

# Section 1: Run df.describe() on df_ords and document findings
df_ords.describe()


Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,8.924952
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,5.0
50%,1710542.0,102689.0,11.0,3.0,13.0,8.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


### Observations from df.describe() on df_ords
- The min value for 'order_hour_of_day' is 0, which is expected.
- The max value for 'order_hour_of_day' is 23, which is within the expected range (0-23).
- The min value for 'days_since_prior_order' is 0, which makes sense as an order can be placed the same day.
- The max value for 'days_since_prior_order' is 30, which seems reasonable for tracking orders over a month.
- The statistics do not show any obvious anomalies, so no immediate concerns are observed here.


In [66]:
# Section 2: Check for mixed-type data in df_ords
print("Checking for mixed-type data in df_ords...")
for col in df_ords.columns.tolist():
    # Check if the type of the first element is different from the rest of the column
    weird = (df_ords[col].apply(type) != type(df_ords[col].iloc[0])).any()
    if weird:
        print(f'Mixed type found in column: {col}')


Checking for mixed-type data in df_ords...
Mixed type found in column: order_id
Mixed type found in column: user_id
Mixed type found in column: order_number
Mixed type found in column: order_day_of_week
Mixed type found in column: order_hour_of_day
Mixed type found in column: days_since_prior_order


### Observations from Mixed-Type Data Check on df_ords
- No mixed-type data was found in any columns of df_ords, indicating that the data types are consistent.


In [68]:
# Section 3: Check for missing values in df_ords
missing_values_ords = df_ords.isnull().sum()
print("Missing values in df_ords:")
print(missing_values_ords)


Missing values in df_ords:
order_id                  0
user_id                   0
order_number              0
order_day_of_week         0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64


### Observations from Missing Values Check on df_ords
- The 'days_since_prior_order' column has 206,209 missing values.
- No other columns have missing values.
- Missing values in 'days_since_prior_order' likely occur for first-time orders where there is no prior order to reference.


In [70]:
# Section 4: Address missing values in df_ords
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(df_ords['days_since_prior_order'].mean())
print("Missing values after treatment:")
print(df_ords.isnull().sum())


Missing values after treatment:
order_id                  0
user_id                   0
order_number              0
order_day_of_week         0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64


### Observations After Addressing Missing Values in df_ords
- Missing values in 'days_since_prior_order' have been filled with the column's mean value.
- This method was chosen to retain all rows and avoid losing data.


In [72]:
# Section 5: Check for duplicate values in df_ords
duplicates_ords = df_ords.duplicated().sum()
print(f"Number of duplicate rows in df_ords: {duplicates_ords}")


Number of duplicate rows in df_ords: 0


### Observations from Duplicate Values Check on df_ords
- No duplicate rows were found in df_ords.
- This indicates that there are no exact duplicate entries in the orders data.


In [74]:
# Section 6: Remove duplicate rows in df_ords
df_ords_clean = df_ords.drop_duplicates()
print(f"Number of rows after removing duplicates: {df_ords_clean.shape[0]}")


Number of rows after removing duplicates: 3421083


### Observations After Removing Duplicates in df_ords
- Since no duplicates were found, the number of rows remains the same after the operation.


In [76]:
# Section 7: Export the cleaned data
df_ords_clean.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'), index=False)


In [84]:
# Import necessary libraries
import pandas as pd
import os

# Path to the project folder
project_path = r'C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis'

# Paths to the data files
orders_checked_path = os.path.join(project_path, '02 Data', 'Prepared Data', 'orders_checked_clean.csv')
products_checked_path = os.path.join(project_path, '02 Data', 'Prepared Data', 'products_checked_clean.csv')
orders_products_combined_path = os.path.join(project_path, '02 Data', 'Prepared Data', 'orders_products_combined.csv')

# Function to check if the file exists
def check_file_exists(filepath):
    print(f"Checking if {filepath} exists...")
    if not os.path.exists(filepath):
        raise FileNotFoundError(f'{filepath} not found. Please check the file path or previous export steps.')
    print(f"{filepath} exists.")

# Check if the necessary files exist
check_file_exists(orders_checked_path)
check_file_exists(products_checked_path)
check_file_exists(orders_products_combined_path)

# Load the datasets
df_orders = pd.read_csv(orders_checked_path)
df_products = pd.read_csv(products_checked_path)
df_orders_products_combined = pd.read_csv(orders_products_combined_path)

# Check the shapes of the datasets
print('df_orders shape:', df_orders.shape)
print('df_products shape:', df_products.shape)
print('df_orders_products_combined shape:', df_orders_products_combined.shape)

# Task 4.5: Data Consistency Checks

# Section 1: Perform Consistency Checks on df_prods

# Step 1: Check for mixed-type data in df_prods
print("Checking for mixed-type data in df_prods...")
for col in df_products.columns.tolist():
    weird = (df_products[[col]].map(type) != df_products[[col]].iloc[0].map(type)).any(axis=1)
    if len(df_products[weird]) > 0:
        print(f'Mixed type found in column: {col}')

# Step 2: Check for missing values in df_prods
missing_values_prods = df_products.isnull().sum()
print("Missing values in df_prods:")
print(missing_values_prods)

# Step 3: Address missing values in df_prods
df_products['product_name'] = df_products['product_name'].fillna('Unknown')
print("Missing values after treatment:")
print(df_products.isnull().sum())

# Step 4: Check for duplicate values in df_prods
duplicates_prods = df_products.duplicated().sum()
print(f"Number of duplicate rows in df_prods: {duplicates_prods}")

# Step 5: Remove duplicate rows in df_prods
df_prods_clean = df_products.drop_duplicates()
print(f"Number of rows after removing duplicates: {df_prods_clean.shape[0]}")

# Step 6: Export the cleaned data
df_prods_clean.to_csv(os.path.join(project_path, '02 Data', 'Prepared Data', 'products_checked_clean.csv'), index=False)

# Section 2: Perform Consistency Checks on df_ords

# Step 1: Run df.describe() on df_ords
df_ords_describe = df_orders.describe()
print(df_ords_describe)

# Markdown cell for observations
observations = """
# Analysis and Observations:

1. 'order_number': The minimum value is 1, which is expected for the first order. However, the maximum value is 100, which seems unusually high for the number of orders placed by a single user.
2. 'order_day_of_week': The values range from 0 to 6, representing the days of the week (0 = Saturday, 1 = Sunday, etc.).
3. 'order_hour_of_day': The values range from 0 to 23, which is within the expected range for hours in a day.
4. 'days_since_prior_order': This column represents the number of days since the last order. The minimum value is 0 (valid for same-day orders), but the maximum value is 30, which could indicate a data entry error or a business rule.

"""
print(observations)

# Step 2: Check for mixed-type data in df_ords
print("Checking for mixed-type data in df_ords...")
for col in df_orders.columns.tolist():
    weird = (df_orders[[col]].map(type) != df_orders[[col]].iloc[0].map(type)).any(axis=1)
    if len(df_orders[weird]) > 0:
        print(f'Mixed type found in column: {col}')

# Step 3: Check for missing values in df_ords
missing_values_ords = df_orders.isnull().sum()
print("Missing values in df_ords:")
print(missing_values_ords)

# Step 4: Address missing values in df_ords
df_orders['days_since_prior_order'] = df_orders['days_since_prior_order'].fillna(df_orders['days_since_prior_order'].mean())
print("Missing values after treatment:")
print(df_orders.isnull().sum())

# Step 5: Check for duplicate values in df_ords
duplicates_ords = df_orders.duplicated().sum()
print(f"Number of duplicate rows in df_ords: {duplicates_ords}")

# Step 6: Remove duplicate rows in df_ords
df_ords_clean = df_orders.drop_duplicates()
print(f"Number of rows after removing duplicates: {df_ords_clean.shape[0]}")

# Step 7: Export the cleaned data
df_ords_clean.to_csv(os.path.join(project_path, '02 Data', 'Prepared Data', 'orders_checked_clean.csv'), index=False)


Checking if C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\orders_checked_clean.csv exists...
C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\orders_checked_clean.csv exists.
Checking if C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\products_checked_clean.csv exists...
C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\products_checked_clean.csv exists.
Checking if C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\orders_products_combined.csv exists...
C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\orders_products_combined.csv exists.
df_orders shape: (3421083, 6)
df_products shape: (49688, 5)
df_orders_products_combined shape: (6, 13)
Checking for mixed-type data in df_prods...
Missing values in d