# Section 1: Import Libraries

In [31]:
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
import os  # For operating system interactions

# Define project folder path

In [33]:
project_folder_path = r"C:\Users\marci\15-07-2024Instacart Basket Analysis\02 Data"

# Load data


In [35]:
orders_path = os.path.join(project_folder_path, "Original Data", "orders.csv")
df_ords = pd.read_csv(orders_path)

products_path = os.path.join(project_folder_path, "Original Data", "products.csv")
df_prods = pd.read_csv(products_path)

In [13]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [15]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [17]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


# Running discribe function on df_ords

In [37]:
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


### Data Summary for df_ords

- **order_id**: The `order_id` ranges from 1 to 3,421,083, which makes sense as it is a unique identifier for each order.
- **user_id**: The `user_id` ranges from 1 to 206,209, indicating the number of unique users.
- **order_number**: The `order_number` ranges from 1 to 100, which seems reasonable as it represents the sequence of orders for each user.
- **order_dow**: The `order_dow` ranges from 0 to 6, representing the days of the week (0 = Sunday, 6 = Saturday).
- **order_hour_of_day**: The `order_hour_of_day` ranges from 0 to 23, indicating the hour of the day when orders are placed.
- **days_since_prior_order**: The `days_since_prior_order` ranges from 0 to 30, which is reasonable for days between orders.

# Check for Mixed-Type data in df_ors

In [104]:
# Check for mixed-type data
for column in df_ords.columns:
    mixed_data = df_ords[column].apply(type).nunique() > 1
    if mixed_data:
        print(f"Column {column} has mixed data types.")

 # Check for Mixed-Type data in df_prods

In [105]:
# check for mixed-type data
for column in df_prods.columns:
    mixed_data = df_prods[column].apply(type).nunique() > 1
    if mixed_data:
        print(f"Column {column} has mixed data types.")

In [108]:
# Inspect mixed types in the 'product_name' column
mixed_types = df_prods['product_name'].apply(type).unique()
print("Mixed types in 'product_name':", mixed_types)

Mixed types in 'product_name': [<class 'str'>]


In [46]:
# Display rows with non-string values in 'product_name'
non_string_values = df_prods[~df_prods['product_name'].apply(lambda x: isinstance(x, str))]
print(non_string_values)

       product_id product_name  aisle_id  department_id  prices
33             34          NaN       121             14    12.2
68             69          NaN        26              7    11.8
115           116          NaN        93              3    10.8
261           262          NaN       110             13    12.1
525           525          NaN       109             11     1.2
1511         1511          NaN        84             16    14.3
1780         1780          NaN       126             11    12.3
2240         2240          NaN        52              1    14.2
2586         2586          NaN       104             13    12.4
3159         3159          NaN       126             11    13.1
3230         3230          NaN       120             16    14.4
3736         3736          NaN        41              8    14.8
4283         4283          NaN        77              7    14.4
4790         4790          NaN        91             16    14.5
38187       38183          NaN        39

In [110]:
# #Convert all entries in 'product_name' to string
df_prods['product_name'] = df_prods['product_name'].astype(str)

In [112]:
# #Verify conversion
mixed_types_after_conversion = df_prods['product_name'].apply(type).unique()
print("Types in 'product_name' after conversion:", mixed_types_after_conversion)

Types in 'product_name' after conversion: [<class 'str'>]


In [114]:
# Re-check for mixed-type data in the entire dataframe
for column in df_prods.columns:
    mixed_data = df_prods[column].apply(type).nunique() > 1
    if mixed_data:
        print(f"Column {column} still has mixed data types.")

In [127]:
# Fill NaN values in 'product_name' with 'Unknown'
df_prods['product_name'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_prods['product_name'].fillna('Unknown', inplace=True)


# Check for Missing Values in df_ords

In [116]:
# Check for missing values
missing_values = df_ords.isnull().sum()
print(missing_values)

order_id                  0
user_id                   0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64


# # Check for Missing Values in df_prods

In [118]:
missing_values = df_prods.isnull().sum()
print(missing_values)

product_id       0
product_name     0
aisle_id         0
department_id    0
prices           0
dtype: int64


#  Handling Missing Values

<!-- For days_since_prior_order in df_ords -->
The days_since_prior_order column has 206,209 missing values.These missing values likely indicate the customer's first order, as there is no prior order to compare.Therefore, we can replace missing values with 0 to indicate no prior orders.

<!-- For product_name in df_prods -->
The product_name column is essential for identifying products. Since there are only 16 missing values, it might be appropriate to drop these rows if the missing data is minimal compared to the total dataset size. If product names are crucial for analysis, dropping these rows is often the best approach.

# Handling Missing Values in df_ords

In [55]:
# Replace missing values in 'days_since_prior_order' with 0
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(0)

# Verify no missing values remain
print(df_ords['days_since_prior_order'].isnull().sum())


0


#  Handling Missing Values in df_prods

In [121]:
# Drop rows with missing 'product_name'
df_prods = df_prods.dropna(subset=['product_name'])

# Verify no missing values remain
print(df_prods['product_name'].isnull().sum())

0


# Check for Duplicate Values in df_ords

In [59]:
# Check for duplicate values
duplicates = df_ords.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 0


# Check for Duplicate Values in df_prods

In [123]:
# Check for duplicate values
duplicates = df_prods.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 0


# identify dupliacte rows

In [125]:
# Display duplicate rows
df_prods[df_prods.duplicated()]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices


# Handling Duplicate Values

Upon checking the df_prods DataFrame for duplicate rows, we found 5 duplicates:

	product_id	product_name	aisle_id	department_id	prices
462	462	Fiber 4g Gummy Dietary Supplement	70	11	4.8
18459	18458	Ranger IPA	27	5	9.2
26810	26808	Black House Coffee Roasty Stout Beer	27	5	13.4
35309	35306	Gluten Free Organic Peanut Butter & Chocolate ...	121	14	6.8
35495	35491	Adore Forever Body Wash	127	1

The next step is to drop these duplicate rows to ensure our data is clean.1	9.9

In [65]:
# Drop duplicate rows and keep the first occurrence
df_prods = df_prods.drop_duplicates()

# Verify that duplicates have been removed
duplicates_after = df_prods.duplicated().sum()
print(f"Number of duplicate rows after removal: {duplicates_after}")

Number of duplicate rows after removal: 0


In [95]:
missing_df_prods = df_prods.isnull().sum()
print("Missing values in combined DataFrame:")
print(missing_df_prods)

Missing values in combined DataFrame:
product_id       0
product_name     0
aisle_id         0
department_id    0
prices           0
dtype: int64


In [102]:
 df_prods.shape

(49688, 5)

In [75]:
missing_df_ords = df_ords .isnull().sum()
print("Missing values in combined DataFrame:")
print(missing_df_ords)

Missing values in combined DataFrame:
order_id                  0
user_id                   0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64


In [73]:
df_ords.drop('eval_set', axis=1, inplace=True)

# Exporting CSV Files

In [97]:

df_ords.to_csv(r"C:\Users\marci\15-07-2024Instacart Basket Analysis\02 Data\Prepared Data\orders_wrangled.csv")
df_prods.to_csv(r"C:\Users\marci\15-07-2024Instacart Basket Analysis\02 Data\Prepared Data\products_wrangled.csv", index=False, encoding='utf-8')


print(f"Cleaned data has been exported to:\n- {orders_wrangled_path}\n- {products_wrangled_path}")

Cleaned data has been exported to:
- C:\Users\marci\15-07-2024Instacart Basket Analysis\02 Data\Prepared Data\orders_wrangled.csv
- C:\Users\marci\15-07-2024Instacart Basket Analysis\02 Data\Prepared Data\products_wrangled.csv
