# Importing important libraries

In [2]:
import pandas as pd  # For DataFrames
import numpy as np  # For numeric calculations
import os  # For file management

# Import CSV files into Pandas

In [14]:
# Root folder of the Instacart project; every data file read or written in this
# notebook is located relative to this path via os.path.join(path, ...).
path = r"/Users/martin/anaconda_projects/11-02-2025 Instacart Basket Analysis"

In [22]:
# "products" data set
# Build the file location first, then read it into a dataframe.
products_csv = os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_prods = pd.read_csv(products_csv, index_col=False)

In [24]:
# "orders" data set
# Build the file location first, then read it into a dataframe.
orders_csv = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords = pd.read_csv(orders_csv, index_col=False)

#

In [32]:
# The df.describe() function returns descriptive statistics (count, mean, std,
# min, quartiles, max) for the numeric columns in your dataframe.
# NOTE(review): if the output lists an "Unnamed: 0" column, df_ords presumably
# still carries an index written out by an earlier export -- confirm and drop it
# where the orders data is wrangled.
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_po
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


#

# Mixed-Type Data

In [37]:
# Create a dataframe
df_test = pd.DataFrame()

In [39]:
# Create a mixed-type column: two strings, an integer and a boolean
df_test['mix'] = ['a', 'b', 1, True]

In [47]:
# Use head() function to see the new mixed-typed column:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [51]:
# Check for mixed types: print the name of every column whose values do not
# all share the Python type of the column's first value.
for col in df_test.columns.tolist():
    # Series.map(type) yields the type of each cell. It replaces
    # DataFrame.applymap, which is deprecated and emits a FutureWarning.
    weird = df_test[col].map(type) != type(df_test[col].iloc[0])
    # The mask is True wherever a cell's type differs from the first cell's.
    if weird.any():
        print(col)

mix


  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)


In [53]:
# Cast every value in the 'mix' column to a string so the column holds a single type
df_test['mix'] = df_test['mix'].astype(str)

#

# Missing Values

## Finding Missing Values

In [57]:
# Missing values must be located before they can be fixed:
# count the nulls in each column.
df_prods.isnull().sum(axis=0)

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [69]:
# That means product_name is missing 16 values!

In [65]:
# To view these 16 values, create a subset of the dataframe containing only the
# rows whose product_name is missing. isnull() already returns a boolean mask,
# so it can be used for indexing directly without comparing it to True.
df_nan = df_prods[df_prods["product_name"].isnull()]

In [67]:
# To check the results
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


#

## Addressing Missing Values

In [73]:
# There are a few ways to deal with missing data:

# 1) Create a new variable that acts like a flag based on the missing value.

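# 2) Impute the value with the mean or median of the column (if the variable is numeric).
# Mean: read it from df.describe() or compute df['column with missings'].mean(), then
#   df['column with missings'] = df['column with missings'].fillna(mean_value)
# Median: compute df['column with missings'].median(), then
#   df['column with missings'] = df['column with missings'].fillna(median_value)
# (Assigning the result back is more reliable than fillna(..., inplace=True) on a selected column.)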

# 3) Remove or filter out the missing data.

In [77]:
# 3) Before removing or filtering out missing data, record the starting size for comparison
df_prods.shape # To see the current number of rows in the dataframe

(49693, 5)

In [79]:
# The current dataframe (before removing or filtering) has 49693 rows and 5 columns

In [82]:
# Next, create a new (clean) dataframe that keeps only the rows whose
# product_name is present. notnull() is the direct inverse of the isnull()
# mask used to find the missing values, so no comparison with False is needed.
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [84]:
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [86]:
# The new "cleaned" dataframe has 16 fewer rows (49693 - 16 = 49677)!

In [88]:
# Another option would be: df_prods.dropna(inplace = True) # drops every row that has a NaN in ANY column
# df_prods.dropna(subset = ['product_name'], inplace = True) # to drop only the NaNs from a particular column!

#

# Duplicates

## Finding Duplicates

In [96]:
# Look for full duplicates within the dataframe: duplicated() marks every row
# that repeats an earlier row, and the mask then selects just those rows.
dup_mask = df_prods_clean.duplicated()
df_dups = df_prods_clean[dup_mask]

In [119]:
# Executed the command. This will display all the duplicate rows within your dataframe.
# df_dups

In [102]:
# You’ve now located your duplicate rows. Great! All that’s left is to address them.

## Addressing Duplicates

In [121]:
# Now that you’ve identified the duplicate rows within your dataframe, you need to delete them.
# Command: df_prods_clean.drop_duplicates()  (run it on the full dataframe, not on df_dups)

In [129]:
# Before doing so, let’s check the current number of rows in your dataframe:
df_prods_clean.shape

(49677, 5)

In [131]:
# Build a de-duplicated dataframe: for each set of identical rows, keep the
# first occurrence and drop the repeats.
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [133]:
df_prods_clean_no_dups.shape

(49672, 5)

In [140]:
# there are exactly five fewer rows: The duplicates are gone!

#

# Tidying Up and Exporting Changes

In [None]:
# Export the cleaned, de-duplicated dataframe. df_prods itself is never
# modified in this notebook, so exporting it would write the missing product
# names and the duplicate rows back out unchanged.
# NOTE(review): to_csv writes the index as an extra unnamed column by default;
# pass index=False if downstream notebooks should not receive it.
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'))