# 01. Importing Libraries

In [17]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Importing Data

In [18]:
path = r'C:\Users\rajkaran\Documents\CareerFoundry- Data Analytics Program\2. Data Immersion\Achievements 4 - Tasks\07-06-2024 Instacart Basket Analysis\02 Data'

In [19]:
print(path)

C:\Users\rajkaran\Documents\CareerFoundry- Data Analytics Program\2. Data Immersion\Achievements 4 - Tasks\07-06-2024 Instacart Basket Analysis\02 Data


In [20]:
# Reading CSV files orders & products
df_ords = pd.read_csv (os.path.join(path,'Prepared Data','orders_wrangled.csv'), index_col = False)
df_prods = pd.read_csv (os.path.join(path,'Original Data','products.csv'), index_col = False)

# 03. Mixed-Type Data

In [90]:
# The current dataframes don't have any mixed-type columns so create one to work with
# Create a dataframe

df_test = pd.DataFrame()

In [28]:
# Create a mixed type column - creates a new column, mix, within df_test and fills it with numeric, string, and boolean values.
df_test['mix'] = ['a', 'b', 1, True]

In [47]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [88]:
# When you've reached a decision, the following code can be executed
df_test['mix'] = df_test['mix'].astype('str')

In [55]:
#function for checking whether a dataframe contains any mixed-type columns. Apply map function has been deprecated in the panda 
#library so using map function. 

for col in df_test.columns.tolist():
  weird = (df_test[[col]].map(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

# 04. Missing Values

In [91]:
# Missing data can be attributed to two reasons: data corruption or the data was never recorded in the first place

In [56]:
#to find missing observations or enteries
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

### The only column with missing values is the "product_name" column, and it’s missing 16 values

In [57]:
# To actually view these 16 values, you can create a subset of the dataframe containing only the values in question.
# Create a new dataframe, df_nan, containing only those values within the "product_name" column that meet the condition isnull() = True. 

df_nan = df_prods[df_prods['product_name'].isnull() == True] 

In [58]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [95]:
# Rows with missing values. In df_nan missing values here are product names, which are strings & String values can’t be imputed like numeric values. 
#You can either remove the missing values entirely or filter out the ones that aren’t missing into a subset dataframe and continue your analysis with 
#this new dataframe. 

# Addressing Missing Values

In [96]:
# There are several ways to deal with missing data:
# 1. Create a new variable that acts like a flag based on the missing value
# 2. Impute the value with the mean or median of the column (if the variable is numeric)
# 3. Remove or filter out the missing data

In [97]:
# If you choose to impute using the mean, use the following code to replace the missing values:
# df['column with missings'].fillna(mean value, inplace=True)
# If you choose to impute using the median, use the following code to replace the missing values:
# df['column with missings'].fillna(median value, inplace=True)

In [98]:
# Looking at df_nan it's clear imputation isn't an option because the data-type is a string
# You can either remove the missing values entirely or filter the non-missing values into a subset dataframe

In [64]:
#Run this function to compare the number of rows in your current dataframe with the number in your subset once the missing rows have been removed
df_prods.shape

(49693, 5)

In [67]:
# Create a new dataframe with rows not having any missing values. ie. clean dataframe

df_prods_clean = df_prods[df_prods['product_name'].isnull() == False] 

In [100]:
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [69]:
df_prods.shape

(49693, 5)

In [101]:
# Check the number of rows again to make sure it decreased
df_prods_clean.shape

(49677, 5)

In [71]:
# Another way you can drop all missing values is via the following command:

df_prods.dropna(inplace = True)

In [73]:
# If you wanted to use this command to drop only the NaNs from a particular column, the code would look like this:

df_prods.dropna(subset = ['product_name'], inplace = True)

In [102]:
# In these cases, rather than creating a new dataframe, you're overwriting df_prods with a new version of df_prods that doesn't contain missing values
# This is done using the 'inplace = True' function, which overwrites the dataframe
# If you don't specift an 'inplace' argument, the default is False which only returns a view of the changed dataframe

# 05. Duplicates

In [76]:
#The following command will look for full duplicates within your dataframe:

df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [78]:
#calling the df_dups dataframe this will display all the duplicate rows within your dataframe
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


# Addressing Duplicates

In [103]:
# Once the duplicates are identified, they must be removed using 'df.drop_duplicates()'

In [81]:
# To check the no. of rows
df_prods_clean.shape

(49677, 5)

In [83]:
#create a new dataframe that doesn’t include the duplicates

df_prods_clean_no_dups = df_prods_clean.drop_duplicates()


In [84]:
df_prods_clean_no_dups.shape

(49672, 5)

#### Now have 49,672 rows in the dataframe. The five duplicates have been successfully deleted

# 06. Exporting Changes

In [86]:
df_prods_clean_no_dups.to_csv(os.path.join(path, 'Prepared Data', 'products_checked.csv'))

# Exercise Tasks-4.5

## 2. Run the df.describe() function on your df_ords dataframe. Interpret the output of this function

In [104]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [105]:
# Lets look at the values for 'order_number', 'orders_day_of_week', and 'days_since_prior_order'. 
# 'order_number' has a maximum value of 100, which seems very high depending on the time frame
# 'orders_day_of_week' has a minimum of '0' and a maximum of '6' suggesting the days are zero-indexed (0 for Monday or Sunday)
# 'days_since_prior_order' has a maximum value of 30 may suggest orders are capped monthly
# Now, let's look at zero values for 'days_since_prior_order'
# The minimum value for 'days_since_prior_order' is 0, suggesting multiple orders on the same day or a placeholder for customers' first orders
# The count is also lower compared to the other columns, suggesting missing data
# Finally, 'order_id', 'user_id', and 'order_number' should be considered string values instead of numerical values.

## 3. Check for mixed-type data in your df_ords dataframe

In [116]:
# Check for mixed-type data in df_ords dataframe

for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].map(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

In [119]:
# No mixed-type data was found

In [107]:
# There's no mixed data present in the df_ords dataframe
df_ords.dtypes

Unnamed: 0                  int64
order_id                    int64
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

## 05. Run a check for missing values in your df_ords dataframe

In [108]:
df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [122]:
# The 'days_since_prior_order' column is missing 206209 values. These are probably first orders placed by customers, therefore any order placed for 
#the first time will have a missing value in the 'days_since_prior_order' column.
#Removing the missing values would result in the loss of important information about customers who just made their first order, 
#therefore here are the steps of how I plan to address the issue:
#1. Checking the frequency of missing values is the same as the frequency of order_number=1.
#2. Create a new column that flags new orders (True or False).


##  06. Address the missing values using an appropriate method

In [131]:
# Create subset of the dataframe that contains the nulls
df_ords_nan = df_ords[df_ords['days_since_prior_order'].isnull() == True]

In [132]:
df_ords_nan

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,


In [138]:
df_ords.fillna({'days_since_prior_order': 7}, inplace=True)

In [141]:
df_ords[df_ords['order_number']==1]

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,7.0
11,11,2168274,2,1,2,11,7.0
26,26,1374495,3,1,1,14,7.0
39,39,3343014,4,1,6,11,7.0
45,45,2717275,5,1,3,12,7.0
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,7.0
3420934,3420934,3189322,206206,1,3,18,7.0
3421002,3421002,2166133,206207,1,6,19,7.0
3421019,3421019,2227043,206208,1,1,15,7.0


## 07.  Run a check for duplicate values in your df_ords data

In [142]:
df_ords_dup = df_ords[df_ords.duplicated()]

In [143]:
df_ords_dup

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


In [144]:
# The dataframe created to check duplicates has returned empty, meaning there are no duplicates in the data set

## Export your final, cleaned df_prods and df_ords data as “.csv” files in your “Prepared Data” folde

In [147]:
df_ords.to_csv(os.path.join(path, 'Prepared Data', 'orders_consistency_check_cleaned.csv'))
df_prods.to_csv(os.path.join(path, 'Prepared Data', 'product_consistency_check_cleaned.csv'))