# 1. Importing libraries

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 2. Importing datasets

In [68]:
# Project folder path 
path = r'C:\Users\hp\08-2024 Instacart Basket Analysis\Data'
# Importing the orders dataset 
df_ords = pd.read_csv(os.path.join(path, 'Prepared Data', 'orders_wrangled.csv'), index_col =False)
# Importing Products dataset 
df_prods = pd.read_csv(os.path.join(path, 'Original Data', 'products.csv'), index_col = False)
# Importing departments dataset 
df_dep = pd.read_csv(os.path.join(path, 'Prepared data', 'departments_wrangled.csv'), index_col =False)

# 3. The consistency checks covered in the Exercise on df_prods dataframe

## 3.1 Mixed-Type Data

### Instacart data is already preped so we will practice fixing mixed-type data by creating a small test dataframe

In [13]:
#Create a dataframe
df_test = pd.DataFrame()
#Create a mixed type column
df_test['mix'] = ['a', 'b', 1,True]
#Check for mixed column 
for col in df_test.columns.tolist():
  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

mix


  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)


In [15]:
#Correction
df_test['mix'] = df_test['mix'].astype('str')

## 3.2 Missing Values 

In [18]:
#Check missing values products dataset
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

### the only column with missing values is the "product_name" column, and it’s missing 16 values.

In [25]:
#Checking the number of rows before dealing with the missing values 
df_prods.shape

(49693, 5)

In [27]:
#Creating a new DataFrame df_prods_clean that contains only the rows where product_name is not null.
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

#We could use : df_prods_clean = df_prods.dropna(subset=['product_name'])
df_prods_clean.shape

(49677, 5)

#### We have exactly 16 rows less, we excluded the missing values

## 3.3 Duplicates

In [48]:
# Looking for full duplicates within the df_prods_clean
df_dups = df_prods_clean[df_prods_clean.duplicated()]
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [50]:
# Adressing duplicates
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()
df_prods_clean_no_dups.shape

(49672, 5)

#### The 5 duplicates have been successfully deleted!

## 3.4 Extra-check

In [143]:
df_prods_clean_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


**-->prices: Very high variability and right-skewed distribution, indicating a few exceptionally high prices.**

In [149]:
# Calculate the IQR
Q1 = df_prods_clean_no_dups['prices'].quantile(0.25)
Q3 = df_prods_clean_no_dups['prices'].quantile(0.75)
IQR = Q3 - Q1

# Determine the bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = df_prods_clean_no_dups[df_prods_clean_no_dups['prices'] > upper_bound]

print("Outliers:")
print(outliers)
print(f"Number of outliers: {len(outliers)}")


Outliers:
       product_id                                       product_name  \
39             40  Beef Hot Links Beef Smoked Sausage With Chile ...   
83             84                                         Lamb Shank   
1932         1932                             Soy Chorizo Vegetarian   
1992         1992                       Beef (101445) Summer Sausage   
2005         2005                         Fresh Ground Turkey Breast   
...           ...                                                ...   
48343       48339                                      Lobster Tails   
48592       48588                         All Natural Ground Chicken   
48784       48780                              Olsen Pickled Herring   
48966       48962                       Hardwood Smoked Sliced Bacon   
49440       49436                              Imitation Crab Flakes   

       aisle_id  department_id  prices  
39          106             12    22.5  
83            7             12    24.3  
19

In [177]:
#It is hard to make any decision in that case, we will investigate a bit more :
outlier = df_prods_clean_no_dups[df_prods_clean_no_dups['prices'] == 99999]
print(outlier)

       product_id           product_name  aisle_id  department_id   prices
33666       33664  2 % Reduced Fat  Milk        84             16  99999.0


**Clearly it is an error and not an expensive product 'Milk'?!, we can remove it..**

In [167]:
# Define the extreme price value
extreme_price_value = 99999.0

# Create a new DataFrame excluding the outlier
df_prods_filtered = df_prods_clean_no_dups[df_prods_clean_no_dups['prices'] != extreme_price_value]

df_prods_filtered.describe()


Unnamed: 0,product_id,aisle_id,department_id,prices
count,49671.0,49671.0,49671.0,49671.0
mean,24850.172334,67.762115,11.728856,7.980256
std,14340.795118,38.3161,5.850806,66.952504
min,1.0,1.0,1.0,1.0
25%,12432.5,35.0,7.0,4.1
50%,24850.0,69.0,13.0,7.1
75%,37268.5,100.0,17.0,11.1
max,49688.0,134.0,21.0,14900.0


In [155]:
outlier2 = df_prods_clean_no_dups[df_prods_clean_no_dups['prices'] == 14900]
print(outlier2)

       product_id                      product_name  aisle_id  department_id  \
21554       21553  Lowfat 2% Milkfat Cottage Cheese       108             16   

        prices  
21554  14900.0  


**Is it an error too ? I can't decide it**

# 4. Taskwork 

## 4.1 Checking the orders dataframe

In [70]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


#### Insights :
**Unnamed:** This column doesn't hold meaningful data for analysis and could be dropped if it's just a redundant index.

**order_id:** nothing seems off here.

**user_id:** The user IDs range from 1 to about 2 million, which seems normal given the dataset size.

**order_number:** The maximum order_number being 100 could indicate a potential cap or limit in the dataset, but it’s not necessarily an issue.

**order_dow (Day of Week):** Values range from 0 to 6, which corresponds to the seven days of the week. This is consistent with expectations, and no issues are apparent here.

**order_hour_of_day:** The range of values is from 0 (midnight) to 23 (11 PM), which is expected for the hour of the day. Nothing unusual is noted.

**days_since_prior_order:** While most values seem reasonable, with a max of 30 days, the min being 0 could indicate consecutive orders or that some users place orders on the same day. This might be normal behavior but could warrant further exploration to ensure data accuracy.
*--> Review days_since_prior_order: Although the min value being 0 might make sense (e.g., same-day orders), it may be worth confirming that these cases are intentional and not due to data entry errors.*

## 4.2 Mixed-type data 

In [95]:
#Check for mixed column 
for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].map(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)
else : print ('None of the columns have mix data')      

None of the columns have mix data


## 4.3 Missing Values 

In [98]:
#Check missing values orders dataset
df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

#### The only column with missing values is the "days_since_prior_order" column, and it’s missing 206209 values (6%)
The "days_since_prior_order" column likely represents the number of days since a user's last order. The missing values in this column could be explained by the following reasons: 
First-Time Orders:
if a user places an order for the first time, there wouldn't be any prior data to calculate the gap in days between orders.


### 4.3.1 Steps to Check if Missing Values Indicate First-Time Orders:

In [117]:
#Identify Missing Values:
missing_values = df_ords[df_ords['days_since_prior_order'].isnull()]
#Check the Order Number:
first_time_orders = missing_values[missing_values['order_number'] == 1]
#Report Findings:
num_first_time_orders = len(first_time_orders)
total_missing = len(missing_values)
print(f"Out of {total_missing} missing values in 'days_since_prior_order', {num_first_time_orders} are first-time orders.")


Out of 206209 missing values in 'days_since_prior_order', 206209 are first-time orders.


#### Insights:
All 206,209 missing values in the "days_since_prior_order" column are from first-time orders.
These missing values occur because there’s no previous order to compare against for these first-time orders, so the system naturally leaves this field blank.

### 4.3.2 Addressing the missing values

Imputing with a numeric value could mislead the analysis, as it would incorrectly suggest that first-time orders have a specific time since their prior order.
--> maintaining the integrity of the dataset, it is better to either leave them as NaN or use a specific code to indicate the absence of prior order data

**--> Leaving missing values as NaN as it clearly represents missing or not applicable data.**

## 4.4 Duplicates 

In [133]:
# Looking for full duplicates within the df_prods_clean
df_dup = df_ords[df_ords.duplicated()]
df_dup

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order


**The dataframe created to check duplicates has returned empty, meaning there are no duplicate**

# 5. Exporting dataframes 

In [175]:
#Products dataframe:
df_prods_filtered.to_csv(os.path.join(path,'Prepared Data', 'prods_cleaned.csv'))
#Orders dataframe remain the same