# This script contains the following:
- Import of libraries
- Load datasets of Orders, Products & Depatments datasets
- Data manipulation on Orders dataset
- Data manipulation on Departments dataset
- Create a Data dictionary
- Create a Subset for snacks department
- Find the busiest hour for ordering
- Determine a value of 4 in the "department_id" column
- Create different subsets
- Get infos abour a user
- Export


# Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# Load files

In [2]:
# Define paths and filenames
data_path = r'/Users/macbook/Dropbox/Mac/Documents/Pro/Data_Analyst/Course_Career_foundry/A4_Python/2023.08_Instacart_basket_analysis/02_data'
orders_filename = 'orders.csv'
products_filename = 'products.csv'
departments_filename = 'departments.csv'

# Construct full file paths
orders_file_path = os.path.join(data_path, 'original data', orders_filename)
products_file_path = os.path.join(data_path, 'original data', products_filename)
departments_file_path = os.path.join(data_path, 'original data', departments_filename)

# Import dataframes
df_ords = pd.read_csv(orders_file_path, index_col=False)
df_prods = pd.read_csv(products_file_path, index_col=False)
df_dep = pd.read_csv(departments_file_path, index_col=False)

## Checking loading process

In [3]:
#checking orders dataframe load
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [4]:
#checking products dataframe load
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [5]:
#checking departement dataframe load
df_dep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   department_id  1 non-null      object
 1   1              1 non-null      object
 2   2              1 non-null      object
 3   3              1 non-null      object
 4   4              1 non-null      object
 5   5              1 non-null      object
 6   6              1 non-null      object
 7   7              1 non-null      object
 8   8              1 non-null      object
 9   9              1 non-null      object
 10  10             1 non-null      object
 11  11             1 non-null      object
 12  12             1 non-null      object
 13  13             1 non-null      object
 14  14             1 non-null      object
 15  15             1 non-null      object
 16  16             1 non-null      object
 17  17             1 non-null      object
 18  18             1 non-null      obj

# Data manipulation

# Manipulation on Orders dataframe

In [6]:
# Drop the 'eval_set' column from the 'df_ords' DataFrame
df_ords = df_ords.drop(columns=['eval_set'])

# Drop a column and count unique values including missing values
order_counts = df_ords['days_since_prior_order'].value_counts(dropna=False)

# Rename the 'order_dow' column to 'orders_day_of_week' in-place
df_ords.rename(columns={'order_dow': 'orders_day_of_week'}, inplace=True)

# Convert the 'order_id' column to string data type
df_ords['order_id'] = df_ords['order_id'].astype(str)

# Check the data type of the 'order_id' column
order_id_dtype = df_ords['order_id'].dtype


In [7]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


# Manipulation on Department dataframe

## Transpose department dataframe

In [8]:
# Transpose the DataFrame
df_dep = df_dep.T

# Display the transposed DataFrame
df_dep


Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


## Create index & new header

In [9]:
# Reset the index, creating a new default integer index
df_dep = df_dep.reset_index()

# Create a new header using the first row of the transposed DataFrame
new_header = df_dep.iloc[0]

# Display the new header
new_header

index    department_id
0           department
Name: 0, dtype: object

In [10]:
# Checking
df_dep

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


## Removing useless 1st row

In [11]:
# Create a new DataFrame without the first row (which was the old header)
df_dep = df_dep[1:]

# Set the column names of the new DataFrame to the values in the new header
df_dep.columns = new_header
df_dep


Unnamed: 0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta
10,10,bulk


# Create a Data dictionary

In [12]:
# Convert the 'df_dep' DataFrame to a dictionary with 'index' as the key
data_dict = df_dep.to_dict('index')
data_dict

{1: {'department_id': '1', 'department': 'frozen'},
 2: {'department_id': '2', 'department': 'other'},
 3: {'department_id': '3', 'department': 'bakery'},
 4: {'department_id': '4', 'department': 'produce'},
 5: {'department_id': '5', 'department': 'alcohol'},
 6: {'department_id': '6', 'department': 'international'},
 7: {'department_id': '7', 'department': 'beverages'},
 8: {'department_id': '8', 'department': 'pets'},
 9: {'department_id': '9', 'department': 'dry goods pasta'},
 10: {'department_id': '10', 'department': 'bulk'},
 11: {'department_id': '11', 'department': 'personal care'},
 12: {'department_id': '12', 'department': 'meat seafood'},
 13: {'department_id': '13', 'department': 'pantry'},
 14: {'department_id': '14', 'department': 'breakfast'},
 15: {'department_id': '15', 'department': 'canned goods'},
 16: {'department_id': '16', 'department': 'dairy eggs'},
 17: {'department_id': '17', 'department': 'household'},
 18: {'department_id': '18', 'department': 'babies'},
 

In [13]:
print(data_dict.get('19'))


None


# Create a Subset for snacks (2 different ways)

In [14]:
# Create a subset for df_prods
df_snacks =  df_prods[df_prods['department_id']==19]
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [15]:
# alternative with loc fn
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]
df_snacks_2.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


# Data manipulation in Orders data frame

In [16]:
# checking the data types
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                object 
 1   user_id                 int64  
 2   order_number            int64  
 3   orders_day_of_week      int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 156.6+ MB


In [17]:
#  Change user_id variable from int to string
df_ords['user_id'] = df_ords['user_id'].astype('str')
#checking results
df_ords.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                object 
 1   user_id                 object 
 2   order_number            int64  
 3   orders_day_of_week      int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 156.6+ MB


In [18]:
#checking variables names
df_ords.columns

# order_number is not clear. lets call it number_order_client
df_ords.rename(columns = {'order_number' : 'number_order_client'}, inplace = True)
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                object 
 1   user_id                 object 
 2   number_order_client     int64  
 3   orders_day_of_week      int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 156.6+ MB


# Find the busiest hour for ordering

In [20]:
## counts of orders per hour
busiest_hour = df_ords['order_hour_of_day'].value_counts() 
busiest_hour

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

### busiest_hour is 10 AM with 288 418 orders

# Create a subset for breakfast items

In [22]:
# Create a subset for breakfast items only
df_breakfast =  df_prods[df_prods['department_id']==4]
df_breakfast.head()


Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
30,31,White Pearl Onions,123,4,7.5
42,43,Organic Clementines,123,4,11.5
44,45,European Cucumber,83,4,14.3
65,66,European Style Spring Mix,123,4,11.7
88,89,Yogurt Fruit Dip Sliced Apples,123,4,12.6


# Creating a subset for party dinner

In [23]:
# products in dinner category: we have 7650 products, price ranging: 1 to 25
# following departments: alcohol, deli, beverages, and meat/seafood
df_dep

Unnamed: 0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta
10,10,bulk


In [24]:
# defining the dinner's department
dinner_department = [5,7,12,20]
df_dinner = df_prods[df_prods['department_id'].isin(dinner_department)]
df_dinner.describe()


Unnamed: 0,product_id,aisle_id,department_id,prices
count,7650.0,7650.0,7650.0,7650.0
mean,24721.196601,66.286536,9.563268,9.012458
std,14297.565684,36.84458,5.114123,4.997438
min,3.0,1.0,5.0,1.0
25%,12402.5,28.0,7.0,5.0
50%,24803.0,77.0,7.0,8.8
75%,36977.25,98.0,12.0,12.4
max,49684.0,134.0,20.0,25.0


In [25]:
df_dinner.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7650 entries, 2 to 49688
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     7650 non-null   int64  
 1   product_name   7647 non-null   object 
 2   aisle_id       7650 non-null   int64  
 3   department_id  7650 non-null   int64  
 4   prices         7650 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 358.6+ KB


the last df has 7650 rows.


# Get infos about User_id = 1

In [26]:
#defining user1
user1 = df_ords[df_ords['user_id']=='1']
user1

#getting infos
user1.describe()


Unnamed: 0,number_order_client,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


### NOTE: this provides the number of order, days & hours of orders, the average order and how long since the customer did the last order.

# Export Data

In [27]:
# Export df_ords wrangled data to prepared file directory
df_ords.to_csv(os.path.join(data_path,'prepared data', 'orders_wrangled.csv'))

# Export new dataframe df_dep_t_new
df_dep.to_csv(os.path.join(data_path,'prepared data', 'departments_wrangled.csv'))