# 1. Importing libraries

In [1]:
import pandas as pd
import numpy as np 
import os 

# 2. Importing datasets

In [3]:
# Project folder path (machine-specific: edit this to point at the local copy of the project data)
path = r'C:\Users\hp\08-2024 Instacart Basket Analysis\Data'
# All raw inputs live in the same sub-folder. Building that folder path once keeps the
# three reads from drifting apart in spelling/casing: the departments read previously used
# 'Original data', which only resolves on case-insensitive filesystems.
original_data_dir = os.path.join(path, 'Original Data')
# Importing the orders dataset (index_col=False: do not use the first column as the index)
df_ords = pd.read_csv(os.path.join(original_data_dir, 'orders.csv'), index_col=False)
# Importing the products dataset
df_prods = pd.read_csv(os.path.join(original_data_dir, 'products.csv'), index_col=False)
# Importing the departments dataset
df_dep = pd.read_csv(os.path.join(original_data_dir, 'departments.csv'), index_col=False)

# 3. Wrangling data

## 3.1. Orders dataframe

In [5]:
# Remove the 'eval_set' column from the orders dataframe and rebind df_ords to the result
df_ords = df_ords.drop('eval_set', axis=1)

In [7]:
# Identifier columns carry no numeric meaning, so store them as strings
for id_col in ('order_id', 'user_id'):
    df_ords[id_col] = df_ords[id_col].astype('str')

In [41]:
# Checking that the cast took effect: order_id and user_id should now be listed as object
df_ords.dtypes

order_id                   object
user_id                    object
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

## 3.2. Departments dataframe

In [9]:
# Flip rows and columns of the departments dataframe
df_dep_t = df_dep.transpose()

In [26]:
# The first row of the transposed frame holds the label to use as the column header
new_header = df_dep_t.iloc[0, :]

In [28]:
# Inspect the row that will become the column header
new_header

0    department
Name: department_id, dtype: object

In [30]:
# Keep every row after the header row (positional slice) as the new dataframe
df_dep_t_new = df_dep_t.iloc[1:]

In [32]:
# Use the extracted header row as the column labels of the new dataframe
df_dep_t_new = df_dep_t_new.set_axis(new_header, axis=1)

In [34]:
# Inspect the reshaped departments dataframe (department ids as index, one 'department' column)
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# 4. Data Dictionary

In [37]:
# Build a lookup of the form {department_id: {'department': name}} from the departments dataframe
data_dict = df_dep_t_new.to_dict(orient='index')

# 5. Task answers 

## 5.1 another identifier variable in the df_ords dataframe that doesn’t need to be included as a numeric variable and change it to a suitable format:

In [45]:
# Store order_number as a string as well, so it is no longer treated as a numeric variable
df_ords = df_ords.astype({'order_number': 'str'})

## 5.2 a variable in your df_ords dataframe with an unintuitive name and change its name without overwriting the dataframe.

In [49]:
# rename() returns a new dataframe by default; the result is only displayed, not assigned,
# so df_ords itself keeps the original 'order_dow' column name.
df_ords.rename(columns={'order_dow': 'orders_day_of_the_week'})

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


## 5.3 The busiest hour for placing orders

In [54]:
# Number of orders per hour of day, most frequent hour first; dropna = False also counts missing values
df_ords['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

### Insights: 
Busiest Hour: 10 AM is the peak hour for orders.

Active Period: The most active period spans from 10 AM to 3 PM.

Decline: A gradual decline in orders occurs after 3 PM, with a sharp drop-off in the evening and night.

Quietest Period: The early morning hours (12 AM to 6 AM) see the least activity.

## 5.4 The meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary.

In [59]:
# data_dict was built from df_dep_t_new and is keyed by the department id as a string,
# so the lookup uses '4' rather than the integer 4
department_4 = data_dict.get('4')
print(department_4)

{'department': 'produce'}


### Insight:
The department_id "4" corresponds to the category "produce".

## 5.5 The sales team wants to know more about breakfast item sales.

In [63]:
# Display the full data dictionary to find the department id of the 'breakfast' department
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

--> The breakfast id is : '14' 

In [66]:
# Subset of products that belong to the breakfast department (department_id 14)
is_breakfast = df_prods['department_id'] == 14
df_breakfast = df_prods.loc[is_breakfast]

In [68]:
# Inspect the breakfast subset
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


## 5.6 details about products that customers might use to throw dinner parties

#### Find all observations from the entire dataframe that include items from the following departments: alcohol, deli, beverages, and meat/seafood

##### Identifying the ids:
alcohol id : *5*,
deli id : *20*, 
beverages id : *7*,
meat/seafood id : *12*

In [77]:
# "Dinner party" subset: products from the alcohol (5), deli (20), beverages (7)
# and meat/seafood (12) departments
dinner_party_dept_ids = [5, 20, 7, 12]
in_dinner_party_dept = df_prods['department_id'].isin(dinner_party_dept_ids)
df_dinner_party = df_prods.loc[in_dinner_party_dept]

In [80]:
# Checking the results: display the dinner-party subset (the rendered output also reports its row count)
df_dinner_party

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


#### How many rows does the last dataframe have?
7650 rows 

(we could use df_dinner_party.shape)


## 5.7 Information about the customer with a "user_id" of "1"

In [91]:
# user_id was cast to string during wrangling, so compare against '1' rather than the integer 1
is_user_1 = df_ords['user_id'] == '1'
df_user_1 = df_ords[is_user_1]

In [93]:
# Results: all orders placed by the customer with user_id '1'
df_user_1

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [98]:
# Basic stats for the numeric columns only; order_id, user_id and order_number were cast to
# strings earlier in the notebook, so they do not appear in the summary
df_user_1.describe()

Unnamed: 0,order_dow,order_hour_of_day,days_since_prior_order
count,11.0,11.0,10.0
mean,2.636364,10.090909,19.0
std,1.286291,3.477198,9.030811
min,1.0,7.0,0.0
25%,1.5,7.5,14.25
50%,3.0,8.0,19.5
75%,4.0,13.0,26.25
max,4.0,16.0,30.0


#### Insights:
##### Total Orders: 
The user has placed a total of 11 orders.
##### Order Pattern: 
The user's orders fall only on `order_dow` values 1 to 4 (Monday to Thursday, assuming 1 = Monday; the day encoding is not documented in this notebook). Day 4 is the most frequent, followed by day 1.
##### Order Timing: 
The user tends to order in the morning, typically around 8 AM, with some orders extending into the early afternoon.
##### Order Frequency:
The user generally places orders every 19 days on average, with some variation from 0 to 30 days. The time between orders varies but clusters around two to four weeks.
##### Summary:
The user exhibits a consistent pattern of placing orders in the morning on days 1 to 4 of the week, every two to four weeks. This pattern suggests a routine driven by weekly planning and a flexible ordering schedule based on varying needs.



# 6. Exporting dataframes

In [105]:
# Orders dataframe: write the wrangled orders to the 'Prepared Data' folder.
# NOTE(review): with to_csv's default index=True the row index is written as an unnamed
# first column; confirm downstream notebooks expect that column before changing it.
orders_out_path = os.path.join(path, 'Prepared Data', 'orders_wrangled.csv')
df_ords.to_csv(orders_out_path)

In [107]:
# Departments dataframe: write the reshaped departments table to the 'Prepared Data' folder.
# The index holds the department ids, so it is written out together with the 'department' column.
departments_out_path = os.path.join(path, 'Prepared Data', 'departments_wrangled.csv')
df_dep_t_new.to_csv(departments_out_path)