# Preparing Notebook

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import os

### Importing data frames

In [3]:
# creating path for import
path = r'C:\Users\lifti\OneDrive\CareerFoundry\Data Immersion\Achievement4\Instacart Basket Analysis'

In [5]:
# testing if path works
path

'C:\\Users\\lifti\\OneDrive\\CareerFoundry\\Data Immersion\\Achievement4\\Instacart Basket Analysis'

In [9]:
# importing orders.csv
df_ords = pd.read_csv (os.path.join(path,'02 Data','Original Data','orders.csv'), index_col = False)

In [12]:
# testing if import worked
df_ords.head ()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [14]:
# importing products.csv
df_prods = pd.read_csv (os.path.join (path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [16]:
# testing if import worked
df_prods.head ()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


# Dropping Columns

In [18]:
# dropping columen eval_set
df_ords.drop (columns = ['eval_set'])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [19]:
# but now pandas did ONLY VISUALLY dropped 'eval_set',
# now we need to create a new dataframe where 'eval_set' is not included
# in this case we are overwriting the existing data frame
# But be cautions, it CAN NOT be undone
df_ords = df_ords.drop (columns = ['eval_set'])

In [37]:
# testing if column is dropped
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [None]:
# another possibilty would be to rename the dataframe with the dropped
# columns eg: df_ords_2 = df_ords.drop (...)

In [None]:
# dropping columns is necessary because sometimes not all the columns
# are needed for an analysis, or the colmuns has a lot of missing values

### Finding missing values

#### Missing values are displayed as NaN in Python

In [34]:
# finding missing values in 'days_since_prior_order'
# the easiest way is to count the same records in the column
df_ords['days_since_prior_order'].value_counts (dropna = False)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

In [38]:
# if one leaves away (dropna = False) the function will only show you the
# columns existing values
df_ords['days_since_prior_order'].value_counts

<bound method IndexOpsMixin.value_counts of 0           NaN
1          15.0
2          21.0
3          29.0
4          28.0
           ... 
3421078    29.0
3421079    30.0
3421080    18.0
3421081     7.0
3421082    30.0
Name: days_since_prior_order, Length: 3421083, dtype: float64>

In [43]:
# let's drop the variable -> this makes no sense
df_ords.value_counts

<bound method DataFrame.value_counts of          order_id  user_id  order_number  order_dow  order_hour_of_day  \
0         2539329        1             1          2                  8   
1         2398795        1             2          3                  7   
2          473747        1             3          3                 12   
3         2254736        1             4          4                  7   
4          431534        1             5          4                 15   
...           ...      ...           ...        ...                ...   
3421078   2266710   206209            10          5                 18   
3421079   1854736   206209            11          4                 10   
3421080    626363   206209            12          1                 12   
3421081   2977660   206209            13          1                 12   
3421082    272231   206209            14          6                 14   

         days_since_prior_order  
0                           NaN  
1  

In [45]:
# let's drop the ['variable'] but keep the (dropna = False) -> this makes no sense
df_ords.value_counts (dropna = False)

order_id  user_id  order_number  order_dow  order_hour_of_day  days_since_prior_order
1         112108   4             4          10                 9.0                       1
2280792   55460    8             1          5                  8.0                       1
2280716   186185   37            3          17                 7.0                       1
2280717   54889    11            2          16                 20.0                      1
2280718   190573   3             3          14                 11.0                      1
                                                                                        ..
1140365   153493   2             6          9                  22.0                      1
1140366   92210    24            5          14                 25.0                      1
1140367   62988    32            0          9                  2.0                       1
1140368   108388   20            4          15                 24.0                      1
3421

# Renaming Columns

In [55]:
# renaming 'order_dow' as 'odrder_day_of_week'
# inplace = True makes sure that the original column is overwritten and not copied
df_ords.rename (columns = {'order_dow' : 'order_day_of_week'}, inplace = True)

In [57]:
# lets try to make a copy of 'order_dow' but name it 'order_day_of_week'
df_ords.rename (columns = {'order_day_of_week' : 'order_day_of_week2'})

Unnamed: 0,order_id,user_id,order_number,order_day_of_week2,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [60]:
# the function without (inplace = True) did rename 'order_day_of_week' to
# 'order_day_of_week2' but only visually. Testing did reveal that a 
# copy was NOT made and the renaming did also not happen
df_ords.head ()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


# Changing a Variable's Data Type

In [64]:
# some of the columns are not needed for descpriptive statistics
df_ords.describe ()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [69]:
# Checking the types of the columns
df_ords.dtypes

order_id                    int64
user_id                     int64
order_number                int64
order_day_of_week           int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [74]:
# chaging 'order_id' and 'user_id' from int64 to string/object
# again, these function can only be done with one variable
df_ords['order_id'] = df_ords['order_id'].astype('str')
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [76]:
# checking if it did work
df_ords.dtypes

order_id                   object
user_id                    object
order_number                int64
order_day_of_week           int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [78]:
# checking if it did work
df_ords.describe ()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3214874.0
mean,17.15486,2.776219,13.45202,11.11484
std,17.73316,2.046829,4.226088,9.206737
min,1.0,0.0,0.0,0.0
25%,5.0,1.0,10.0,4.0
50%,11.0,3.0,13.0,7.0
75%,23.0,5.0,16.0,15.0
max,100.0,6.0,23.0,30.0


# Transposing Data

In [82]:
# import departments.csv
df_dep = pd.read_csv (os.path.join (path, '02 Data', 'Original Data', 'departments.csv'), index_col = False)

In [98]:
# testing if departments.csv was imported
df_dep

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [84]:
# testing if departments.csv was imported -> describe seems odd,
# because of count, unique, freq and because the function head only 
# showed department_id and words
df_dep.describe()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
count,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
unique,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
top,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing
freq,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [85]:
# Transposing df_dep (is itself easy, but what needs to be done such
# that it is workable is very difficult)
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [86]:
# now we need to make sure, that the data frame is saved in the transposed way
# and naming the data frame differently 'df_dep_t'
df_dep_t = df_dep.T

In [88]:
# testing if transposition did work -> seems again odd, but also the
# count, unique, freq numbers did change
df_dep_t.describe ()

Unnamed: 0,0
count,22
unique,22
top,department
freq,1


In [99]:
# the data is transposed, with an odd 0 at the top
# and the second row seems to be the header and not the 0
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


## Correcting the header of the data frame after transposing

#### 01. Creating an Index for the data frame

In [112]:
# creating index for df_dep_t
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


#### 02. Create a new header

In [113]:
# tell python that row 0 should be the new header
new_header = df_dep_t.iloc[0]

In [117]:
# check if adding new header worked
df_dep_t.head()

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce


In [120]:
# it did not work as I believed it should work, because all we have done
# was creating a new varible (new_header) where we copied the contents
# of row 0 into
new_header

0    department
Name: department_id, dtype: object

#### 03. Remove the first row in the dataframe

In [121]:
# the removing needs to be done, because otherwise with the new header we
# would have the same data 2 times in the data frame
# To do that we have to copy just the data starting with index 1 into a
# new data frame
df_dep_t_new = df_dep_t[1:]

In [123]:
# checking if it worked
df_dep_t_new.head()

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol


#### 04. Add the new header

In [124]:
# copy the new variable (new_header) into the new data frame (df_dep_t_new)
df_dep_t_new.columns = new_header

In [125]:
# check if it worked
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [126]:
# the only thing missing is an index (we created one for the data frame
# 'df_dep_t')

In [None]:
# Recap: 
# 1. Transpose 
# 2. add and index if needed 
# 3. Copy the header that we want frome the row into a new variable 
# 4. make a copy of the data frame without the
# row where the data is stored of the new header (we don't want to 
# have the data sets two times in the data frame) 
# 5. Insert the new variale = new header into the new data frame

# Data Dictionaries

In [127]:
df_prods

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [132]:
# creating a data dictionary (basically creating a new varibale)
# ('index') means that we tell Python to use the numbered rows as the 
# key values for the entries in the dictionary
data_dict = df_dep_t_new.to_dict('index')

In [133]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [135]:
# testing the data dictionary
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [136]:
# testing the data dictionary
print(data_dict.get('19'))

{'department': 'snacks'}


# Subsetting

### Subsetting basically means filter data

In [139]:
# creating a new variable where only the relevant data is included
# df_prods[...['...']...] means Python should look into this dataframe
# ...[df_prods['department_id']...] means Python should look for a column
# ...[...['...']==19] means Python should look only for data with department_id 19
df_snacks = df_prods[df_prods['department_id']==19]

In [140]:
# step by step (right hand side)
df_prods['department_id']==19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [141]:
# step by step (right side of =)
df_prods[df_prods['department_id']==19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [142]:
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [145]:
# 2nd way of achieving the same result with .loc
df_snacks2 = df_prods.loc[df_prods['department_id']==19]

In [146]:
df_snacks2

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [148]:
# 3nd way of achieving the same result with .loc
df_snacks3 = df_prods.loc[df_prods['department_id'].isin([19])]

In [149]:
df_snacks3

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


# Exporting Dataframes

In [None]:
# df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_wrangled.csv'))

# TASK

## 01. If you haven't done so already, perform the wrangling procedures you walked through in this Exercise on your project data in a new notebook for this Exercise. Then, add a new section heading to separate your wrangling procedures from the procedures you’ll be conducting in the steps below.

#### I have done the data wrangling during the learning.

## 02. Find another identifier variable in the df_ords dataframe that doesn’t need to be included in your analysis as a numeric variable and change it to a suitable format.

In [151]:
# finding the types of each variable
df_ords.dtypes

order_id                   object
user_id                    object
order_number                int64
order_day_of_week           int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [153]:
# the numbers in 'order_id' and 'user-id' need not be included
# but I want to check if order_number is needed
df_ords.head(20)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


#### The columns 'Order_id' and 'User_id' are not needed for analysis, 'order_number' is important because it seems to be the order history of a particular user

## 03. Look for a variable in your df_ords dataframe with an unintuitive name and change its name without overwriting the data frame.

In [160]:
# Changing 'order_number' to 'order_number_history'
df_ords.rename (columns = {'order_number' : 'order_number_history'}, inplace = True)

In [161]:
# checking if it worked
df_ords

Unnamed: 0,order_id,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


## 04. Your client wants to know what the busiest hour is for placing orders. Find the frequency of the corresponding variable and share your findings.

In [167]:
# finding frequency of 'order_hour_of_day'
df_ords['order_hour_of_day'].value_counts (dropna = False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

#### The busiest hour of the day for ordering food is 10 am.

## 05. Determine the meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary.

In [171]:
# the data dictionary of 'departments.csv' is called data_dict
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

#### The value '4' in the 'department_id' data frame is 'produce.'

## 06. The sales team in your client’s organization wants to know more about breakfast item sales. Create a subset containing only the required information.

In [190]:
# 'breakfast' has the deparment_id 14. I am creating a subset of 
# the dataframe df_prods with the relevant data and == function
df_breakfast = df_prods [df_prods['department_id']==14]

In [191]:
# checking if it worked
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


## 07. They’d also like to see details about customers who might be throwing dinner parties. Your task is to find all observations from the entire dataframe that include items from the following departments: alcohol, deli, beverages, and meat/seafood. You’ll need to present this subset to your client.

In [193]:
# alcohol = 5, deli = 20, beverages = 7, meat/seafood = 13
# Creating a subset with .loc and isin functions
df_dinner_party = df_prods.loc[df_prods['department_id'].isin([5,20,7,13])]

In [196]:
df_dinner_party

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
4,5,Green Chile Anytime Sauce,5,13,4.3
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
...,...,...,...,...,...
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49684,49680,All Natural Creamy Caesar Dressing,89,13,4.9
49686,49682,California Limeade,98,7,4.3


## 08. It’s important that you keep track of total counts in your dataframes. How many rows does the last dataframe you created have?

In [198]:
df_dinner_party

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
4,5,Green Chile Anytime Sauce,5,13,4.3
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
...,...,...,...,...,...
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49684,49680,All Natural Creamy Caesar Dressing,89,13,4.9
49686,49682,California Limeade,98,7,4.3


#### My last created data frame, 'df_dinner_party', has 12114 rows.

## 09. Someone from the data engineers team in Instacart thinks they’ve spotted something strange about the customer with a "user_id" of “1.” Extract all the information you can about this user.

In [210]:
# finding all information of user_id 1
df_user_id_1 = df_ords[df_ords['user_id']==1]

In [211]:
df_user_id_1

Unnamed: 0,order_id,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order


In [212]:
# it did not work, probably because user_id is an object and not a number
# changing the type of user_id back to integer
df_ords['user_id'] = df_ords['user_id'].astype('int')

In [214]:
df_ords.dtypes

order_id                   object
user_id                     int32
order_number_history        int64
order_day_of_week           int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [215]:
#2 finding all information of user_id 1
df_user_id_1 = df_ords.loc[df_ords['user_id']==1]

In [216]:
df_user_id_1

Unnamed: 0,order_id,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


## 10. You also need to provide some details about this user’s behavior. What basic stats can you provide based on the information you have?

In [218]:
# descriptive statistics for df_user_id_1
df_user_id_1.describe ()

Unnamed: 0,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,11.0,10.0
mean,1.0,6.0,2.636364,10.090909,19.0
std,0.0,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,1.0,7.0,0.0
25%,1.0,3.5,1.5,7.5,14.25
50%,1.0,6.0,3.0,8.0,19.5
75%,1.0,8.5,4.0,13.0,26.25
max,1.0,11.0,4.0,16.0,30.0


#### The user with id 1 completed 11 orders on 10 different days
#### The mean of: 
#### order_day_of_week is between Monday and Tuesday (2,636)
#### order_hour_of_day lies between 10 and 11 am (10,09)
#### days_since_prior_order is 19 days
#### The maximum of:
#### order_day_of_week is Wednesday (4)
#### order_hour_of_day lies between 16 am
#### days_since_prior_order is 30 days
#### The mimimum of:
#### order_day_of_week is Sunday (1)
#### order_hour_of_day lies between 7 am
#### days_since_prior_order is 0 days

## 11. Check the organization and structure of your notebook. Be sure to include section headings and code comments.

## 12. Export your df_ords dataframe as “orders_wrangled.csv” in your “Prepared Data” folder.

In [220]:
# Exporting df_ords
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_wrangled.csv'))

## 13. Export the df_dep_t_new dataframe as “departments_wrangled.csv” in your “Prepared Data” folder so that you have a “.csv” file of your departments data in the correct format.

In [221]:
# Exporting df_dep_t_new
df_dep_t_new.to_csv(os.path.join(path, '02 Data','Prepared Data', 'departments_wrangled.csv'))

## 14. Save your Jupyter notebook and submit it here for your tutor to review.