# DATA WRANGLING & SUBSETTING

### This script contains the following:
#### 1. Importing libraries
#### 2. Importing data
#### 3. Wrangling data
#### 4. Data dictionary
#### 5. Subsetting
#### 6. Exporting dataframes
#### *. TASK

# --------------------------------------------------------------------------------------------------------------

## 1. Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

## 2. Importing data

In [2]:
# Root folder of the project; every file read or written below is located relative to it.
# The r'' prefix makes this a raw string, so the backslashes in the Windows path are not treated as escapes.
# NOTE(review): this is an absolute path on one user's machine - adjust it before running elsewhere.
path = r'C:\Users\javis\OneDrive\Documentos\Career Foundry\Data Immersion\Python\Instacart Basket Analysis'

In [5]:
# Read the orders and products files from the original-data folder under `path`.
# index_col = False tells pandas not to use the first column of the file as the dataframe index.
df_ords = pd.read_csv(os.path.join(path, '2. Data', '2.1. Original Data', 'orders.csv'), index_col = False)
df_prods = pd.read_csv(os.path.join(path, '2. Data', '2.1. Original Data', 'products.csv'), index_col = False)

## 3. Wrangling data

## 3.1. Dropping columns

In [6]:
# Firstly we get rid of the columns that we don't need. In this case, we drop "eval_set" from orders.csv.
# drop() returns a new dataframe, so this call only previews the result; df_ords itself is not changed here.
df_ords.drop(columns = ['eval_set'])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [7]:
# Then we overwrite the previous dataframe with the new one, without the unwanted column.
df_ords = df_ords.drop(columns = ['eval_set'])

In [8]:
# We can look for missing values with the value_counts function
df_ords ['days_since_prior_order'].value_counts(dropna = False)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

## 3.2. Renaming columns

In [9]:
# We can change column names to make them more representative.
df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

In [11]:
# We check if function worked as desired
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 3.3. Changing a variable's data type

In [12]:
# We can change the id variables to strings as we don't need statistical analysis on them.
df_ords['order_id'] = df_ords['order_id'].astype('str')

In [13]:
# We can double check that the change took place with the dtype attribute. "O" (capital letter O, not zero) means object, which is how pandas stores strings
df_ords ['order_id'].dtype

dtype('O')

## 3.4. Transposing data

In [16]:
# We are going to start importing a new dataframe, departments
df_dep = pd.read_csv(os.path.join(path, '2. Data', '2.1. Original Data', 'departments.csv'), index_col = False)

In [17]:
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [18]:
# To transpose it, the function is very simple.
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [20]:
# Now we will create a new version of this vertical dataframe
df_dep_t = df_dep.T

In [21]:
# Let's see what it looks like
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


## 3.4.1. Creating an index 

In [22]:
# reset_index() moves the current row labels into a column named 'index' and numbers the rows from 0.
# The result is only displayed here; it is not assigned, so df_dep_t itself is left unchanged.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


## 3.4.2. Creating a new header

In [23]:
# To create a new header, we need to follow 3 steps

In [27]:
# Step 1. Create the header
# Take the first row of df_dep_t (position 0); it holds the text that will become the column name
new_header = df_dep_t.iloc[0]

In [28]:
new_header 

0    department
Name: department_id, dtype: object

In [31]:
# Step 2. Get rid of the old header
# Create a new dataframe with the rows from position 1 onwards, leaving out the header row at position 0
df_dep_t_new = df_dep_t[1:]

In [30]:
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [32]:
# Step 3. Add the new header
# Use new_header (the row saved in step 1) as the column names of the new dataframe
df_dep_t_new.columns = new_header

In [34]:
# This is how our new dataframe looks like now
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# 4. Creating a Data Dictionary

In [37]:
# We create the dictionary from our dataframe, after wrangling it to make it suitable to our needs.
# to_dict('index') uses each row label as a key and maps it to a {column name: value} dictionary for that row.
data_dict = df_dep_t_new.to_dict('index')

In [38]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [39]:
# To put it to use, we bring the products dataframe back into view
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [40]:
# To find out which department are these products, we would use the following command
print(data_dict.get('19'))

{'department': 'snacks'}


In [47]:
# Look up the remaining department ids that appear in the products preview above
for dept_id in ('13', '7', '1'):
    print(data_dict.get(dept_id))

{'department': 'pantry'}
{'department': 'beverages'}
{'department': 'frozen'}


# 5. Subsetting

In [48]:
# To create a subset of snacks
df_snacks =  df_prods[df_prods['department_id']==19]

In [53]:
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [51]:
# You could also use this command with the function "loc" 
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]

In [54]:
df_snacks_2.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [55]:
# There is one more way, with the function "isin([])"
df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([19])]

In [56]:
df_snacks_3.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


# 6. Exporting dataframes

In [None]:
# At the end of every notebook, we should export our data to keep it safe and usable in other analyses.
# The command would take the following shape--> df_ords.to_csv(os.path.join(path, '2. Data','2.2. Prepared Data', 'orders_wrangled.csv'))

## --------------------------------------------------------------------------------------------------------------------------------

#                                                           TASK  

## 2. Find and make suitable a variable that doesn't need to be in numeric format 

In [63]:
# Firstly we represent the table - please note name change for next task point has already been applied in this representation
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_from_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [57]:
# User_id is a number to identify the customers. No statistical significance.
df_ords ['user_id'] = df_ords ['user_id'].astype('str')

In [58]:
# Order_number is also a way of identifying the different orders. No statistical significance.
# NOTE(review): the previews above show order_number counting up (1, 2, 3, ...) for the same user; once stored as
# text it sorts and compares alphabetically ('10' comes before '2') - confirm that no later step needs it numeric.
df_ords ['order_number'] = df_ords ['order_number'].astype ('str')

## 3. Change the name of a variable with no intuitive name

In [61]:
# The last column seems to have a name a little bit confusing, I will change it to something more intuitive.
df_ords.rename(columns = {'days_since_prior_order' : 'days_from_last_order'}, inplace = True)

In [62]:
# Now we check the result applied...
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_from_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 4. Find the busiest time for placing orders

In [67]:
# To calculate the average we use the function mean
df_ords['order_hour_of_day'].mean()

13.45201534134074

In [69]:
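# The mean of 13.45 tells us that the average order is placed at some point between 1.25pm and 1.30pm.
# Note that the mean is not the busiest hour: the busiest hour is the most frequent value (the mode),
# which df_ords['order_hour_of_day'].value_counts() would show as its first row.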

## 5. Determine the value of department_id number 4 with a Data Dictionary

In [74]:
# Firstly we represent the dictionary we created before, it will come handy the next exercises
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [85]:
# To find out the name of each department, in this case number 4, we would use the following command
print(data_dict.get('4'))

{'department': 'produce'}


## 6. Create a subset of breakfast items for the sales department

In [81]:
# To create a subset of breakfast items, I checked in the dictionary which number of department id refers to breakfast (14)
df_breakfast = df_prods.loc [df_prods['department_id']==14]

In [82]:
# We show a representation...
df_breakfast.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


## 7. Create a subset with items suitable for dinner parties

In [77]:
# We want to create a dataframe representing alcohol, beverages, deli and meat&seafood
df_dinner_parties = df_prods.loc [df_prods['department_id'].isin([5,7,12,20])]

In [83]:
# Let's have a look at our new dataframe
df_dinner_parties

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


## 8. Track the count of rows of the last dataframe

In [84]:
# The dataframe displayed above was reported to have 7650 rows; rather than reading the figure off the
# display, shape[0] returns the row count directly so it stays correct if the data changes.
df_dinner_parties.shape[0]

## 9. Extract all the information you can from the user 1

In [99]:
# In other words, we have been asked to create a subset for user_id 1. We must put '1' because we changed the data type to string.
df_user_1 = df_ords.loc[df_ords['user_id'].isin(['1'])]

In [100]:
df_user_1

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_from_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [103]:
# I would also show the statistical analysis, as well as some basic info of the dataset
df_user_1.describe()

Unnamed: 0,orders_day_of_week,order_hour_of_day,days_from_last_order
count,11.0,11.0,10.0
mean,2.636364,10.090909,19.0
std,1.286291,3.477198,9.030811
min,1.0,7.0,0.0
25%,1.5,7.5,14.25
50%,3.0,8.0,19.5
75%,4.0,13.0,26.25
max,4.0,16.0,30.0


In [102]:
# The info doesn't tell us much
df_user_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11 entries, 0 to 10
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   order_id              11 non-null     object 
 1   user_id               11 non-null     object 
 2   order_number          11 non-null     object 
 3   orders_day_of_week    11 non-null     int64  
 4   order_hour_of_day     11 non-null     int64  
 5   days_from_last_order  10 non-null     float64
dtypes: float64(1), int64(2), object(3)
memory usage: 616.0+ bytes


## 10. Provide some details about this customer's behaviour

In [105]:
# From the statistical analysis we can draw some findings.
  # The customer has ordered 11 times, always on days 1 to 4 of the week (min=1, max=4), i.e. Monday to Thursday
  # if day 0 is Sunday (the day encoding should be confirmed).
  # He always orders between 7am and 4pm, with the average time being around 10am.
  # He orders every 19 days on average. However, on one occasion he ordered twice on the same day, two hours
  # apart, and after that he went 30 days without ordering at all, his longest gap.

## 12. Export your data frames

In [107]:
# Firstly we export the wrangled orders dataframe to the prepared-data folder.
# NOTE(review): to_csv also writes the row index as an extra first column by default; pass index = False
# if that column is not wanted when the file is imported again.
df_ords.to_csv(os.path.join(path, '2. Data','2.2. Prepared Data', 'orders_wrangled.csv'))

In [108]:
# Then we export the departments dataframe we wrangled.
# Its row labels are the department ids, and to_csv writes them as the first column of the file by default.
df_dep_t_new.to_csv(os.path.join(path, '2. Data', '2.2. Prepared Data', 'departments_wrangled.csv'))