# 02 - Filtering and Sorting Data



### Step 1. Import the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import re

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). Assign it to a variable called chipo.

This time we are going to pull data directly from the internet.

In [2]:
address = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"

In [128]:
df = pd.read_csv(address, sep = '\t')
df.head(20)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


### Step 3. Name of the most expensive products (item_name)

¿Recuerdas el `map`? En **pandas** tienes el método [`apply`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html) y en para resolver este ejercicio te puede ser muy útil. 

Pero no acaba ahí https://stackoverflow.com/questions/19798153/difference-between-map-applymap-and-apply-methods-in-pandas

In [106]:
df['item_price'].dtype

dtype('O')

In [129]:
# Step 1) Sacar la columna de los precios en formato float

# Como la variable no es tipo float, la transformo primero para poder sacar maximos y minimos
# Para ello saco el patrón con regex
df['item_price_2'] = re.findall('[\d]*[.][\d]*', str(list(df['item_price'].values)))
# La neuva columna la paso a numerico
df['item_price_2'] = pd.to_numeric(df['item_price_2'])
# Esto ya es tipo floar
df['item_price_2'].dtype

dtype('float64')

In [131]:
# Step 2) Calcular el precio de la unidad

df['unit_price'] = df['item_price_2'] / df['quantity']
df.head(20)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,item_price_2,unit_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,2.39,2.39
1,1,1,Izze,[Clementine],$3.39,3.39,3.39
2,1,1,Nantucket Nectar,[Apple],$3.39,3.39,3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39,2.39,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98,16.98,8.49
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98,10.98,10.98
6,3,1,Side of Chips,,$1.69,1.69,1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75,11.75,11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25,9.25,9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25,9.25,9.25


In [132]:
# Saco un nuevo df de la lista de productos agrupados por nombres (porque los hay repetidos) y de esos, que me devuelva el máximo valor entre los repetidos
ordered_items = df.groupby('item_name').agg({'unit_price' : max}).sort_values(by = 'unit_price', ascending = False)
# print(ordered_items)

# Saco el precio máximo de la anterior selección
top_price= ordered_items.apply(max)[0]
#print(top_price)

# Que me devuelva la lista de ordered_items, donde el precio sea igual al precio máximo. De esta manera obtenemos el item con el mayor precio
ordered_items[ordered_items == top_price].dropna()

Unnamed: 0_level_0,unit_price
item_name,Unnamed: 1_level_1
Steak Salad Bowl,11.89
Carnitas Salad Bowl,11.89
Barbacoa Salad Bowl,11.89


### Step 4. How many products cost more than $10.00?

In [133]:
df[df['unit_price'] > 10]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,item_price_2,unit_price
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98,10.98,10.98
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75,11.75,11.75
13,7,1,Chicken Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...",$11.25,11.25,11.25
23,12,1,Chicken Burrito,"[[Tomatillo-Green Chili Salsa (Medium), Tomati...",$10.98,10.98,10.98
39,19,1,Barbacoa Bowl,"[Roasted Chili Corn Salsa, [Fajita Vegetables,...",$11.75,11.75,11.75
...,...,...,...,...,...,...,...
4610,1830,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75,11.75,11.75
4611,1830,1,Veggie Burrito,"[Tomatillo Green Chili Salsa, [Rice, Fajita Ve...",$11.25,11.25,11.25
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75,11.75,11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75,11.75,11.75


### Step 5. What is the price of each item? 
###### print a data frame with only two columns item_name and item_price

In [134]:
df[['item_name', 'unit_price']]

Unnamed: 0,item_name,unit_price
0,Chips and Fresh Tomato Salsa,2.39
1,Izze,3.39
2,Nantucket Nectar,3.39
3,Chips and Tomatillo-Green Chili Salsa,2.39
4,Chicken Bowl,8.49
...,...,...
4617,Steak Burrito,11.75
4618,Steak Burrito,11.75
4619,Chicken Salad Bowl,11.25
4620,Chicken Salad Bowl,8.75


### Step 6. Sort by the name of the item

In [135]:
df.sort_values('item_name')

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,item_price_2,unit_price
3389,1360,2,6 Pack Soft Drink,[Diet Coke],$12.98,12.98,6.49
341,148,1,6 Pack Soft Drink,[Diet Coke],$6.49,6.49,6.49
1849,749,1,6 Pack Soft Drink,[Coke],$6.49,6.49,6.49
1860,754,1,6 Pack Soft Drink,[Diet Coke],$6.49,6.49,6.49
2713,1076,1,6 Pack Soft Drink,[Coke],$6.49,6.49,6.49
...,...,...,...,...,...,...,...
2384,948,1,Veggie Soft Tacos,"[Roasted Chili Corn Salsa, [Fajita Vegetables,...",$8.75,8.75,8.75
781,322,1,Veggie Soft Tacos,"[Fresh Tomato Salsa, [Black Beans, Cheese, Sou...",$8.75,8.75,8.75
2851,1132,1,Veggie Soft Tacos,"[Roasted Chili Corn Salsa (Medium), [Black Bea...",$8.49,8.49,8.49
1699,688,1,Veggie Soft Tacos,"[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...",$11.25,11.25,11.25


### Step 7. What was the quantity of the most expensive item ordered? 2 ways

V1

In [138]:
# To check
# df.sort_values('item_price_2', ascending = False)

df[df['unit_price'] == 11.89].groupby('item_name').agg({'quantity' : sum})

Unnamed: 0_level_0,quantity
item_name,Unnamed: 1_level_1
Barbacoa Salad Bowl,5
Carnitas Salad Bowl,4
Steak Salad Bowl,21


V2

In [127]:
# La verdad, no sé jajajaja

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,item_price_2
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,2.39
25,13,1,Chips and Fresh Tomato Salsa,,$2.39,2.39
55,25,1,Chips and Fresh Tomato Salsa,,$2.39,2.39
89,39,1,Chips and Fresh Tomato Salsa,,$2.95,2.95
183,82,1,Chips and Fresh Tomato Salsa,,$2.95,2.95
...,...,...,...,...,...,...
4231,1689,1,Chips and Fresh Tomato Salsa,,$2.95,2.95
4318,1722,1,Chips and Fresh Tomato Salsa,,$2.95,2.95
4324,1725,1,Chips and Fresh Tomato Salsa,,$2.95,2.95
4425,1764,1,Chips and Fresh Tomato Salsa,,$2.95,2.95


### Step 8. How many times was a Veggie Salad Bowl ordered?

In [144]:
df.groupby('item_name').agg({'quantity' : sum}).loc['Veggie Salad Bowl']

quantity    18
Name: Veggie Salad Bowl, dtype: int64

### Step 9. How many times did someone order more than one Canned Soda?

In [146]:
df[(df['item_name'] == 'Canned Soda') & (df['quantity'] > 1)]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,item_price_2,unit_price
18,9,2,Canned Soda,[Sprite],$2.18,2.18,1.09
51,23,2,Canned Soda,[Mountain Dew],$2.18,2.18,1.09
162,73,2,Canned Soda,[Diet Coke],$2.18,2.18,1.09
171,76,2,Canned Soda,[Diet Dr. Pepper],$2.18,2.18,1.09
350,150,2,Canned Soda,[Diet Coke],$2.18,2.18,1.09
352,151,2,Canned Soda,[Coca Cola],$2.18,2.18,1.09
698,287,2,Canned Soda,[Coca Cola],$2.18,2.18,1.09
700,288,2,Canned Soda,[Coca Cola],$2.18,2.18,1.09
909,376,2,Canned Soda,[Mountain Dew],$2.18,2.18,1.09
1091,450,2,Canned Soda,[Dr. Pepper],$2.18,2.18,1.09
