# Ex1 - Filtering and Sorting Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd
from pyspark.sql import SparkSession


spark = SparkSession.builder\
                    .appName('Chipo')\
                    .getOrCreate()

25/05/30 18:10:36 WARN Utils: Your hostname, kevin-llanos-Type1ProductConfigId resolves to a loopback address: 127.0.1.1; using 192.168.1.92 instead (on interface wlo1)
25/05/30 18:10:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/30 18:10:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

### Step 3. Assign it to a variable called chipo.

In [2]:
import requests

url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'

url_request = requests.get(url)

with open('data.csv', 'w', encoding='UTF-8') as f:
   f.write(url_request.text)


df = pd.read_csv('data.csv', sep='\t')

chipo = spark.read.csv('data.csv',  sep='\t', header=True)

### Step 4. How many products cost more than $10.00?

In [3]:
chipo.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- item_name: string (nullable = true)
 |-- choice_description: string (nullable = true)
 |-- item_price: string (nullable = true)



In [4]:
from pyspark.sql.functions import col, regexp_replace


clean_chipo = chipo.withColumn('item_price', regexp_replace(col("item_price"), "\\$", "").cast('float'))

In [5]:
clean_chipo = clean_chipo.dropDuplicates(subset=['item_name','quantity','choice_description'])

In [6]:
chipo_morethan_10 = clean_chipo.filter(col('item_price') > 10)

In [7]:

chipo_morethan_10.select('item_name').distinct().count()

31

### Step 5. What is the price of each item? 
###### print a data frame with only two columns item_name and item_price

In [8]:
chipo_morethan_10.select('item_name','item_price').distinct().show()

+--------------------+----------+
|           item_name|item_price|
+--------------------+----------+
|     Chicken Burrito|      17.5|
|        Chicken Bowl|     10.98|
|Chicken Crispy Tacos|     16.98|
|      Veggie Burrito|     10.98|
|         Veggie Bowl|     11.25|
|    Barbacoa Burrito|     11.48|
|  Chicken Soft Tacos|     21.96|
|Carnitas Crispy T...|     17.98|
|   Veggie Soft Tacos|     11.25|
|   Veggie Soft Tacos|     16.98|
|       Bottled Water|      10.5|
|          Steak Bowl|     11.08|
|Barbacoa Crispy T...|     11.48|
| Barbacoa Salad Bowl|     11.89|
|       Steak Burrito|     22.96|
|    Carnitas Burrito|     11.08|
|     Chicken Burrito|      35.0|
| Carnitas Soft Tacos|     11.75|
|                Bowl|      22.2|
|    Steak Salad Bowl|     11.89|
+--------------------+----------+
only showing top 20 rows



### Step 6. Sort by the name of the item

In [9]:
chipo_morethan_10.dropDuplicates(['item_name']).select('item_name', 'item_price').orderBy("item_name", ascending=True).show()


+--------------------+----------+
|           item_name|item_price|
+--------------------+----------+
|   6 Pack Soft Drink|     12.98|
|       Barbacoa Bowl|     11.75|
|    Barbacoa Burrito|     11.75|
|Barbacoa Crispy T...|     11.75|
| Barbacoa Salad Bowl|     11.89|
| Barbacoa Soft Tacos|     11.75|
|       Bottled Water|      15.0|
|                Bowl|      22.2|
|       Carnitas Bowl|     11.75|
|    Carnitas Burrito|     11.48|
|Carnitas Crispy T...|     11.75|
| Carnitas Salad Bowl|     11.89|
| Carnitas Soft Tacos|     11.75|
|        Chicken Bowl|     11.25|
|     Chicken Burrito|     10.98|
|Chicken Crispy Tacos|     11.25|
|       Chicken Salad|     10.98|
|  Chicken Salad Bowl|      17.5|
|  Chicken Soft Tacos|     11.25|
|Chips and Fresh T...|     44.25|
+--------------------+----------+
only showing top 20 rows



In [10]:
from pyspark.sql.functions import min
ordenados = chipo_morethan_10.groupBy("item_name").agg(min('item_price').alias('item_price')).orderBy('item_name', ascending=True)
ordenados.show()

+--------------------+----------+
|           item_name|item_price|
+--------------------+----------+
|   6 Pack Soft Drink|     12.98|
|       Barbacoa Bowl|     11.48|
|    Barbacoa Burrito|     11.08|
|Barbacoa Crispy T...|     11.48|
| Barbacoa Salad Bowl|     11.89|
| Barbacoa Soft Tacos|     11.48|
|       Bottled Water|      10.5|
|                Bowl|      22.2|
|       Carnitas Bowl|     11.08|
|    Carnitas Burrito|     11.08|
|Carnitas Crispy T...|     11.75|
| Carnitas Salad Bowl|     11.89|
| Carnitas Soft Tacos|     11.75|
|        Chicken Bowl|     10.58|
|     Chicken Burrito|     10.58|
|Chicken Crispy Tacos|     10.98|
|       Chicken Salad|     10.98|
|  Chicken Salad Bowl|     11.25|
|  Chicken Soft Tacos|     10.98|
|Chips and Fresh T...|      11.8|
+--------------------+----------+
only showing top 20 rows



### Step 7. What was the quantity of the most expensive item ordered?

In [28]:
precio_maximo = clean_chipo.agg(
    max(col('item_price')).alias('precio_maximo')
).collect()[0][0]

In [31]:
from pyspark.sql.functions import max
clean_chipo.filter(col('item_price') == precio_maximo).select('quantity').show()

+--------+
|quantity|
+--------+
|      15|
+--------+



### Step 8. How many times was a Veggie Salad Bowl ordered?

In [34]:
clean_chipo.filter(col('item_name') == 'Veggie Salad Bowl').count()

16

### Step 9. How many times did someone order more than one Canned Soda?