# Ex2 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [103]:
import os
print(os.environ.get("JAVA_HOME"))

/usr/lib/jvm/java-21-openjdk-amd64/


In [104]:
import pandas as pd

from pyspark.sql import SparkSession


try:
    spark = SparkSession.builder.appName("TestApp").getOrCreate()
    print("✅ SparkSession creada exitosamente!")
    print(f"Versión de Spark: {spark.version}")
    sc = spark.sparkContext
    print(f"SparkContext disponible: {sc}")
except Exception as e:
    print("❌ Error al crear SparkSession:")
    print(e)


✅ SparkSession creada exitosamente!
Versión de Spark: 3.5.5
SparkContext disponible: <SparkContext master=local[*] appName=TestApp>


### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

### Step 3. Assign it to a variable called chipo.

In [105]:
import requests

# link
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'
local_file = 'chipotle.tsv'

r = requests.get(url)

# Guardar como texto (r.text) en modo texto
with open(local_file, 'w', encoding='utf-8') as f:
    f.write(r.text)


chipote = spark.read.csv(local_file, header=True, inferSchema=True, sep='\t')


### Step 4. See the first 10 entries

In [106]:
primeros_10 = chipote.show(10)
primeros_10

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|       1|       1|Chips and Fresh T...|                NULL|    $2.39 |
|       1|       1|                Izze|        [Clementine]|    $3.39 |
|       1|       1|    Nantucket Nectar|             [Apple]|    $3.39 |
|       1|       1|Chips and Tomatil...|                NULL|    $2.39 |
|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|   $16.98 |
|       3|       1|        Chicken Bowl|[Fresh Tomato Sal...|   $10.98 |
|       3|       1|       Side of Chips|                NULL|    $1.69 |
|       4|       1|       Steak Burrito|[Tomatillo Red Ch...|   $11.75 |
|       4|       1|    Steak Soft Tacos|[Tomatillo Green ...|    $9.25 |
|       5|       1|       Steak Burrito|[Fresh Tomato Sal...|    $9.25 |
+--------+--------+--------------------+-----------

### Step 5. What is the number of observations in the dataset?

In [107]:
# Solution 1

cuenta_observaciones = chipote.count()

print('El total de observaciones es de %s'% cuenta_observaciones)

El total de observaciones es de 4622


### Step 6. What is the number of columns in the dataset?

In [108]:
numero_cols = len(chipote.columns)
print('El total de columnas es de %s'% numero_cols)


El total de columnas es de 5


### Step 7. Print the name of all the columns.

In [109]:
' -- '.join([f"Col {i+1}: {col}" for i, col in enumerate(chipote.columns)])


'Col 1: order_id -- Col 2: quantity -- Col 3: item_name -- Col 4: choice_description -- Col 5: item_price'

### Step 8. How is the dataset indexed?

In [110]:
# Los dataframes de pyspark no se encuentran indexados como en pandas ya que estos se distribuyen en sobre RDDS,

# ESto seria desde el 0 en adelante
from pyspark.sql.functions import monotonically_increasing_id

df_index = chipote.withColumn('indice', monotonically_increasing_id())  

df_index.show(5)

# ESto seria desde el 1 en adelante

from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

df_indexed = chipote.withColumn("index", row_number().over(Window.orderBy("order_id")))
df_indexed.show(5)

+--------+--------+--------------------+--------------------+----------+------+
|order_id|quantity|           item_name|  choice_description|item_price|indice|
+--------+--------+--------------------+--------------------+----------+------+
|       1|       1|Chips and Fresh T...|                NULL|    $2.39 |     0|
|       1|       1|                Izze|        [Clementine]|    $3.39 |     1|
|       1|       1|    Nantucket Nectar|             [Apple]|    $3.39 |     2|
|       1|       1|Chips and Tomatil...|                NULL|    $2.39 |     3|
|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|   $16.98 |     4|
+--------+--------+--------------------+--------------------+----------+------+
only showing top 5 rows

+--------+--------+--------------------+--------------------+----------+-----+
|order_id|quantity|           item_name|  choice_description|item_price|index|
+--------+--------+--------------------+--------------------+----------+-----+
|       1|       1

25/05/26 21:32:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/26 21:32:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/26 21:32:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


### Step 9. Which was the most-ordered item? 

In [111]:
# 1era forma
item_mas_ordenado = chipote.groupBy("item_name").count().orderBy('count', ascending=False).show(1)

# 2da forma
from pyspark.sql.functions import count

item_mas_ordenado_form_2 = chipote.groupBy('item_name') \
       .agg(count('*').alias('cantidad_total_pedida')) \
       .orderBy("cantidad_total_pedida", ascending=False) \
       .show(1)


+------------+-----+
|   item_name|count|
+------------+-----+
|Chicken Bowl|  726|
+------------+-----+
only showing top 1 row

+------------+---------------------+
|   item_name|cantidad_total_pedida|
+------------+---------------------+
|Chicken Bowl|                  726|
+------------+---------------------+
only showing top 1 row



### Step 10. For the most-ordered item, how many items were ordered?

In [112]:
chipote.show()

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|       1|       1|Chips and Fresh T...|                NULL|    $2.39 |
|       1|       1|                Izze|        [Clementine]|    $3.39 |
|       1|       1|    Nantucket Nectar|             [Apple]|    $3.39 |
|       1|       1|Chips and Tomatil...|                NULL|    $2.39 |
|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|   $16.98 |
|       3|       1|        Chicken Bowl|[Fresh Tomato Sal...|   $10.98 |
|       3|       1|       Side of Chips|                NULL|    $1.69 |
|       4|       1|       Steak Burrito|[Tomatillo Red Ch...|   $11.75 |
|       4|       1|    Steak Soft Tacos|[Tomatillo Green ...|    $9.25 |
|       5|       1|       Steak Burrito|[Fresh Tomato Sal...|    $9.25 |
|       5|       1| Chips and Guacamole|           

In [113]:
from pyspark.sql.functions import col, sum, countDistinct

item_mas_ordenado_df = chipote.groupBy("item_name").count().orderBy('count', ascending=False)

item_mas_ordenado = item_mas_ordenado_df.select('item_name').first()['item_name']



chipote.filter(col('item_name') == item_mas_ordenado) \
       .agg(sum("quantity").alias("cantidad_veces_ordenado")) \
       .show()

+-----------------------+
|cantidad_veces_ordenado|
+-----------------------+
|                    761|
+-----------------------+



### Step 11. What was the most ordered item in the choice_description column?

In [114]:
from collections import Counter
from pyspark.sql import functions as F

chipote.select("choice_description") \
       .show(20, truncate=False)

+------------------------------------------------------------------------------------------------------------------+
|choice_description                                                                                                |
+------------------------------------------------------------------------------------------------------------------+
|NULL                                                                                                              |
|[Clementine]                                                                                                      |
|[Apple]                                                                                                           |
|NULL                                                                                                              |
|[Tomatillo-Red Chili Salsa (Hot), [Black Beans, Rice, Cheese, Sour Cream]]                                        |
|[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sour Cream, Guacamol

### Step 12. How many items were orderd in total?

In [115]:
from pyspark.sql.functions import udf, explode, col
from pyspark.sql.types import ArrayType, StringType
import pyspark.sql.functions as F

def parse_bracket_list(s: str):
    if not s or len(s) < 2 or s[0] != "[" or s[-1] != "]":
        # si no está bien bracketed, devolvemos la cadena tal cual (o vacío si era None)
        return [s.strip()] if s else []
    inner = s[1:-1]  # quito corchetes exteriores
    items = []
    buff = ""
    depth = 0
    for ch in inner:
        if ch == "[":
            depth += 1
            buff += ch
        elif ch == "]":
            depth -= 1
            buff += ch
        elif ch == "," and depth == 0:
            # coma separadora a nivel top
            items.append(buff.strip())
            buff = ""
        else:
            buff += ch
    if buff:
        items.append(buff.strip())

    # ahora aplano recursivamente
    flat = []
    for it in items:
        if it.startswith("[") and it.endswith("]"):
            flat.extend(parse_bracket_list(it))
        else:
            flat.append(it)
    return flat

# registramos el UDF
parse_udf = udf(parse_bracket_list, ArrayType(StringType()))

# 1. Creamos columna de arrays parseados
df0 = chipote.withColumn("items", parse_udf(col("choice_description")))

# 2. Explode y contamos
freq_df = (
    df0
    .withColumn("item", explode(col("items")))
    .groupBy("item")
    .agg(F.count("*").alias("count"))
    .orderBy(col("count").desc())
)

freq_df.show(truncate=False)


+------------------------------------+-----+
|item                                |count|
+------------------------------------+-----+
|Rice                                |2389 |
|Cheese                              |2281 |
|Lettuce                             |1742 |
|Sour Cream                          |1711 |
|Black Beans                         |1342 |
|NULL                                |1246 |
|Fresh Tomato Salsa                  |1046 |
|Guacamole                           |1037 |
|Fajita Vegetables                   |722  |
|Pinto Beans                         |582  |
|Roasted Chili Corn Salsa            |457  |
|Fresh Tomato Salsa (Mild)           |351  |
|Tomatillo Red Chili Salsa           |325  |
|Fajita Veggies                      |302  |
|Roasted Chili Corn Salsa (Medium)   |270  |
|Tomatillo-Red Chili Salsa (Hot)     |259  |
|Tomatillo Green Chili Salsa         |230  |
|Diet Coke                           |134  |
|Tomatillo-Green Chili Salsa (Medium)|128  |
|Coke     

### Step 13. Turn the item price into a float

In [141]:
from pyspark.sql.functions import col, regexp_replace


chipote2 = chipote.withColumn('item_price',regexp_replace(
                                                        regexp_replace(col('item_price'), r'\$', ''),
                                                          r'\.',''
                                                          ).cast('float')
                                                        
                            )

#### Step 13.a. Check the item price type

In [142]:
chipote2.select('item_price').printSchema()

root
 |-- item_price: float (nullable = true)



#### Step 13.b. Create a lambda function and change the type of item price

#### Step 13.c. Check the item price type

### Step 14. How much was the revenue for the period in the dataset?

In [155]:
chipote2.select(sum(col('quantity')*col('item_price')).alias("total_Revenue")).show()

+-------------+
|total_Revenue|
+-------------+
|    3923702.0|
+-------------+



In [156]:
chipote2.agg(
    sum(col("quantity") * col("item_price"))
      .alias("total_Revenue")
).show()

+-------------+
|total_Revenue|
+-------------+
|    3923702.0|
+-------------+



### Step 15. How many orders were made in the period?

### Step 16. What is the average revenue amount per order?

In [119]:
# Solution 1



In [120]:
# Solution 2



### Step 17. How many different items are sold?