# Ex2 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [None]:
from pyspark.sql import SparkSession

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=27608007106809dcf6b2af2cc7fa15a3b8a519b2ef950aa1d4dacecabfea7307
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
# Create the Spark session, or reuse one that already exists in this kernel.
# "local[*]" runs Spark on this machine only.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv).

### Step 3. Assign it to a variable called chipo.

In [None]:
!wget https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv

--2024-04-04 16:24:09--  https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 364975 (356K) [text/plain]
Saving to: ‘chipotle.tsv’


2024-04-04 16:24:10 (10.3 MB/s) - ‘chipotle.tsv’ saved [364975/364975]



In [None]:
# Sanity check: confirm chipotle.tsv landed in the working directory.
!ls

chipotle.tsv  sample_data


In [None]:
# Tab-separated file with a header row; every column is read as a string.
# The file marks missing choice descriptions with the literal text "NULL"; map it
# to a real null so isNull()/isNotNull() filters behave as expected.
chipo = spark.read.csv("chipotle.tsv", sep='\t', header=True, nullValue='NULL')

### Step 4. See the first 10 entries

In [None]:
# Print the first ten rows as a text table.
chipo.show(n=10)

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|       1|       1|Chips and Fresh T...|                NULL|    $2.39 |
|       1|       1|                Izze|        [Clementine]|    $3.39 |
|       1|       1|    Nantucket Nectar|             [Apple]|    $3.39 |
|       1|       1|Chips and Tomatil...|                NULL|    $2.39 |
|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|   $16.98 |
|       3|       1|        Chicken Bowl|[Fresh Tomato Sal...|   $10.98 |
|       3|       1|       Side of Chips|                NULL|    $1.69 |
|       4|       1|       Steak Burrito|[Tomatillo Red Ch...|   $11.75 |
|       4|       1|    Steak Soft Tacos|[Tomatillo Green ...|    $9.25 |
|       5|       1|       Steak Burrito|[Fresh Tomato Sal...|    $9.25 |
+--------+--------+--------------------+-----------

### Step 5. What is the number of observations in the dataset?

In [None]:
# Solution 1: count() is an action that returns the number of rows.
chipo.count()


4622

In [None]:
# Solution 2
# describe() only returns another DataFrame; show() is needed to render it.
# Its 'count' row gives the number of non-null values in each column.
chipo.describe().show()


DataFrame[summary: string, order_id: string, quantity: string, item_name: string, choice_description: string, item_price: string]

### Step 6. What is the number of columns in the dataset?

In [None]:
# chipo.columns is a plain Python list of column names, so len() gives the column count.
len(chipo.columns)

5

### Step 7. Print the name of all the columns.

In [None]:
# Column names, in file order.
chipo.columns

['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']

### Step 8. How is the dataset indexed?

Unlike a pandas DataFrame, a Spark DataFrame has no row index, so accessing `.index` on it raises:

AttributeError: 'DataFrame' object has no attribute 'index'

### Step 9. Which was the most-ordered item?

In [None]:
from pyspark.sql.functions import count, col, sum

In [None]:
from pyspark.sql.types import IntegerType, FloatType

In [None]:
# Rank items by the number of order lines they appear in.
# GroupedData.count() names its result column 'count' (lower case); use that exact
# name so the sort does not depend on case-insensitive column resolution.
(
    chipo
    .groupBy('item_name')
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+--------------------+-----+
|           item_name|count|
+--------------------+-----+
|        Chicken Bowl|  726|
|     Chicken Burrito|  553|
| Chips and Guacamole|  479|
|       Steak Burrito|  368|
|   Canned Soft Drink|  301|
|               Chips|  211|
|          Steak Bowl|  211|
|       Bottled Water|  162|
|  Chicken Soft Tacos|  115|
|  Chicken Salad Bowl|  110|
|Chips and Fresh T...|  110|
|         Canned Soda|  104|
|       Side of Chips|  101|
|      Veggie Burrito|   95|
|    Barbacoa Burrito|   91|
|         Veggie Bowl|   85|
|       Carnitas Bowl|   68|
|       Barbacoa Bowl|   66|
|    Carnitas Burrito|   59|
|    Steak Soft Tacos|   55|
+--------------------+-----+
only showing top 20 rows



### Step 10. For the most-ordered item, how many items were ordered?

In [None]:
# Sum the quantities instead of counting order lines, so a line with quantity 2
# contributes two items. quantity is still a string column at this point, so it is
# cast before summing. `sum` here is pyspark.sql.functions.sum.
(
    chipo
    .groupBy('item_name')
    .agg(sum(col('quantity').cast('int')).alias('orders'))
    .orderBy('orders', ascending=False)
    .show(1)
)



In [None]:
# Replace the string quantity column with an integer version of itself.
chipo = chipo.withColumn(
    'quantity',
    chipo['quantity'].cast(IntegerType()),
)

In [None]:
# Total quantity ordered across all 'Chicken Bowl' lines.
(
    chipo
    .where(col('item_name') == 'Chicken Bowl')
    .agg({'quantity': 'sum'})
    .show()
)

+-------------+
|sum(quantity)|
+-------------+
|          761|
+-------------+



### Step 11. What was the most ordered item in the choice_description column?

In [None]:
# A missing description can be a real null or the literal text 'NULL' (which is how
# the TSV spells it), so exclude both; otherwise the placeholder tops the ranking.
# Sum quantity explicitly rather than every numeric column.
(
    chipo
    .filter(col('choice_description').isNotNull() & (col('choice_description') != 'NULL'))
    .groupBy('choice_description')
    .sum('quantity')
    .orderBy('sum(quantity)', ascending=False)
    .show(10, truncate=False)
)

+---------------------------------------------------------------------------------+-------------+
|choice_description                                                               |sum(quantity)|
+---------------------------------------------------------------------------------+-------------+
|NULL                                                                             |1382         |
|[Diet Coke]                                                                      |159          |
|[Coke]                                                                           |143          |
|[Sprite]                                                                         |89           |
|[Fresh Tomato Salsa, [Rice, Black Beans, Cheese, Sour Cream, Lettuce]]           |49           |
|[Fresh Tomato Salsa, [Rice, Black Beans, Cheese, Sour Cream]]                    |42           |
|[Fresh Tomato Salsa, [Rice, Black Beans, Cheese, Sour Cream, Guacamole, Lettuce]]|40           |
|[Lemonade]         

### Step 12. How many items were ordered in total?

In [None]:
# Grand total of the quantity column over the whole dataset.
chipo.agg({'quantity': 'sum'}).show()

+-------------+
|sum(quantity)|
+-------------+
|         4972|
+-------------+



### Step 13. Turn the item price into a float

#### Step 13.a. Check the item price type

In [None]:
# List (column name, type name) pairs to see the current type of item_price.
chipo.dtypes

[('order_id', 'string'),
 ('quantity', 'int'),
 ('item_name', 'string'),
 ('choice_description', 'string'),
 ('item_price', 'string')]

In [None]:
# Look at one row to see the raw item_price format before converting it.
chipo.show(n=1)

+--------+--------+--------------------+------------------+----------+
|order_id|quantity|           item_name|choice_description|item_price|
+--------+--------+--------------------+------------------+----------+
|       1|       1|Chips and Fresh T...|              NULL|    $2.39 |
+--------+--------+--------------------+------------------+----------+
only showing top 1 row



#### Step 13.b. Create a lambda function and change the type of item price

In [None]:
# Plain-Python converter for one price string such as "$2.39 ".
# FloatType is a Spark schema type, not a parser, so calling it with a value raises
# TypeError; the built-in float() does the conversion. Stripping whitespace and the
# leading '$' by content (not by position) also handles values with no trailing space.
to_float = lambda x: float(x.strip().lstrip('$'))


In [None]:
from pyspark.sql.functions import udf, substring, length,max, expr, mean


In [None]:
# Abandoned UDF attempt, left commented out; the conversion is done with expr(...)
# in a later cell instead.
# NOTE(review): substr and StringType are not imported in the visible cells, so this
# line would fail if uncommented.
#udf_substr_float = udf(substr(), StringType())

In [None]:
# Preview ten rows; item_price still carries the '$' prefix here.
chipo.show(n=10)

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|       1|       1|Chips and Fresh T...|                NULL|    $2.39 |
|       1|       1|                Izze|        [Clementine]|    $3.39 |
|       1|       1|    Nantucket Nectar|             [Apple]|    $3.39 |
|       1|       1|Chips and Tomatil...|                NULL|    $2.39 |
|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|   $16.98 |
|       3|       1|        Chicken Bowl|[Fresh Tomato Sal...|   $10.98 |
|       3|       1|       Side of Chips|                NULL|    $1.69 |
|       4|       1|       Steak Burrito|[Tomatillo Red Ch...|   $11.75 |
|       4|       1|    Steak Soft Tacos|[Tomatillo Green ...|    $9.25 |
|       5|       1|       Steak Burrito|[Fresh Tomato Sal...|    $9.25 |
+--------+--------+--------------------+-----------

In [None]:
# Remove the '$' and padding spaces by pattern rather than by position, then cast.
# Casting to string first keeps the cell idempotent: re-running a positional
# substring on the already-converted float column would chop off the first digit,
# whereas this pattern finds nothing to remove on a second run.
chipo = chipo.withColumn(
    'item_price',
    expr("regexp_replace(cast(item_price AS string), '[$ ]', '')").cast(FloatType()),
)

In [None]:
# Inspect the schema to confirm item_price is now a FloatType field.
chipo.schema

StructType([StructField('order_id', StringType(), True), StructField('quantity', StringType(), True), StructField('item_name', StringType(), True), StructField('choice_description', StringType(), True), StructField('item_price', FloatType(), True)])

Column<'length(item_price)'>

#### Step 13.c. Check the item price type

In [None]:
# Step 13.c check: item_price should be listed as FloatType.
chipo.schema

StructType([StructField('order_id', StringType(), True), StructField('quantity', StringType(), True), StructField('item_name', StringType(), True), StructField('choice_description', StringType(), True), StructField('item_price', FloatType(), True)])

### Step 14. How much was the revenue for the period in the dataset?

In [None]:
# Revenue = sum over all order lines of price x quantity, fetched as a Python number.
# `sum` here is pyspark.sql.functions.sum.
revenue = (
    chipo
    .agg(sum(col('item_price') * col('quantity')))
    .first()[0]
)
print(revenue)

39237.01973223686


### Step 15. How many orders were made in the period?

In [None]:
# Number of distinct order_id values.
chipo.select('order_id').distinct().count()

1834

### Step 16. What is the average revenue amount per order?

In [None]:
# Solution 1
# Add a per-line revenue column: quantity times unit price.
chipo = chipo.withColumn(
    "revenue",
    col("quantity") * col("item_price"),
)


In [None]:
# Per-order totals: add up the line revenues that share an order_id.
order_grouped = (
    chipo
    .groupBy("order_id")
    .agg(sum(col("revenue")).alias("total_revenue"))
)

# Average of those per-order totals, pulled back to the driver as a Python number.
mean_revenue = order_grouped.agg(mean(col("total_revenue"))).first()[0]

print(f"Mean revenue per order: {mean_revenue:.2f}")

Mean revenue per order: 21.39


In [None]:
# Solution 2
# Same computation as one chain: per-order revenue first, then its mean.
(
    chipo
    .groupBy('order_id')
    .agg(sum(col('quantity') * col('item_price')).alias('revenue'))
    .agg(mean('revenue').alias('mean_rev'))
    .show()
)


+-----------------+
|         mean_rev|
+-----------------+
|21.39423104265914|
+-----------------+



### Step 17. How many different items are sold?

In [None]:
# Number of distinct item_name values.
chipo.select('item_name').distinct().count()

50