# Setup and Spark Basics

Ing. Jeison Robles Arias

**Goal**: learn transformations vs. actions, schemas, explain plans, partitions, caching, and basic I/O.

In [None]:
import pyspark
from pyspark.sql import SparkSession

# `spark` must be a SparkSession object, not an imported module: the cells
# below call spark.version and spark.createDataFrame on it.
# getOrCreate() returns the session already running in the notebook runtime
# when there is one, and starts a new session otherwise.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Sanity check: display the version reported by `spark` to confirm the
# setup works before running the rest of the notebook.

spark.version

## Import Helpers

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

## Create a Small DataFrame with an Explicit Schema

In [None]:
# Sample rows, one tuple per customer: (customer_id, name, country, amount).
data = [
    (1, "alice", "CR", 120.50),
    (2, "bob", "CR", 75.00),
    (3, "carol", "PA", 210.10),
    (4, "dave", "CR", 10.00),
    (5, "erin", "PA", 99.99),
]

# Explicit schema built field by field; only customer_id is non-nullable.
schema = (
    T.StructType()
    .add("customer_id", T.IntegerType(), nullable=False)
    .add("name", T.StringType(), nullable=True)
    .add("country", T.StringType(), nullable=True)
    .add("amount", T.DoubleType(), nullable=True)
)

# Passing the schema explicitly avoids type inference from the Python values.
df = spark.createDataFrame(data=data, schema=schema)
df


## Transformations vs Actions 

In [None]:
# Keep only the rows for country "CR" and project two columns.
# Both steps are transformations: this cell only defines df_cr.
df_cr = (
    df
    .where(F.col("country") == "CR")
    .select("customer_id", "amount")
)
df_cr

## Trigger an action (runs a Spark Job)

In [None]:
# count() is an action: it makes Spark execute the filter/select behind
# df_cr and returns the number of matching rows.
df_cr.count()

## Common DataFrame Operations

`withColumn` + `select` + `orderBy`

In [None]:
# Derived columns, both computed from the original `amount`:
#   amount_usd - amount rounded to 2 decimal places
#   is_high    - boolean flag, True when amount >= 100.0
rounded_amount = F.round(F.col("amount"), 2).alias("amount_usd")
high_flag = (F.col("amount") >= F.lit(100.0)).alias("is_high")

df2 = (
    df
    .select("customer_id", "country", rounded_amount, high_flag)
    .orderBy(F.col("amount_usd").desc())  # largest amounts first
)
display(df2)