# Getting Started with PySpark

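This notebook demonstrates how to connect to a Spark master running locally at `spark://localhost:7077` (it must already be running) and perform basic DataFrame operations: creating a DataFrame, filtering rows, selecting columns, and grouped aggregation.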

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a session named "GettingStarted" against the master at
# spark://localhost:7077. Parentheses keep the builder chain readable without
# fragile backslash continuations.
spark = (
    SparkSession.builder
    .appName("GettingStarted")
    .master("spark://localhost:7077")
    .getOrCreate()
)

# Quick sanity check that the session is up.
print(f"Spark version: {spark.version}")

## Create a DataFrame

In [None]:
# Sample employee records, one (name, department, salary) tuple per row.
data = [
    ("Alice", "Engineering", 75000),
    ("Bob", "Engineering", 80000),
    ("Charlie", "Sales", 65000),
    ("Diana", "Sales", 70000),
    ("Eve", "Marketing", 60000),
]

# Column names are assigned positionally to the tuple fields.
df = spark.createDataFrame(
    data,
    schema=["name", "department", "salary"],
)
df.show()

## Basic Operations

In [None]:
# Keep only the rows whose salary is strictly above 65000.
df.where(df["salary"] > 65000).show()

In [None]:
# Project the DataFrame down to the name and salary columns.
df.select(df["name"], df["salary"]).show()

In [None]:
# Average salary per department.
(
    df
    .groupBy("department")
    .avg("salary")
    .show()
)

## Cleanup

In [None]:
# Stop the SparkSession created in the first code cell. Run this cell last:
# the cells above all use `spark` or the `df` built from it.
spark.stop()