## Working with Data

Load data from NYC taxy dataset and work with it in Spark.

### Загружаем данные

Загружаем данные в папку data

In [1]:
import urllib.request

url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-11.parquet"
local_file = "./data/yellow_tripdata_2025-11.parquet"

urllib.request.urlretrieve(url, local_file)

('./data/yellow_tripdata_2025-11.parquet',
 <http.client.HTTPMessage at 0xffff86c794d0>)

### Импортируем сессию для дальнейшей работы

In [2]:
from pyspark.sql import SparkSession

### Создаем сессию

In [3]:
spark = (
    SparkSession.builder
        .master("local")
        .appName("Word Count")
        .config("spark.some.config.option", "some-value")
        .getOrCreate()
)

### Получаем данные и файла

In [60]:
# Считываем данные из parquet, для примера взял его, т.к. уже содержит схему
df = spark.read.parquet("./data/yellow_tripdata_2025-11.parquet")

# Можно читать из csv, json, text, jdbc
# df.read.csv(path, header=True, inferSchema=True)
# ...

# Также можно считать данные API и обернуть его в DataFrame. То есть Spark не будет подключаться к API, а возьмет результат.

In [61]:
# Displays the content of the DataFrame to stdout
df.show(3)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|cbd_congestion_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+
|       7| 2025-11-01 00:13:25|  2025-11-01 00:13:25|              1|         1.68|         1|                 N|          43|    

In [8]:
# Выведим схему DataFrame
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)
 |-- cbd_congestion_fee: double (nullable = true)



In [10]:
# Описание полей
df.describe()

DataFrame[summary: string, VendorID: string, passenger_count: string, trip_distance: string, RatecodeID: string, store_and_fwd_flag: string, PULocationID: string, DOLocationID: string, payment_type: string, fare_amount: string, extra: string, mta_tax: string, tip_amount: string, tolls_amount: string, improvement_surcharge: string, total_amount: string, congestion_surcharge: string, Airport_fee: string, cbd_congestion_fee: string]

In [14]:
# Удобный просмотр данных таблицы
df.show(1, vertical=True)

-RECORD 0------------------------------------
 VendorID              | 7                   
 tpep_pickup_datetime  | 2025-11-01 00:13:25 
 tpep_dropoff_datetime | 2025-11-01 00:13:25 
 passenger_count       | 1                   
 trip_distance         | 1.68                
 RatecodeID            | 1                   
 store_and_fwd_flag    | N                   
 PULocationID          | 43                  
 DOLocationID          | 186                 
 payment_type          | 1                   
 fare_amount           | 14.9                
 extra                 | 0.0                 
 mta_tax               | 0.5                 
 tip_amount            | 1.5                 
 tolls_amount          | 0.0                 
 improvement_surcharge | 1.0                 
 total_amount          | 22.15               
 congestion_surcharge  | 2.5                 
 Airport_fee           | 0.0                 
 cbd_congestion_fee    | 0.75                
only showing top 1 row



In [32]:
# Вывести все типы поля
for col in df.dtypes:
    print(f"{col[0]}: {col[1]}")

# Либо какой-то определенный
print("\n", dict(df.dtypes)['VendorID'])

VendorID: int
tpep_pickup_datetime: timestamp_ntz
tpep_dropoff_datetime: timestamp_ntz
passenger_count: bigint
trip_distance: double
RatecodeID: bigint
store_and_fwd_flag: string
PULocationID: int
DOLocationID: int
payment_type: bigint
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
Airport_fee: double
cbd_congestion_fee: double

 int


In [54]:
# Можно вывести только отдельный столбец
df.select("VendorID").show(3)

# Или несколько
df.select("VendorID", "tpep_pickup_datetime").show(3)

# Или *
df.select("*").show(1, vertical=True)

+--------+
|VendorID|
+--------+
|       7|
|       2|
|       1|
+--------+
only showing top 3 rows

+--------+--------------------+
|VendorID|tpep_pickup_datetime|
+--------+--------------------+
|       7| 2025-11-01 00:13:25|
|       2| 2025-11-01 00:49:07|
|       1| 2025-11-01 00:07:19|
+--------+--------------------+
only showing top 3 rows

-RECORD 0------------------------------------
 VendorID              | 7                   
 tpep_pickup_datetime  | 2025-11-01 00:13:25 
 tpep_dropoff_datetime | 2025-11-01 00:13:25 
 passenger_count       | 1                   
 trip_distance         | 1.68                
 RatecodeID            | 1                   
 store_and_fwd_flag    | N                   
 PULocationID          | 43                  
 DOLocationID          | 186                 
 payment_type          | 1                   
 fare_amount           | 14.9                
 extra                 | 0.0                 
 mta_tax               | 0.5                 
 tip_

In [40]:
# Преобразования данных
df.select(df["VendorID"], df["total_amount"]/1000).show(5)

+--------+---------------------+
|VendorID|(total_amount / 1000)|
+--------+---------------------+
|       7|              0.02215|
|       2|              0.02494|
|       1|              0.02562|
|       2|              0.08614|
|       1|              0.04865|
+--------+---------------------+
only showing top 5 rows



In [45]:
# Работа с датами и алеасы.

from pyspark.sql.functions import date_format

df.select(
      df["VendorID"]
    , date_format(df["tpep_pickup_datetime"], 'yyyy-MM-dd').alias("pickup_date")
    , date_format(df["tpep_dropoff_datetime"], 'yyyy-MM-dd').alias("dropoff_date")
).show(5)

+--------+-----------+------------+
|VendorID|pickup_date|dropoff_date|
+--------+-----------+------------+
|       7| 2025-11-01|  2025-11-01|
|       2| 2025-11-01|  2025-11-01|
|       1| 2025-11-01|  2025-11-01|
|       2| 2025-11-01|  2025-11-01|
|       1| 2025-11-01|  2025-11-01|
+--------+-----------+------------+
only showing top 5 rows



In [62]:
# Создание новых колонок
from pyspark.sql.functions import expr

df.withColumn(
    "pickup_date", 
    date_format(df["tpep_pickup_datetime"], 'yyyy-MM-dd')
).withColumn(
    "dropoff_date",
    date_format(df["tpep_dropoff_datetime"], 'yyyy-MM-dd')
).withColumn(
    "total_amount_k",
    (expr("total_amount / 1000"))
).show(1, vertical=True)

-RECORD 0------------------------------------
 VendorID              | 7                   
 tpep_pickup_datetime  | 2025-11-01 00:13:25 
 tpep_dropoff_datetime | 2025-11-01 00:13:25 
 passenger_count       | 1                   
 trip_distance         | 1.68                
 RatecodeID            | 1                   
 store_and_fwd_flag    | N                   
 PULocationID          | 43                  
 DOLocationID          | 186                 
 payment_type          | 1                   
 fare_amount           | 14.9                
 extra                 | 0.0                 
 mta_tax               | 0.5                 
 tip_amount            | 1.5                 
 tolls_amount          | 0.0                 
 improvement_surcharge | 1.0                 
 total_amount          | 22.15               
 congestion_surcharge  | 2.5                 
 Airport_fee           | 0.0                 
 cbd_congestion_fee    | 0.75                
 pickup_date           | 2025-11-0