### Информация о датасете

https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

In [11]:
# Импортируем либы
from pyspark.sql import SparkSession

import pyspark.sql.functions as F

In [12]:
# Create the Spark session used by every cell below

spark = (
    SparkSession.builder
        .master("local")  # run Spark locally instead of connecting to a cluster
        .appName("Basics-operations")  # application name attached to this session
        .getOrCreate()  # reuse an existing session if there is one, otherwise create it
)

In [13]:
# Read the Parquet dataset located at data/nyc_taxi/ into a DataFrame

df = spark.read.parquet("data/nyc_taxi/")

In [74]:
# Summary information about the DataFrame columns

# The describe() function in Spark is used to compute summary statistics for numerical and string columns in a DataFrame.
# It provides a quick way to understand the distribution of data, including count, mean, standard deviation, minimum, and maximum values.
# This is particularly useful for exploratory data analysis (EDA) and data profiling.

df.describe().show()

+-------+------------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-------------------+---------------------+------------------+--------------------+-------------------+--------------------+------------------+------------------+
|summary|          VendorID|   passenger_count|    trip_distance|        RatecodeID|store_and_fwd_flag|     PULocationID|     DOLocationID|      payment_type|       fare_amount|            extra|           mta_tax|        tip_amount|       tolls_amount|improvement_surcharge|      total_amount|congestion_surcharge|        airport_fee|            filename|              year|             month|
+-------+------------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+---------------

In [14]:
# Schema information: column names, data types and nullability

df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- filename: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



### SELECT — выбор колонок

In [18]:
# SQL equivalent: SELECT "VendorID", "trip_distance" FROM df;
# Project two columns by name and print the first 5 rows.

df.select("VendorID", "trip_distance").show(5)

+--------+-------------+
|VendorID|trip_distance|
+--------+-------------+
|       1|          1.2|
|       1|          0.4|
|       1|          1.2|
|       1|          1.1|
|       1|          0.6|
+--------+-------------+
only showing top 5 rows



In [24]:
# SQL equivalent: SELECT * FROM df;
# vertical=True prints one "column | value" line per field, which is easier to read for a wide row.

df.select("*").show(1, vertical=True)

-RECORD 0-------------------------------------
 VendorID              | 1                    
 tpep_pickup_datetime  | 2020-01-01 00:28:15  
 tpep_dropoff_datetime | 2020-01-01 00:33:03  
 passenger_count       | 1.0                  
 trip_distance         | 1.2                  
 RatecodeID            | 1.0                  
 store_and_fwd_flag    | N                    
 PULocationID          | 238                  
 DOLocationID          | 239                  
 payment_type          | 1                    
 fare_amount           | 6.0                  
 extra                 | 3.0                  
 mta_tax               | 0.5                  
 tip_amount            | 1.47                 
 tolls_amount          | 0.0                  
 improvement_surcharge | 0.3                  
 total_amount          | 11.27                
 congestion_surcharge  | 2.5                  
 airport_fee           | NULL                 
 filename              | file:///home/jovy... 
 year        

In [37]:
# Computed columns with F.col
# SQL equivalent:
#     SELECT
#         VendorID,
#         ROUND(total_amount / 1000, 2) AS total_amount
#     FROM trips

(
    df
    .select(
        F.col("VendorID"),
        # Divide by 1000, round to 2 decimals and keep the original column name
        F.round(F.col("total_amount") / 1000, 2).alias("total_amount"),
    )
    .show(5)
)

+--------+------------+
|VendorID|total_amount|
+--------+------------+
|       1|        0.01|
|       1|        0.01|
|       1|        0.01|
|       1|        0.01|
|       1|        0.01|
+--------+------------+
only showing top 5 rows



In [47]:
# DISTINCT, CASE (when / otherwise), ORDER BY and alias

(
    df
    .select("payment_type")
    .distinct()
    .select(
        # Translate each numeric payment_type code into a readable label;
        # any code that is not listed falls through to 'Unknown'.
        F.when(F.col("payment_type") == 0, "Flex Fare trip")
        .when(F.col("payment_type") == 1, "Credit card")
        .when(F.col("payment_type") == 2, "Cash")
        .when(F.col("payment_type") == 3, "No charge")
        .when(F.col("payment_type") == 4, "Dispute")
        .when(F.col("payment_type") == 5, "Unknown")
        .when(F.col("payment_type") == 6, "Voided trip")
        .otherwise("Unknown")
        .alias("payment_method")
    )
    # One sort on the final column is enough: an earlier sort on payment_type
    # would be discarded by this one and only add an extra shuffle.
    .orderBy("payment_method")
    .show()
)

+--------------+
|payment_method|
+--------------+
|          Cash|
|   Credit card|
|       Dispute|
|Flex Fare trip|
|     No charge|
|       Unknown|
+--------------+



In [49]:
# Spark SQL
# Register the DataFrame as a temporary view named "trips" so it can be queried with SQL text.

df.createOrReplaceTempView("trips")

# Same projection as the F.col / F.round example, expressed as a SQL query against the view.
spark.sql("""
    SELECT
        VendorID,
        ROUND(total_amount / 1000, 2) AS total_amount
    FROM trips
""").show(5)

+--------+------------+
|VendorID|total_amount|
+--------+------------+
|       1|        0.01|
|       1|        0.01|
|       1|        0.01|
|       1|        0.01|
|       1|        0.01|
+--------+------------+
only showing top 5 rows



In [52]:
# The "trips" view can keep being queried from other cells
# ORDER BY 1 sorts by the first column of the SELECT list (payment_type).

spark.sql("""
    SELECT DISTINCT 
        payment_type
    FROM trips
    ORDER BY 1
"""
).show()

+------------+
|payment_type|
+------------+
|           0|
|           1|
|           2|
|           3|
|           4|
|           5|
+------------+



In [77]:
# lit - a constant (literal) column

# The lit() function in Spark is used to create a new column with a constant or literal value.
# It is part of the pyspark.sql.functions module and is particularly useful when you need to add a column with a fixed value to a DataFrame.
# This function is often used in combination with other transformations, such as withColumn().

# No alias is given, so the literal column is named after its value ("Literal"), as the output shows.
df.select("VendorID", F.lit('Literal')).distinct().show()

+--------+-------+
|VendorID|Literal|
+--------+-------+
|       1|Literal|
|       5|Literal|
|       2|Literal|
|       6|Literal|
+--------+-------+



### SELECT EXPR — использование SQL-выражений

In [61]:
# Listing columns: each argument of selectExpr is a SQL expression string,
# so plain column names and computed expressions with aliases can be mixed.

df.selectExpr("VendorID", "ROUND(total_amount / 1000, 2) AS total_amount").show(5)

+--------+------------+
|VendorID|total_amount|
+--------+------------+
|       1|        0.01|
|       1|        0.01|
|       1|        0.01|
|       1|        0.01|
|       1|        0.01|
+--------+------------+
only showing top 5 rows



In [69]:
# CASE expression inside selectExpr: map each distinct VendorID to a vendor name.
# IDs without a matching WHEN branch are labelled 'Unknown'.

(
    df
    .select("VendorID")
    .distinct()
    .selectExpr("VendorID",
    """
        CASE 
            WHEN VendorID = 1 THEN 'Creative Mobile Technologies, LLC'
            WHEN VendorID = 2 THEN 'Curb Mobility, LLC'
            WHEN VendorID = 6 THEN 'Myle Technologies Inc'
            WHEN VendorID = 7 THEN 'Helix'
            ELSE 'Unknown' 
        END AS text
    """)
    .orderBy("VendorID")
    .show(truncate=False)  # truncate=False prints the full vendor name
)

+--------+---------------------------------+
|VendorID|text                             |
+--------+---------------------------------+
|1       |Creative Mobile Technologies, LLC|
|2       |Curb Mobility, LLC               |
|5       |Unknown                          |
|6       |Myle Technologies Inc            |
+--------+---------------------------------+



In [73]:
# Aggregations: with only aggregate expressions in selectExpr,
# the whole DataFrame collapses into a single summary row.

(
    df
    .selectExpr(
        "AVG(total_amount) AS avg_total_amount",
        "MAX(total_amount) AS max_total_amount",
        "MIN(total_amount) AS min_total_amount",
    )
    .show()
)

+-----------------+----------------+----------------+
| avg_total_amount|max_total_amount|min_total_amount|
+-----------------+----------------+----------------+
|20.10627028678565|       1000003.8|         -2567.8|
+-----------------+----------------+----------------+



### COL

The col() function in Spark is used to reference a column in a DataFrame. 

It is part of the pyspark.sql.functions module and is commonly used in DataFrame transformations, such as filtering, sorting, and aggregations. 

The col() function allows you to refer to columns dynamically and is particularly useful when working with complex expressions or when column names are stored in variables.

In [84]:
# Referencing a Column in a Filter Operation
# Keep only the rows whose VendorID equals 1 and show the first one vertically.

df.filter(F.col("VendorID") == 1).show(1, vertical=True)

-RECORD 0-------------------------------------
 VendorID              | 1                    
 tpep_pickup_datetime  | 2020-01-01 00:28:15  
 tpep_dropoff_datetime | 2020-01-01 00:33:03  
 passenger_count       | 1.0                  
 trip_distance         | 1.2                  
 RatecodeID            | 1.0                  
 store_and_fwd_flag    | N                    
 PULocationID          | 238                  
 DOLocationID          | 239                  
 payment_type          | 1                    
 fare_amount           | 6.0                  
 extra                 | 3.0                  
 mta_tax               | 0.5                  
 tip_amount            | 1.47                 
 tolls_amount          | 0.0                  
 improvement_surcharge | 0.3                  
 total_amount          | 11.27                
 congestion_surcharge  | 2.5                  
 airport_fee           | NULL                 
 filename              | file:///home/jovy... 
 year        

In [91]:
# Keep only the rows whose total_amount is greater than 30 and show the first one vertically.
df.filter(F.col("total_amount") > 30).show(1, vertical=True)

-RECORD 0-------------------------------------
 VendorID              | 2                    
 tpep_pickup_datetime  | 2020-01-28 19:19:36  
 tpep_dropoff_datetime | 2020-01-28 19:50:40  
 passenger_count       | 1.0                  
 trip_distance         | 17.59                
 RatecodeID            | 2.0                  
 store_and_fwd_flag    | N                    
 PULocationID          | 132                  
 DOLocationID          | 215                  
 payment_type          | 1                    
 fare_amount           | 52.0                 
 extra                 | 4.5                  
 mta_tax               | 0.5                  
 tip_amount            | 12.68                
 tolls_amount          | 6.12                 
 improvement_surcharge | 0.3                  
 total_amount          | 76.1                 
 congestion_surcharge  | 0.0                  
 airport_fee           | NULL                 
 filename              | file:///home/jovy... 
 year        

In [95]:
# Referencing a Column in a Select Operation
# alias() renames total_amount to total_amnt in the result only; df itself is unchanged.

df.select(F.col("VendorID"), F.col("total_amount").alias('total_amnt')).show(5)

+--------+----------+
|VendorID|total_amnt|
+--------+----------+
|       1|     11.27|
|       1|      8.74|
|       1|      12.3|
|       1|     12.25|
|       1|      10.8|
+--------+----------+
only showing top 5 rows



In [101]:
# Using col() in Aggregations
# Sum total_amount per payment_type and list the groups in payment_type order.

(
    df
    .groupBy(F.col("payment_type"))
    .agg(
        F.sum(F.col("total_amount")).alias("total_amount")
    )
    .orderBy("payment_type")
    .show()
)

+------------+--------------------+
|payment_type|        total_amount|
+------------+--------------------+
|           0| 1.092547517000018E8|
|           1|1.3924048912388363E9|
|           2|3.3062123700528276E8|
|           3|   8246323.020001265|
|           4| -118468.51000007371|
|           5|   359.2700000000001|
+------------+--------------------+



In [108]:
# Using col() with Conditional Logic
# Bucket every trip by its total_amount: below 30 -> "Low", everything else -> "High".
# The derived column is called "amount_group" because it classifies amounts
# (the previous name, "AgeGroup", described something this dataset does not contain).
# Note: a NULL total_amount does not satisfy the condition, so it is labelled "High" too.

(
    df
    .withColumn(
        "amount_group",
        F.when(F.col("total_amount") < 30, "Low").otherwise("High"),
    )
    .show(1, vertical=True)
)

-RECORD 0-------------------------------------
 VendorID              | 1                    
 tpep_pickup_datetime  | 2020-01-01 00:28:15  
 tpep_dropoff_datetime | 2020-01-01 00:33:03  
 passenger_count       | 1.0                  
 trip_distance         | 1.2                  
 RatecodeID            | 1.0                  
 store_and_fwd_flag    | N                    
 PULocationID          | 238                  
 DOLocationID          | 239                  
 payment_type          | 1                    
 fare_amount           | 6.0                  
 extra                 | 3.0                  
 mta_tax               | 0.5                  
 tip_amount            | 1.47                 
 tolls_amount          | 0.0                  
 improvement_surcharge | 0.3                  
 total_amount          | 11.27                
 congestion_surcharge  | 2.5                  
 airport_fee           | NULL                 
 filename              | file:///home/jovy... 
 year        

In [111]:
# Using col() with Mathematical Operations
# NOTE(review): 85 looks like a hard-coded USD-to-RUB exchange rate — confirm.

df.select(
    "VendorID",
    "total_amount",
    (F.col("total_amount") * 85).alias("amount_rub"),
).show(5)

+--------+------------+-----------------+
|VendorID|total_amount|       amount_rub|
+--------+------------+-----------------+
|       1|       11.27|957.9499999999999|
|       1|        8.74|            742.9|
|       1|        12.3|           1045.5|
|       1|       12.25|          1041.25|
|       1|        10.8|918.0000000000001|
+--------+------------+-----------------+
only showing top 5 rows



### COLUMNS

The columns attribute in Spark is used to retrieve the list of column names in a DataFrame. 

It provides a quick and easy way to inspect the structure of the DataFrame and access the names of all columns. 

This is particularly useful for debugging, data exploration, and dynamic column access.

In [113]:
# List of column names, in DataFrame order
df.columns

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'airport_fee',
 'filename',
 'year',
 'month']

In [114]:
# Access columns dynamically
# Walk the schema fields directly: each field carries both its name and its data type.

for field in df.schema.fields:
    print(f"Column: {field.name}, Data Type: {field.dataType}")

Column: VendorID, Data Type: LongType()
Column: tpep_pickup_datetime, Data Type: TimestampNTZType()
Column: tpep_dropoff_datetime, Data Type: TimestampNTZType()
Column: passenger_count, Data Type: DoubleType()
Column: trip_distance, Data Type: DoubleType()
Column: RatecodeID, Data Type: DoubleType()
Column: store_and_fwd_flag, Data Type: StringType()
Column: PULocationID, Data Type: LongType()
Column: DOLocationID, Data Type: LongType()
Column: payment_type, Data Type: LongType()
Column: fare_amount, Data Type: DoubleType()
Column: extra, Data Type: DoubleType()
Column: mta_tax, Data Type: DoubleType()
Column: tip_amount, Data Type: DoubleType()
Column: tolls_amount, Data Type: DoubleType()
Column: improvement_surcharge, Data Type: DoubleType()
Column: total_amount, Data Type: DoubleType()
Column: congestion_surcharge, Data Type: DoubleType()
Column: airport_fee, Data Type: DoubleType()
Column: filename, Data Type: StringType()
Column: year, Data Type: IntegerType()
Column: month, Data

### DTYPES

The dtypes attribute in Spark is used to retrieve the schema of a DataFrame in the form of a list of tuples.

Each tuple contains the column name and its corresponding data type. 

This is particularly useful for inspecting the structure of the data and understanding the data types of each column.

In [115]:
# List of (column name, type string) tuples
df.dtypes

[('VendorID', 'bigint'),
 ('tpep_pickup_datetime', 'timestamp_ntz'),
 ('tpep_dropoff_datetime', 'timestamp_ntz'),
 ('passenger_count', 'double'),
 ('trip_distance', 'double'),
 ('RatecodeID', 'double'),
 ('store_and_fwd_flag', 'string'),
 ('PULocationID', 'bigint'),
 ('DOLocationID', 'bigint'),
 ('payment_type', 'bigint'),
 ('fare_amount', 'double'),
 ('extra', 'double'),
 ('mta_tax', 'double'),
 ('tip_amount', 'double'),
 ('tolls_amount', 'double'),
 ('improvement_surcharge', 'double'),
 ('total_amount', 'double'),
 ('congestion_surcharge', 'double'),
 ('airport_fee', 'double'),
 ('filename', 'string'),
 ('year', 'int'),
 ('month', 'int')]

### printSchema

The printSchema() function in Spark is used to display the schema of a DataFrame or Dataset. 

It provides a tree-like structure that shows the column names, data types, and whether the columns are nullable. 

This is particularly useful for understanding the structure of the data and debugging schema-related issues.

In [116]:
# Print the schema as a tree: name, type and nullability of every column
df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- filename: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



### EXPLAIN

The explain() function in Spark is used to display the execution plan of a DataFrame or Dataset operation. 

It provides detailed information about how Spark will execute a query, including the logical and physical plans. 

This is particularly useful for debugging, optimizing performance, and understanding the underlying execution process.

In [117]:
# mode="extended" prints the logical plans (parsed, analyzed, ...) in addition to the physical plan
df.explain(mode="extended")

== Parsed Logical Plan ==
Relation [VendorID#44L,tpep_pickup_datetime#45,tpep_dropoff_datetime#46,passenger_count#47,trip_distance#48,RatecodeID#49,store_and_fwd_flag#50,PULocationID#51L,DOLocationID#52L,payment_type#53L,fare_amount#54,extra#55,mta_tax#56,tip_amount#57,tolls_amount#58,improvement_surcharge#59,total_amount#60,congestion_surcharge#61,airport_fee#62,filename#63,year#64,month#65] parquet

== Analyzed Logical Plan ==
VendorID: bigint, tpep_pickup_datetime: timestamp_ntz, tpep_dropoff_datetime: timestamp_ntz, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double, filename: string, year: int, month: int
Relation [VendorID#44L,tpep_pickup_datetime#45,tpep_dropoff_datetime#46,p

In [120]:
# Same plan dump with a filter added, to see where the Filter node appears in each plan
df.filter("total_amount > 100").explain(mode="extended")

== Parsed Logical Plan ==
'Filter ('total_amount > 100)
+- Relation [VendorID#44L,tpep_pickup_datetime#45,tpep_dropoff_datetime#46,passenger_count#47,trip_distance#48,RatecodeID#49,store_and_fwd_flag#50,PULocationID#51L,DOLocationID#52L,payment_type#53L,fare_amount#54,extra#55,mta_tax#56,tip_amount#57,tolls_amount#58,improvement_surcharge#59,total_amount#60,congestion_surcharge#61,airport_fee#62,filename#63,year#64,month#65] parquet

== Analyzed Logical Plan ==
VendorID: bigint, tpep_pickup_datetime: timestamp_ntz, tpep_dropoff_datetime: timestamp_ntz, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double, filename: string, year: int, month: int
Filter (total_amount#60 > cast(100 as do

### DROP

The drop() command in Spark is used to remove one or more columns from a DataFrame. 

This is particularly useful when you need to clean up your dataset by removing unnecessary or redundant columns.

In [121]:
# Select two columns, then drop one of them again: only VendorID remains
df.select("VendorID", "total_amount").drop("total_amount").show(5)

+--------+
|VendorID|
+--------+
|       1|
|       1|
|       1|
|       1|
|       1|
+--------+
only showing top 5 rows



In [128]:
# Dropping columns using a list

columns_to_drop = ["VendorID", "total_amount", "tpep_pickup_datetime", "tpep_dropoff_datetime", "filename", "store_and_fwd_flag"]
# drop() takes column names as separate arguments, so the list is unpacked with *
df_dropped = df.drop(*columns_to_drop)
df_dropped.show(1, vertical=True)

-RECORD 0---------------------
 passenger_count       | 1.0  
 trip_distance         | 1.2  
 RatecodeID            | 1.0  
 PULocationID          | 238  
 DOLocationID          | 239  
 payment_type          | 1    
 fare_amount           | 6.0  
 extra                 | 3.0  
 mta_tax               | 0.5  
 tip_amount            | 1.47 
 tolls_amount          | 0.0  
 improvement_surcharge | 0.3  
 congestion_surcharge  | 2.5  
 airport_fee           | NULL 
 year                  | 2020 
 month                 | 1    
only showing top 1 row



### Spark: dropDuplicates function

The dropDuplicates() command in Spark is used to remove duplicate rows from a DataFrame. 

It is similar to the distinct() command but provides more flexibility by allowing you to specify a subset of columns to consider when identifying duplicates. 

This is particularly useful when you want to remove duplicates based on specific columns rather than the entire row.

In [136]:
# Restrict to January 2020, then remove rows that are duplicated across all columns
# (dropDuplicates() without arguments compares entire rows).
(
    df
    .filter((F.col("year") == 2020) & (F.col("month") == 1))
    .dropDuplicates()
    .show(1, vertical=True)
)

-RECORD 0-------------------------------------
 VendorID              | 2                    
 tpep_pickup_datetime  | 2020-01-01 11:23:55  
 tpep_dropoff_datetime | 2020-01-01 11:52:55  
 passenger_count       | 1.0                  
 trip_distance         | 13.87                
 RatecodeID            | 3.0                  
 store_and_fwd_flag    | N                    
 PULocationID          | 13                   
 DOLocationID          | 1                    
 payment_type          | 2                    
 fare_amount           | 57.0                 
 extra                 | 0.0                  
 mta_tax               | 0.0                  
 tip_amount            | 0.0                  
 tolls_amount          | 10.5                 
 improvement_surcharge | 0.3                  
 total_amount          | 67.8                 
 congestion_surcharge  | 0.0                  
 airport_fee           | NULL                 
 filename              | file:///home/jovy... 
 year        

### Spark: filter or where function

The filter() or where() command in Spark is used to filter rows from a DataFrame based on a specified condition. 

Both filter() and where() are interchangeable and can be used to achieve the same result. 

The primary purpose of these commands is to select a subset of rows that meet a given condition.

In [137]:
# WHERE
# Filter first, then project: year and month are not part of the selected output,
# so filtering before the select avoids depending on Spark resolving columns
# that the projection has already dropped.

(
    df
    .filter("year = 2020 and month = 1")
    .select("VendorID")
    .distinct()
    .show()
)

+--------+
|VendorID|
+--------+
|       5|
|       1|
|       2|
+--------+



In [59]:
# Distinct vendors for 2020, using a Column expression as the condition.
# The filter runs before the select because year is not one of the projected
# columns; filtering first avoids depending on Spark resolving a dropped column.
(
    df
    .filter(F.col("year") == 2020)
    .select("VendorID")
    .distinct()
    .show()
)

+--------+
|VendorID|
+--------+
|       5|
|       1|
|       2|
|       6|
+--------+

