### Creating spark session

Spark session used only for formatting and saving files – around 5 GB of for-hire-vehicle (FHV) data (2015–2024).

In [None]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *


# Build (or reuse) the Spark session used by every cell below.
# Parenthesised chaining matches the reader cells further down and avoids
# backslash continuations, which break silently on trailing whitespace.
spark = (
    SparkSession.builder
    .appName("fhv")
    # NOTE(review): "spark.sql.pyspark.jvm" does not appear in the Spark
    # configuration reference as far as I can tell — confirm it is needed.
    .config("spark.sql.pyspark.jvm", "false")
    .config("spark.driver.memory", "8g")
    # 4 GiB, written with the documented JVM-style size suffix ("k", "m", "g",
    # "t") so it is consistent with the driver setting above.
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)



Loading each year of FHV data (2018 in the cell below) and checking column names/formats against the sample file

In [None]:
# Raw 2018 FHV files. The glob is restricted to `*.parquet` on purpose: the
# save cell at the end of this notebook writes into
# data/taxi/for_hire_vehicle/2018/formatted, and a bare `2018/*` would also
# match that directory when the notebook is re-run — feeding already formatted
# rows back in as input and overwriting a path that is being read.
# Assumes the raw files directly under 2018/ end in ".parquet" — TODO confirm.
df = (spark.read
        .format("parquet")
        .load("data/taxi/for_hire_vehicle/2018/*.parquet"))

# Reference file whose schema the yearly data is compared against below.
df_sample = (spark.read
        .format("parquet")
        .load("data/taxi/sample/sample_FHV.parquet"))

# With eager evaluation on, a DataFrame that is the last expression of a cell
# is rendered as a table of its first rows instead of a bare schema repr.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [None]:
# Preview the loaded data; rendered as a table because eager evaluation was enabled above.
df

Adjusting column names and formats, and dropping columns that are not business-related

In [None]:
# Normalise column types (presumably to match the sample file's schema, which
# is checked in the next cell) and discard the two columns that are not kept.
df = (
    df
    .withColumn("dispatching_base_num", col("dispatching_base_num").cast("string"))
    .withColumn("PUlocationID", col("PUlocationID").cast("integer"))
    .withColumn("DOlocationID", col("DOlocationID").cast("integer"))
    .drop("SR_Flag", "Affiliated_base_number")
)



In [None]:
def compare_schemas(df_sample, df):
    """Print a human-readable comparison of two DataFrame schemas.

    Args:
        df_sample: Reference DataFrame; only ``df_sample.schema`` is read.
        df: DataFrame checked against the reference; only ``df.schema`` is read.

    Returns:
        None. The report goes to stdout: a single "identical" line when the
        schemas compare equal, otherwise a header followed by one line per
        column that exists on only one side or whose (type, nullable) pair
        differs, plus a line when the shared columns appear in a different
        order. If none of these explains the mismatch, a fallback line says
        so instead of leaving the report empty under the header.
    """
    if df_sample.schema == df.schema:
        print("Schematy są IDENTYCZNE")
        return

    print("RÓŻNICE W SCHEMATACH:")
    print("=" * 60)

    # column name -> (type rendered as text, nullable) for each side
    s1 = {f.name: (str(f.dataType), f.nullable) for f in df_sample.schema.fields}
    s2 = {f.name: (str(f.dataType), f.nullable) for f in df.schema.fields}

    reported = False

    # The loop variable is `name`, not `col`, so pyspark's `col` function is
    # not shadowed inside this function.
    for name in sorted(s1.keys() | s2.keys()):
        if name not in s1:
            print(f" tylko w df → {name:25} {s2[name][0]:20} (nullable={s2[name][1]})")
        elif name not in s2:
            print(f" tylko w df_sample → {name:25} {s1[name][0]:20} (nullable={s1[name][1]})")
        elif s1[name] != s2[name]:
            print(f" RÓŻNICA → {name:25} df_sample: {s1[name][0]:15} (null={s1[name][1]}) | "
                  f"df: {s2[name][0]:15} (null={s2[name][1]})")
        else:
            continue
        reported = True

    # Schema equality is order-sensitive, so two schemas with the same columns
    # in a different order are unequal even though the per-column pass above
    # finds nothing. Compare the relative order of the shared columns.
    sample_order = [f.name for f in df_sample.schema.fields if f.name in s2]
    df_order = [f.name for f in df.schema.fields if f.name in s1]
    if sample_order != df_order:
        print(f" RÓŻNA KOLEJNOŚĆ kolumn → df_sample: {sample_order} | df: {df_order}")
        reported = True

    # Anything else that makes the schemas unequal (e.g. field metadata) is not
    # broken down here; say so rather than printing an empty report.
    if not reported:
        print(" brak różnic w nazwach, typach i nullable – "
              "schematy różnią się czymś innym (np. metadanymi pól)")
                
# Report how the adjusted 2018 data differs from the sample file's schema.
compare_schemas(df_sample, df)

In [None]:
# Show the final column names, types and nullability before saving.
df.printSchema()

Saving formatted data to parquet

In [None]:
# Persist the formatted 2018 data as gzip-compressed Parquet, replacing any
# previous output at the same location.
# The target sits inside the 2018 input folder, so anything that globs that
# folder for raw files must make sure it does not pick up `formatted/`.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .option("parquet.compression", "gzip")
    .save("data/taxi/for_hire_vehicle/2018/formatted")
)

#### For analytics, please check the '05_analytics' notebook