In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *


spark = SparkSession.builder.appName("taxi") \
.config("spark.sql.pyspark.jvm", "false") \
.config("spark.driver.memory", "8g") \
.config("spark.executor.memory", "4gb") \
.getOrCreate()



In [196]:
df = (spark.read
        .format("parquet")
        .load("data/taxi/for_hire_vehicle/2018/*"))

spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [197]:
df_sample = (spark.read
        .format("parquet")
        .load("data/taxi/for_hire_vehicle/sample.parquet"))
	
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [111]:
df

dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID
B00008,2024-02-01 00:00:00,2024-02-01 00:17:00,,
B00008,2024-02-01 00:30:00,2024-02-01 01:03:00,,
B00009,2024-02-01 00:00:00,2024-02-01 00:13:00,,
B00009,2024-02-01 00:00:00,2024-02-01 00:41:00,,
B00014,2024-02-01 00:46:26,2024-02-01 01:25:00,,
B00111,2024-02-01 00:26:38,2024-02-01 00:51:00,,
B00112,2024-02-01 00:46:26,2024-02-01 00:57:04,,227.0
B00112,2024-02-01 00:56:08,2024-02-01 01:03:06,,14.0
B00149,2024-02-01 00:42:58,2024-02-01 00:51:19,,62.0
B00149,2024-02-01 00:44:10,2024-02-01 00:55:52,,61.0


In [198]:
df = df \
.withColumn("dispatching_base_num", col("dispatching_base_num").cast("string")) \
.withColumn("PUlocationID", col("PUlocationID").cast("integer")) \
.withColumn("DOlocationID", col("DOlocationID").cast("integer")) \
.drop("SR_Flag") \
.drop("Affiliated_base_number")



In [199]:
def compare_schemas(df_sample, df):
    if df_sample.schema == df.schema:
        print("Schematy są IDENTYCZNE")
    else:
        print("RÓŻNICE W SCHEMATACH:")
        print("=" * 60)
       
        s1 = {f.name: (str(f.dataType), f.nullable) for f in df_sample.schema.fields}
        s2 = {f.name: (str(f.dataType), f.nullable) for f in df.schema.fields}
       
        all_cols = sorted(set(s1.keys()) | set(s2.keys()))
       
        for col in all_cols:
            in_df_sample = col in s1
            in_df = col in s2
           
            if not in_df_sample:
                print(f" tylko w df → {col:25} {s2[col][0]:20} (nullable={s2[col][1]})")
            elif not in_df:
                print(f" tylko w df_sample → {col:25} {s1[col][0]:20} (nullable={s1[col][1]})")
            elif s1[col] != s2[col]:
                print(f" RÓŻNICA → {col:25} df_sample: {s1[col][0]:15} (null={s1[col][1]}) | "
                      f"df: {s2[col][0]:15} (null={s2[col][1]})")
                
compare_schemas(df_sample, df)

Schematy są IDENTYCZNE


In [200]:
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropOff_datetime: timestamp_ntz (nullable = true)
 |-- PUlocationID: integer (nullable = true)
 |-- DOlocationID: integer (nullable = true)



In [201]:
df.groupBy("dispatching_base_num") \
  .count() \
  .orderBy(col("dispatching_base_num").asc()) \
  .show(20)

+--------------------+------+
|dispatching_base_num| count|
+--------------------+------+
|                NULL|    49|
|              B00001| 13336|
|              B00008| 34619|
|              B00009| 41278|
|              B00013| 38642|
|              B00014| 63560|
|              B00021| 19608|
|     B00021         | 29710|
|              B00030|  8795|
|              B00031|  9989|
|              B00037| 88039|
|              B00039| 14599|
|              B00053| 10251|
|              B00054|  7707|
|              B00056| 17432|
|              B00078|  7361|
|              B00095| 53664|
|              B00111|158941|
|              B00112| 54084|
|              B00131| 16130|
+--------------------+------+
only showing top 20 rows



In [202]:
#############ZMIENIĆ ROK!!!############
(df
.write
.mode("overwrite")
.option("parquet.compression", "gzip")
.parquet("data/taxi/for_hire_vehicle/2018/formatted"))
####################################^^^^^^#########################