In [39]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *



# Build (or reuse) the notebook's Spark session, named "Parking", requesting
# 16g of memory for both the driver and the executors.
spark = (
    SparkSession.builder
    .appName("Parking")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)



In [40]:
# Load the raw parking-violations CSV. The first row supplies the column
# names, and Spark infers each column's type from the data.
df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("data/parking/Parking_Violations_Issued_-_Fiscal_Year_2024_20260114.csv")
)

# With eager evaluation on, a bare DataFrame expression at the end of a cell
# renders its first rows instead of only the object's repr.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [41]:
# Display the loaded DataFrame as the cell output.
df

Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,Street Code2,Street Code3,Vehicle Expiration Date,Violation Location,Violation Precinct,Issuer Precinct,Issuer Code,Issuer Command,Issuer Squad,Violation Time,Time First Observed,Violation County,Violation In Front Of Or Opposite,House Number,Street Name,Intersecting Street,Date First Observed,Law Section,Sub Division,Violation Legal Code,Days Parking In Effect,From Hours In Effect,To Hours In Effect,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
1159637337,KZH2758,NY,PAS,06/09/2023,67,VAN,HONDA,P,0,0,0,20250201,43,43,43,972773,43,0,0911A,,BX,,,I/O TAYLOR AVE,GUERLAIN,0,408,E5,,BBBBBBB,ALL,ALL,BLUE,0,2006,-,0,,,,,
1252960645,JPD8746,NY,PAS,06/30/2023,87,SUBN,LINCO,M,17870,25390,32670,20240210,14,14,968,271057,968,0,0717A,,NY,O,51,E 44TH ST,,0,408,D,,BBBBBBB,ALL,ALL,GRAY,0,2020,-,0,,,,,
1252960669,JPD8746,NY,PAS,06/30/2023,31,SUBN,LINCO,M,17870,25390,32670,20240210,14,14,968,271057,968,0,0823A,,,O,51,E 44TH STREET,,0,408,O3,,BBBBBBB,ALL,ALL,GRAY,0,2020,-,0,,,,,
1252994126,MBH9245,99,PAS,07/06/2023,20,SDN,KIA,M,12690,41700,61090,20231231,108,108,968,272834,968,0,1150P,,,F,39-41,60TH ST,,0,408,D,,BBBBBBB,ALL,ALL,WHITE,0,0,-,0,,,,,
1252994175,MBH9245,PA,PAS,07/08/2023,40,SDN,KIA,M,12690,41700,61090,20231231,108,108,968,272834,968,0,1150P,,Q,F,39-41,60TH ST,,0,408,D,,BBBBBBB,ALL,ALL,WHITE,0,0,-,0,,,,,
1307574919,LBZ7486,NY,PAS,07/01/2023,14,SUBN,HONDA,P,0,0,0,20250518,110,110,169,947343,169,0,0618P,,Q,,,C/O 126 ST,38 AVE,0,408,E2,,BBBBBBB,ALL,ALL,BK,0,2014,-,0,,,,,
1307574944,JRB4166,FL,PAS,06/07/2023,46,SUBN,HONDA,P,8690,21690,21740,20240312,110,110,169,947343,169,0,0246P,,Q,F,126 05,36 AVE,,0,408,C3,,BBBBBBB,ALL,ALL,,0,0,-,0,,,,,
1307575950,LCA7360J,NY,PAS,06/21/2023,46,SUBN,HONDA,P,0,61090,0,20250430,110,110,169,969119,169,0,0620P,,Q,,,STADIUM P N,ROOSEVELT AVE,0,408,F2,,BBBBBBB,ALL,ALL,GRY,0,2015,-,0,,,,,
1307575973,HNE3840,NY,PAS,06/21/2023,14,SDN,MAZDA,P,0,0,0,20250221,110,110,169,969119,169,0,0625P,,Q,,,STADIUM P N,ROOSEVELT,0,408,C,,BBBBBBB,ALL,ALL,WHT,0,2020,-,0,,,,,
1307576886,LAG7093,NY,OMT,06/03/2023,46,SDN,HONDA,P,0,40404,40404,20250129,110,110,110,961387,110,0,0230P,,Q,F,41,SEAVER WAY,,0,408,F7,,BBBBBBB,ALL,ALL,BLUE,0,2006,-,0,,,,,


In [50]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- Summons Number: long (nullable = true)
 |-- Plate ID: string (nullable = true)
 |-- Registration State: string (nullable = true)
 |-- Plate Type: string (nullable = true)
 |-- Issue Date: string (nullable = true)
 |-- Violation Code: integer (nullable = true)
 |-- Vehicle Body Type: string (nullable = true)
 |-- Vehicle Make: string (nullable = true)
 |-- Issuing Agency: string (nullable = true)
 |-- Street Code1: integer (nullable = true)
 |-- Street Code2: integer (nullable = true)
 |-- Street Code3: integer (nullable = true)
 |-- Vehicle Expiration Date: integer (nullable = true)
 |-- Violation Location: integer (nullable = true)
 |-- Violation Precinct: integer (nullable = true)
 |-- Issuer Precinct: integer (nullable = true)
 |-- Issuer Code: integer (nullable = true)
 |-- Issuer Command: string (nullable = true)
 |-- Issuer Squad: string (nullable = true)
 |-- Violation Time: string (nullable = true)
 |-- Time First Observed: string (nullable = true)
 |-- Violation Coun

In [51]:
# "Issue Date" is loaded as an MM/dd/yyyy string; add a parsed date version of
# it under the name "Issue_Date" so it can be compared against date bounds.
df = df.withColumn("Issue_Date", to_date(col("Issue Date"), format="MM/dd/yyyy"))

In [59]:
# Keep only tickets issued between 2024-07-01 and 2024-07-30 (inclusive) that
# have both a plate ID and an issuing agency.
#
# Column names containing spaces must be wrapped in backticks: in Spark SQL a
# single-quoted 'Plate ID' is a string literal, which is never NULL, so such a
# check would pass for every row. Every fragment except the last ends with a
# space so the implicitly concatenated string keeps its keywords apart.
#
# NOTE(review): the upper bound is July 30, so tickets dated 2024-07-31 are
# excluded — confirm this is intended rather than '2024-07-31'.
df_filtered = df.where(
    "`Issue_Date` IS NOT NULL "
    "AND `Issue_Date` BETWEEN '2024-07-01' AND '2024-07-30' "
    "AND `Plate ID` IS NOT NULL "
    "AND `Issuing Agency` IS NOT NULL"
)



In [64]:
# The parsed "Issue_Date" column supersedes the raw string column, so remove
# the original "Issue Date".
df_filtered = df_filtered.drop("Issue Date")

In [78]:
# Display the filtered DataFrame as the cell output.
df_filtered

Summons_Number,Plate_ID,Registration_State,Plate_Type,Violation_Code,Vehicle_Body_Type,Vehicle_Make,Issuing_Agency,Street_Code1,Street_Code2,Street_Code3,Vehicle_Expiration_Date,Violation_Location,Violation_Precinct,Issuer_Precinct,Issuer_Code,Issuer_Command,Issuer_Squad,Violation_Time,Time_First_Observed,Violation_County,Violation_In_Front_Of_Or_Opposite,House_Number,Street_Name,Intersecting_Street,Date_First_Observed,Law_Section,Sub_Division,Violation_Legal_Code,Days_Parking_In_Effect,From_Hours_In_Effect,To_Hours_In_Effect,Vehicle_Color,Unregistered_Vehicle,Vehicle_Year,Meter_Number,Feet_From_Curb,Violation_Post_Code,Violation_Description,No_Standing_or_Stopping_Violation,Hydrant_Violation,Double_Parking_Violation,Issue_Date
1471668757,KMP3837,NY,PAS,40,,HYUND,P,0,0,0,20240111,107,107,107,972034,107,0,1134P,,Q,,,C/O 143 ST,85 RD,0,408,J2,,BBBBBBB,ALL,ALL,GY,0,2006,-,0,,,,,,2024-07-03
1472779290,HKL8836,NY,PAS,46,SUBN,ACURA,P,59430,26230,13830,20240915,66,66,66,962278,66,0,1030P,,K,F,524,MCDONALD AVE,,0,408,E2,,BBBBBBB,ALL,ALL,BLK,0,2017,-,0,,,,,,2024-07-05
1473040954,KVW5972,NY,PAS,14,SUBN,,P,17900,52550,34255,20240419,120,120,120,968927,120,0,1004P,1000P,,,143,BEACH STREET,,0,408,E3,,BBBBBBB,ALL,ALL,,0,0,-,0,,,,,,2024-07-01
1487860833,KYC2021,NY,PAS,20,SUBN,HONDA,P,72320,26850,26880,20240711,42,42,42,961435,42,0,0640P,,BX,F,1701,VYSE AVE,,0,408,F2,,BBBBBBB,ALL,ALL,GRN,0,2007,-,0,,,,,,2024-07-09
1488237244,HXM1945,NY,PAS,46,SUBN,LEXUS,P,59430,26230,13830,20240118,66,66,66,962278,66,0,1017P,,K,F,503,MCDONALD AVE,,0,408,F1,,BBBBBBB,ALL,ALL,B,0,2022,-,0,,,,,,2024-07-05
1488420660,HFV4382,NY,PAS,98,SUBN,JEEP,P,32030,23230,23930,20240612,83,83,83,973084,83,0,0954P,,K,F,984,DECATUR ST,,0,408,F1,,BBBBBBB,ALL,ALL,GRAY,0,2010,-,0,,,,,,2024-07-10
1488846212,KUH4974,NY,PAS,51,SUBN,PORSC,P,44900,45224,39685,20240109,122,122,122,964913,122,0,1035P,,R,F,3185,RICHMOND RD,,0,408,E2,,BBBBBBB,ALL,ALL,GRAY,0,2022,-,0,,,,,,2024-07-03
1491468701,KZH3448,NY,PAS,46,SDN,HONDA,P,8720,74240,74250,20240324,44,44,44,975263,44,0,0740P,0740P,BX,O,1125,ANDERSON AVE,,20 240 708,408,D,,BBBBBBB,ALL,ALL,GRY,0,2022,-,0,,,,,,2024-07-08
1491945825,KXM3857,NY,PAS,40,SUBN,LEXUS,P,28530,18120,47220,20240807,47,47,47,969047,47,0,0245A,,BX,F,655,EAST 230TH ST,,0,408,F1,,BBBBBBB,ALL,ALL,,0,0,-,0,,,,,,2024-07-07
1492271135,HHA8143,NY,PAS,46,SDN,ACURA,P,37610,12010,20790,20240605,34,34,34,974138,34,0,0824P,,NY,F,562,W 193RD,,0,408,E2,,BBBBBBB,ALL,ALL,,0,0,-,0,,,,,,2024-07-08


In [73]:
# List the current column names. They still contain spaces, a "?" and
# trailing whitespace, which the following cells clean up.
print(df_filtered.columns)

['Summons Number', 'Plate ID', 'Registration State', 'Plate Type', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make', 'Issuing Agency', 'Street Code1', 'Street Code2', 'Street Code3', 'Vehicle Expiration Date', 'Violation Location', 'Violation Precinct', 'Issuer Precinct', 'Issuer Code', 'Issuer Command', 'Issuer Squad', 'Violation Time', 'Time First Observed', 'Violation County', 'Violation In Front Of Or Opposite', 'House Number', 'Street Name', 'Intersecting Street', 'Date First Observed', 'Law Section', 'Sub Division', 'Violation Legal Code', 'Days Parking In Effect    ', 'From Hours In Effect', 'To Hours In Effect', 'Vehicle Color', 'Unregistered Vehicle?', 'Vehicle Year', 'Meter Number', 'Feet From Curb', 'Violation Post Code', 'Violation Description', 'No Standing or Stopping Violation', 'Hydrant Violation', 'Double Parking Violation', 'Issue_Date']


In [76]:
def _clean_column_name(name):
    """Return ``name`` trimmed, with spaces turned into "_" and "?" removed.

    Example: ``'Days Parking In Effect    '`` becomes
    ``'Days_Parking_In_Effect'`` and ``'Unregistered Vehicle?'`` becomes
    ``'Unregistered_Vehicle'``.
    """
    # strip() removes the surrounding padding and every remaining space is
    # turned into "_", so no separate multi-space replacement is needed.
    return name.strip().replace(" ", "_").replace("?", "")


# The loop variable is deliberately not called `col`, which is also the name of
# the pyspark.sql.functions helper used elsewhere in this notebook.
cleaned = [_clean_column_name(name) for name in df_filtered.columns]

In [77]:
# Rename every column, by position, to its cleaned name.
df_filtered = df_filtered.toDF(*cleaned)

In [89]:
# Export the filtered rows as a single CSV part file with a header row.
#
# The export gets its own sub-directory so that "overwrite" cannot touch the
# source CSV stored directly in data/parking/, and overwrite (rather than
# append) keeps re-runs of this cell from accumulating duplicate copies of
# the data.
(
    df_filtered.coalesce(1)
    .write
    .option("header", "true")
    .mode("overwrite")
    .csv("data/parking/filtered_2024_07")
)

In [79]:
# Location of the Delta table that the following cells write and read back.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable base directory.
delta_path = "/home/jovyan/work/data/delta/parking_new"

In [80]:
# Persist the filtered data in Delta format at delta_path, replacing whatever
# is already stored there.
# NOTE(review): the session builder in this notebook does not configure Delta
# Lake, so it is presumably provided by the environment — confirm.
(
    df_filtered.write
    .format("delta")
    .mode("overwrite")
    .save(delta_path)
)


In [81]:
# Read the Delta table back from disk and display it as the cell output.
spark.read.load(delta_path, format="delta")

Summons_Number,Plate_ID,Registration_State,Plate_Type,Violation_Code,Vehicle_Body_Type,Vehicle_Make,Issuing_Agency,Street_Code1,Street_Code2,Street_Code3,Vehicle_Expiration_Date,Violation_Location,Violation_Precinct,Issuer_Precinct,Issuer_Code,Issuer_Command,Issuer_Squad,Violation_Time,Time_First_Observed,Violation_County,Violation_In_Front_Of_Or_Opposite,House_Number,Street_Name,Intersecting_Street,Date_First_Observed,Law_Section,Sub_Division,Violation_Legal_Code,Days_Parking_In_Effect,From_Hours_In_Effect,To_Hours_In_Effect,Vehicle_Color,Unregistered_Vehicle,Vehicle_Year,Meter_Number,Feet_From_Curb,Violation_Post_Code,Violation_Description,No_Standing_or_Stopping_Violation,Hydrant_Violation,Double_Parking_Violation,Issue_Date
9137409013,LKR2563,NY,PAS,50,SUBN,NISSA,T,0,0,0,20260527,34,34,34,356959,T103,BB,0547A,,NY,I,S,W 204th St,0ft E/of 10th Ave,0,408,E5,,YYYYYYY,,,GY,,2013,,0,E,50-Crosswalk,,,,2024-07-01
9137409025,5DMJ54,MA,PAS,40,SUBN,FORD,T,0,0,0,20248888,34,34,34,356959,T103,BB,0548A,,NY,I,N,W 204th St,0ft E/of 10th Ave,0,408,E2,,YYYYYYY,,,BLACK,,0,,5,E,40-Fire Hydrant,,,,2024-07-01
9137409037,KAH9680,NY,PAS,20,4DSD,TOYOT,T,37630,10910,13113,20251001,34,34,34,356959,T103,BB,0550A,,NY,O,428,W 204th St,,0,408,D,,YYYYYYY,,,BK,,2006,,0,E,20A-No Parking (N...,,,,2024-07-01
9137409049,BP22944,CT,PAS,20,SUBN,SUBAR,T,37630,10910,13113,20248888,34,34,34,356959,T103,BB,0551A,,NY,O,428,W 204th St,,0,408,D,,YYYYYYY,,,BLACK,,0,,0,E,20A-No Parking (N...,,,,2024-07-01
9137409050,S1939680,GA,PAS,20,SUBN,ME/BE,T,11010,0,0,20240888,34,34,34,356959,T103,BB,0554A,,NY,I,E,10th Ave,80ft S/of W 205th St,0,408,D,,YYYYYYY,,,WHITE,,0,,0,E,20A-No Parking (N...,,,,2024-07-01
9137409062,S1948142,GA,PAS,20,4DSD,HONDA,T,11010,0,0,20240888,34,34,34,356959,T103,BB,0556A,,NY,I,E,10th Ave,60ft S/of W 205th St,0,408,D,,YYYYYYY,,,GREY,,0,,0,E,20A-No Parking (N...,,,,2024-07-01
9137409074,LEX8208,NY,PAS,46,SUBN,JEEP,T,11010,27090,37670,20250907,34,34,34,356959,T103,BB,0558A,,NY,F,3849,10th Ave,,0,408,F1,,YYYYYYY,,,GY,,2019,,0,E,46A-Double Parkin...,,,,2024-07-01
9137409086,LHT6291,NY,PAS,46,4DSD,ACURA,T,11010,27090,37670,20260213,34,34,34,356959,T103,BB,0558A,,NY,F,3849,10th Ave,,0,408,F1,,YYYYYYY,,,RD,,2021,,0,E,46A-Double Parkin...,,,,2024-07-01
9137409098,B63PUK,NJ,PAS,46,SUBN,ROVER,T,11010,27090,37670,88888888,34,34,34,356959,T103,BB,0559A,,NY,F,3849,10th Ave,,0,408,F1,,YYYYYYY,,,RED,,0,,0,E,46A-Double Parkin...,,,,2024-07-01
9137409104,KWR2855,NY,PAS,46,4DSD,ME/BE,T,11010,27090,37670,20260331,34,34,34,356959,T103,BB,0559A,,NY,F,3849,10th Ave,,0,408,F1,,YYYYYYY,,,WH,,2018,,0,E,46A-Double Parkin...,,,,2024-07-01


In [84]:
# Count the rows of the Delta table through Spark SQL, addressing the table
# directly by its storage path. The path is taken from delta_path so the query
# always targets the location the table was written to, instead of repeating
# the literal.
aaa = spark.sql(f"""
  SELECT COUNT(*)
  FROM delta.`{delta_path}`
""")

In [85]:
# Display the single-row result holding the table's row count.
aaa

count(1)
1410237
