# High-level Preprocessing Steps:
    1. Change column names to snake_case (e.g. `VendorID` becomes `vendor_id`)

In [None]:
import os

from pyspark.sql import SparkSession, functions as F

# Build (or reuse) the Spark session that every later cell runs its jobs on.
session_builder = SparkSession.builder.appName("Preprocess Data")
# Eager evaluation setting for the REPL/notebook; a later cell relies on a bare
# DataFrame expression being displayed as a table.
session_builder = session_builder.config("spark.sql.repl.eagerEval.enabled", True)
session_builder = session_builder.config("spark.sql.parquet.cacheMetadata", "true")
spark = session_builder.getOrCreate()

In [23]:
# Load the raw TLC trip records (parquet) into a Spark DataFrame.
sdf = spark.read.format("parquet").load('../data/raw/tlc_data')

In [24]:
# Peek at a single record, one field per line, to see the raw column names and values.
sdf.show(n=1, truncate=100, vertical=True)

-RECORD 0------------------------------------
 VendorID              | 1                   
 tpep_pickup_datetime  | 2022-03-01 11:13:08 
 tpep_dropoff_datetime | 2022-03-01 11:24:35 
 passenger_count       | 1.0                 
 trip_distance         | 2.4                 
 RatecodeID            | 1.0                 
 store_and_fwd_flag    | N                   
 PULocationID          | 90                  
 DOLocationID          | 209                 
 payment_type          | 2                   
 fare_amount           | 10.0                
 extra                 | 3.0                 
 mta_tax               | 0.5                 
 tip_amount            | 0.0                 
 tolls_amount          | 0.0                 
 improvement_surcharge | 0.3                 
 total_amount          | 13.8                
 congestion_surcharge  | 2.5                 
 airport_fee           | 0.0                 
only showing top 1 row



In [29]:
# Rename the TLC identifier columns to snake_case.
field_name_change = dict(
    VendorID="vendor_id",
    RatecodeID="ratecode_id",
    PULocationID="pu_location_id",
    DOLocationID="do_location_id",
)
for original_name in field_name_change:
    sdf = sdf.withColumnRenamed(original_name, field_name_change[original_name])

# Cast these columns to integers (ratecode_id is not in the list and keeps its type).
integer_columns = ('pu_location_id', 'do_location_id', 'vendor_id', 'payment_type')
for column_name in integer_columns:
    as_integer = F.col(column_name).cast('INT')
    sdf = sdf.withColumn(column_name, as_integer)

# Replace the store-and-forward flag with the result of comparing it against 'Y'.
flag_is_yes = F.col("store_and_fwd_flag") == 'Y'
sdf = sdf.withColumn('store_and_fwd_flag', flag_is_yes.cast('BOOLEAN'))

# Print the resulting schema to confirm the new names and types.
sdf.printSchema()

root
 |-- vendor_id: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- ratecode_id: double (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- pu_location_id: integer (nullable = true)
 |-- do_location_id: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [30]:
# Preview the first five rows after the renames and casts; left as a bare
# expression so the notebook displays it as the cell output.
sdf.limit(5)

vendor_id,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecode_id,store_and_fwd_flag,pu_location_id,do_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
1,2022-03-01 11:13:08,2022-03-01 11:24:35,1.0,2.4,1.0,False,90,209,2,10.0,3.0,0.5,0.0,0.0,0.3,13.8,2.5,0.0
1,2022-03-01 11:47:52,2022-03-01 12:00:08,1.0,2.2,1.0,False,148,234,2,10.5,3.0,0.5,0.0,0.0,0.3,14.3,2.5,0.0
2,2022-03-01 11:02:46,2022-03-01 11:46:43,1.0,19.78,2.0,False,132,249,1,52.0,0.0,0.5,11.06,0.0,0.3,67.61,2.5,1.25
2,2022-03-01 11:52:43,2022-03-01 12:03:40,2.0,2.94,1.0,False,211,66,1,11.0,0.5,0.5,4.44,0.0,0.3,19.24,2.5,0.0
2,2022-03-01 11:15:35,2022-03-01 11:34:13,1.0,8.57,1.0,False,138,197,1,25.0,0.5,0.5,5.51,0.0,0.3,33.06,0.0,1.25


In [31]:
# Show the working directory that the relative '../data/...' paths in this
# notebook resolve against. Uses `os` from the notebook's import cell.
os.getcwd()

'/Users/patrick/Documents/GitHub/mast30034-project-1-pl0203/scripts'

In [33]:
# Fraction of rows (not a row count) to draw for the local sample.
SAMPLE_SIZE = 0.01
sample_csv_path = '../data/raw/other_data/sample_data.csv'

# pandas' to_csv does not create missing parent folders, so make sure the
# target directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(sample_csv_path), exist_ok=True)

# A fixed seed is passed to the sampler; toPandas() then collects the sampled
# rows into a pandas DataFrame.
df = sdf.sample(fraction=SAMPLE_SIZE, seed=0).toPandas()
df.to_csv(sample_csv_path, index=False)

                                                                                