# Feature Engineering

In [1]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium
import sklearn

In [2]:
from pyspark.sql import SparkSession

# Spark settings for this notebook, gathered in one mapping so they can be
# reviewed at a glance. Insertion order matches the order they are applied.
session_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "8g",  # 8GB for the driver
    "spark.executor.memory": "8g",  # 8GB for the executor
}

# Start from a named builder, apply each setting in turn, then create the
# session (or reuse one that already exists).
session_builder = SparkSession.builder.appName("ADS Project1")
for setting_name, setting_value in session_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()


24/08/27 16:16:51 WARN Utils: Your hostname, Hanshis-Laptop.local resolves to a loopback address: 127.0.0.1; using 10.12.200.32 instead (on interface en0)
24/08/27 16:16:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/27 16:16:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/27 16:16:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/08/27 16:16:52 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [11]:
# Load the combined TLC trip records (parquet) into a Spark DataFrame
combined_parquet_path = "../data/raw/tlc_data/combined.parquet"
df = spark.read.parquet(combined_parquet_path)

In [12]:
# Print the schema of the loaded DataFrame as a tree (column name, data type
# and nullability) to confirm which columns are available before any
# features are engineered.
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- trip_duration: double (nullable = true)



In [15]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

# One-hot encode the categorical ID columns of `df`.
#
# Result: `df_final` holds every column of `df` except the three encoded ones,
# plus one `<column>_onehot` vector column per encoded column.
#
# NOTE(review): the recorded output of this cell lists pickup_month,
# pickup_day and pickup_hour, which the schema printed after loading does not
# contain. Presumably `df` was extended by cells that are no longer in the
# notebook -- confirm this cell still yields the expected columns after
# Restart Kernel -> Run All.

# Columns to be one-hot encoded. Per the printed schema these are numeric
# codes (integer/long), not strings; StringIndexer casts numeric input to
# string and indexes the distinct values.
columns_to_encode = ['VendorID', 'RatecodeID', 'payment_type']
index_columns = [column + "_index" for column in columns_to_encode]
onehot_columns = [column + "_onehot" for column in columns_to_encode]

# A single multi-column StringIndexer replaces one estimator per column: it
# yields the same `<column>_index` outputs while the pipeline fits one indexer
# stage instead of three.
# NOTE(review): the input columns are nullable and the default
# handleInvalid="error" raises on NULL values at transform time -- confirm
# NULLs are removed upstream, or set handleInvalid explicitly.
indexers = [StringIndexer(inputCols=columns_to_encode, outputCols=index_columns)]

# A single multi-column OneHotEncoder turns every index column into its
# one-hot vector column (default dropLast=True is kept, as before).
encoders = [OneHotEncoder(inputCols=index_columns, outputCols=onehot_columns)]

# Index first, then encode
pipeline = Pipeline(stages=indexers + encoders)

# Fit the stages on `df` and apply them to the same DataFrame
df_encoded = pipeline.fit(df).transform(df)

# Keep only the one-hot vectors: drop the original and the intermediate
# index columns
columns_to_drop = columns_to_encode + index_columns
df_final = df_encoded.drop(*columns_to_drop)

# Show the resulting DataFrame
df_final.show()

+-------------------+-------------------+---------------+-------------+------------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+------------+----------+-----------+---------------+-----------------+-------------------+
|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|store_and_fwd_flag|PULocationID|DOLocationID|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|ehail_fee|trip_duration|pickup_month|pickup_day|pickup_hour|VendorID_onehot|RatecodeID_onehot|payment_type_onehot|
+-------------------+-------------------+---------------+-------------+------------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+-------------+------------+----------+-----------+---------------+-----------------+----------