In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_extract
from pyspark.ml.feature import StringIndexer, StandardScaler, VectorAssembler
from pyspark.ml import Pipeline

# Start (or reuse) the Spark session used by this feature-engineering job
spark = (
    SparkSession.builder
    .appName("FeatureEngineering")
    .getOrCreate()
)

# Read the cleaned itineraries dataset; the trailing glob selects every
# entry under the parquet directory
cleaned_data_path = "gs://expedia-flight-prices/Cleaned/itineraries_cleaned.parquet/*"
df = spark.read.parquet(cleaned_data_path)

# Re-encode the boolean flag columns as integers: 1 where the value is True,
# 0 for anything else (False or null both end up as 0 via `otherwise`)
for flag_name in ("isBasicEconomy", "isRefundable"):
    df = df.withColumn(flag_name, when(col(flag_name) == True, 1).otherwise(0))

# Split the ISO-8601-style `travelDuration` string (e.g. "PT2H30M" or
# "P1DT2H30M") into day, hour and minute components.
#
# The hour and minute patterns are anchored on the "T" time designator rather
# than on "PT": when a day component is present ("P1DT2H30M") there is no
# literal "PT" in the string, so a "PT(\d+)H" pattern would miss the hours.
# Anchoring the minute pattern on "T" also keeps it from matching a
# date-side "M" that appears before the "T".
df = df.withColumn("days", regexp_extract(col("travelDuration"), r"P(\d+)D", 1).cast("int"))
df = df.withColumn("hours", regexp_extract(col("travelDuration"), r"T(\d+)H", 1).cast("int"))
df = df.withColumn(
    "minutes",
    regexp_extract(col("travelDuration"), r"T(?:\d+H)?(\d+)M", 1).cast("int"),
)

# A component that is absent from the string comes back as null after the
# int cast; treat every missing component as 0
df = df.fillna({"days": 0, "hours": 0, "minutes": 0})

# Total travel duration in minutes, including any whole-day component
df = df.withColumn(
    "travelDurationMinutes",
    col("days") * 24 * 60 + col("hours") * 60 + col("minutes"),
)

# The per-component columns were only needed to build the total
df = df.drop("days", "hours", "minutes")

# Input columns, grouped by how they are fed into the feature vector
categorical_columns = ["startingAirport", "destinationAirport"]
numeric_columns = ["travelDurationMinutes", "elapsedDays", "totalTravelDistance"]

# Each categorical column gets a companion "<name>_index" column
indexed_columns = [name + "_index" for name in categorical_columns]

# One StringIndexer per categorical column, writing to its "_index" companion
indexers = [
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in zip(categorical_columns, indexed_columns)
]

# Assemble the indexed categoricals followed by the numeric columns into a
# single "features" vector
assembler = VectorAssembler(
    inputCols=indexed_columns + numeric_columns,
    outputCol="features",
)

# Scale the assembled vector (scaler left at its default settings)
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

# Stage order: all indexers, then the assembler, then the scaler
pipeline_stages = indexers + [assembler, scaler]
feature_pipeline = Pipeline(stages=pipeline_stages)

# Fit the pipeline on the full dataset, then apply the fitted model to that
# same dataset
feature_model = feature_pipeline.fit(df)
processed_data = feature_model.transform(df)

# Persist the result under the Trusted folder, replacing any previous output
processed_data.write.mode("overwrite").parquet(
    "gs://expedia-flight-prices/Trusted/itineraries_processed.parquet"
)

print("Feature engineering completed and data saved to the /Trusted folder.")


                                                                                

Feature engineering completed and data saved to the /Trusted folder.
