In [11]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

In [12]:
import logging

# Set the logging level to ERROR
logging.getLogger("py4j").setLevel(logging.ERROR)

In [13]:
from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Preprocess") \
    .config("spark.executor.memory", "18g") \
    .config("spark.driver.memory", "150g") \
    .config("spark.pyspark.python", "/usr/local/bin/python3.9") \
    .config("spark.pyspark.driver.python", "/usr/local/bin/python3.9") \
    .getOrCreate()

# Read CSV into DataFrame
# prices = spark.read.csv("itineraries.csv", header=True, inferSchema=True)

In [14]:
flight_prices = spark.read.csv("/Users/shreyasmac/Documents/Notes/Big Data/Final Project/flight_prices_processed_2/*.csv", header=True, inferSchema=True)

                                                                                

In [15]:
flight_prices.columns

['startingAirport',
 'destinationAirport',
 'isRefundable',
 'isNonStop',
 'totalTravelDistance',
 'totalFare',
 'first_segmentsAirlineName',
 'searchWeek',
 'flightWeek',
 'travelDurationMinutes',
 'segmentsDepartureDaypart',
 'segmentArrivalDaypart']

In [16]:
from pyspark.sql.functions import mean
from pyspark.sql.functions import concat_ws


mean_weather_delay = flight_prices.groupBy('startingAirport', 'destinationAirport') \
                      .agg(mean('travelDurationMinutes').alias('Mean_travelDurationMinutes'))

# Concatenate 'Origin' and 'Dest' columns to create the key for the dictionary
mean_weather_delay = mean_weather_delay.withColumn("origin_dest", concat_ws("_", "startingAirport", "destinationAirport"))

# Convert the DataFrame to a dictionary
weather_delay_dict = dict(mean_weather_delay.rdd.map(lambda row: (row['origin_dest'], row['Mean_travelDurationMinutes'])).collect())

# Show the dictionary
print(weather_delay_dict)




{'JFK_ORD': 449.37553981502367, 'CLT_OAK': 846.5843123760835, 'MIA_DTW': 422.9290186473557, 'BOS_JFK': 139.59747316980818, 'CLT_MIA': 345.53223205145537, 'LAX_DFW': 350.92858345897616, 'DTW_PHL': 324.1143155059759, 'JFK_MIA': 296.6970743028995, 'OAK_CLT': 809.5669953997876, 'ATL_LGA': 268.63975140843763, 'BOS_DEN': 482.0833767930289, 'DEN_CLT': 393.74166266818025, 'ATL_ORD': 311.0551202349693, 'PHL_ATL': 275.19179602689934, 'DFW_ORD': 334.0808699069504, 'DEN_SFO': 332.029096047932, 'IAD_JFK': 423.0301650626594, 'LAX_MIA': 540.5833517107221, 'OAK_PHL': 825.6737556965419, 'PHL_IAD': 400.01414152174493, 'ATL_CLT': 266.3423335839404, 'ORD_OAK': 644.0546525281786, 'DFW_DEN': 255.6790295083012, 'DFW_IAD': 364.3484637266653, 'DTW_DEN': 405.81356115731114, 'CLT_DTW': 341.30855767203207, 'OAK_BOS': 774.998700994239, 'MIA_OAK': 903.0202757151902, 'CLT_IAD': 201.49525582275785, 'IAD_LAX': 541.5754021998669, 'JFK_SFO': 466.62619820127503, 'ORD_BOS': 234.52420656140646, 'OAK_MIA': 875.1978906087525

                                                                                

In [17]:
import json

# Convert the dictionary to JSON format
weather_delay_json = json.dumps(weather_delay_dict)

# Specify the file path where you want to save the JSON file
file_path = "timeDuration.json"  # Replace "/path/to/your/file.json" with your desired file path

# Write the JSON string to the file
with open(file_path, 'w') as f:
    json.dump(weather_delay_dict, f)

In [18]:
# flight_dict = flight_prices.rdd.map(lambda row: (row.Operating_Airline, row.Airline)).collectAsMap()

In [19]:
ohe_cols = ['startingAirport',
             'destinationAirport',             
             'first_segmentsAirlineName',
             'searchWeek',
             'flightWeek',
             'segmentsDepartureDaypart',
             'segmentArrivalDaypart']

In [20]:
ohe_cols

['startingAirport',
 'destinationAirport',
 'first_segmentsAirlineName',
 'searchWeek',
 'flightWeek',
 'segmentsDepartureDaypart',
 'segmentArrivalDaypart']

In [21]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

indexers = [StringIndexer(inputCol=column, outputCol=column+"_index", handleInvalid="keep") for column in ohe_cols]

# Perform OneHotEncoding on the indexed columns
encoders = [OneHotEncoder(inputCol=column+"_index", outputCol=column+"_onehot") for column in ohe_cols]

# Create a pipeline to execute the indexers and encoders sequentially
pipeline = Pipeline(stages=indexers + encoders)

# Fit the pipeline to the data and transform the DataFrame
pipeline_model = pipeline.fit(flight_prices)
encoded_df = pipeline_model.transform(flight_prices)

# Show the encoded DataFrame
encoded_df.limit(10).toPandas().head(10)


24/05/09 18:07:57 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Unnamed: 0,startingAirport,destinationAirport,isRefundable,isNonStop,totalTravelDistance,totalFare,first_segmentsAirlineName,searchWeek,flightWeek,travelDurationMinutes,...,flightWeek_index,segmentsDepartureDaypart_index,segmentArrivalDaypart_index,startingAirport_onehot,destinationAirport_onehot,first_segmentsAirlineName_onehot,searchWeek_onehot,flightWeek_onehot,segmentsDepartureDaypart_onehot,segmentArrivalDaypart_onehot
0,ORD,MIA,0,0,1844,327.2,Delta Air Lines Inc.,16,21,740,...,21.0,0.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0)"
1,ORD,MIA,0,0,1824,328.6,Delta Air Lines Inc.,16,21,388,...,21.0,0.0,3.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0)"
2,ORD,MIA,0,0,1196,328.6,Delta Air Lines Inc.,16,21,395,...,21.0,0.0,2.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0)"
3,ORD,MIA,0,0,1824,328.6,Delta Air Lines Inc.,16,21,405,...,21.0,0.0,2.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0)"
4,ORD,MIA,0,0,1393,328.6,Delta Air Lines Inc.,16,21,408,...,21.0,0.0,2.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0)"
5,ORD,MIA,0,0,1824,328.6,Delta Air Lines Inc.,16,21,543,...,21.0,0.0,3.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0)"
6,ORD,MIA,0,1,1192,338.61,American Airlines Inc.,16,21,186,...,21.0,1.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)"
7,ORD,MIA,0,0,1824,347.61,JetBlue Airways,16,21,382,...,21.0,0.0,2.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0)"
8,ORD,MIA,0,0,1824,347.61,JetBlue Airways,16,21,384,...,21.0,0.0,3.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0)"
9,ORD,MIA,0,0,1824,347.61,JetBlue Airways,16,21,450,...,21.0,0.0,3.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0)"


In [22]:
pipeline_model = pipeline.fit(flight_prices)

                                                                                

In [23]:
encoded_df.limit(10).toPandas().head(10)

Unnamed: 0,startingAirport,destinationAirport,isRefundable,isNonStop,totalTravelDistance,totalFare,first_segmentsAirlineName,searchWeek,flightWeek,travelDurationMinutes,...,flightWeek_index,segmentsDepartureDaypart_index,segmentArrivalDaypart_index,startingAirport_onehot,destinationAirport_onehot,first_segmentsAirlineName_onehot,searchWeek_onehot,flightWeek_onehot,segmentsDepartureDaypart_onehot,segmentArrivalDaypart_onehot
0,ORD,MIA,0,0,1844,327.2,Delta Air Lines Inc.,16,21,740,...,21.0,0.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0)"
1,ORD,MIA,0,0,1824,328.6,Delta Air Lines Inc.,16,21,388,...,21.0,0.0,3.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0)"
2,ORD,MIA,0,0,1196,328.6,Delta Air Lines Inc.,16,21,395,...,21.0,0.0,2.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0)"
3,ORD,MIA,0,0,1824,328.6,Delta Air Lines Inc.,16,21,405,...,21.0,0.0,2.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0)"
4,ORD,MIA,0,0,1393,328.6,Delta Air Lines Inc.,16,21,408,...,21.0,0.0,2.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0)"
5,ORD,MIA,0,0,1824,328.6,Delta Air Lines Inc.,16,21,543,...,21.0,0.0,3.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0)"
6,ORD,MIA,0,1,1192,338.61,American Airlines Inc.,16,21,186,...,21.0,1.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)"
7,ORD,MIA,0,0,1824,347.61,JetBlue Airways,16,21,382,...,21.0,0.0,2.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0)"
8,ORD,MIA,0,0,1824,347.61,JetBlue Airways,16,21,384,...,21.0,0.0,3.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0)"
9,ORD,MIA,0,0,1824,347.61,JetBlue Airways,16,21,450,...,21.0,0.0,3.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0)"


In [24]:
columns = encoded_df.columns

# Reorder the columns putting the 5th column at the end
new_columns = columns[:5] + columns[6:] + [columns[5]]

# Select the columns in the new order
encoded_df = encoded_df.select(*new_columns)

In [25]:
from pyspark.sql.functions import col
encoded_df.filter(col("startingAirport") == "JFK").filter(col("destinationAirport")=="BOS").show(2)

+---------------+------------------+------------+---------+-------------------+-------------------------+----------+----------+---------------------+------------------------+---------------------+---------------------+------------------------+-------------------------------+----------------+----------------+------------------------------+---------------------------+----------------------+-------------------------+--------------------------------+-----------------+-----------------+-------------------------------+----------------------------+---------+
|startingAirport|destinationAirport|isRefundable|isNonStop|totalTravelDistance|first_segmentsAirlineName|searchWeek|flightWeek|travelDurationMinutes|segmentsDepartureDaypart|segmentArrivalDaypart|startingAirport_index|destinationAirport_index|first_segmentsAirlineName_index|searchWeek_index|flightWeek_index|segmentsDepartureDaypart_index|segmentArrivalDaypart_index|startingAirport_onehot|destinationAirport_onehot|first_segmentsAirlineName

In [26]:
encoded_df.columns

['startingAirport',
 'destinationAirport',
 'isRefundable',
 'isNonStop',
 'totalTravelDistance',
 'first_segmentsAirlineName',
 'searchWeek',
 'flightWeek',
 'travelDurationMinutes',
 'segmentsDepartureDaypart',
 'segmentArrivalDaypart',
 'startingAirport_index',
 'destinationAirport_index',
 'first_segmentsAirlineName_index',
 'searchWeek_index',
 'flightWeek_index',
 'segmentsDepartureDaypart_index',
 'segmentArrivalDaypart_index',
 'startingAirport_onehot',
 'destinationAirport_onehot',
 'first_segmentsAirlineName_onehot',
 'searchWeek_onehot',
 'flightWeek_onehot',
 'segmentsDepartureDaypart_onehot',
 'segmentArrivalDaypart_onehot',
 'totalFare']

In [27]:
columns_to_drop = [
    "startingAirport",
    "destinationAirport",
    "first_segmentsAirlineName",
    "searchWeek",
    "flightWeek",
    "segmentsDepartureDaypart",
    "segmentArrivalDaypart",
    "isNonStop",
    "isRefundable"
]

# Drop the specified columns
encoded_df = encoded_df.drop(*columns_to_drop)

In [28]:
train_df, test_df = encoded_df.randomSplit([0.7, 0.3])

In [30]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Combine features into a single vector column
trainAssembler = VectorAssembler(inputCols=train_df.columns[:-1],
                                 outputCol="features")
trainOutput = trainAssembler.transform(train_df)

testAssembler = VectorAssembler(inputCols=test_df.columns[:-1],
                                outputCol="features")
testOutput = testAssembler.transform(test_df)

# Create a LinearRegression model
lr = LinearRegression(featuresCol="features", labelCol="totalFare")

# Train the model
model = lr.fit(trainOutput)

# Make predictions
predictions = model.transform(testOutput)


24/05/09 18:11:23 WARN Instrumentation: [ed7dc871] regParam is zero, which might cause numerical instability and overfitting.
24/05/09 18:12:05 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/09 18:12:05 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/05/09 18:18:29 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
24/05/09 18:18:29 WARN Instrumentation: [ed7dc871] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
                                                                                

In [31]:
from pyspark.sql.functions import abs

# Evaluate the model
mae = predictions.select(abs(predictions["totalFare"] - predictions["prediction"]).alias("error")).agg({"error": "mean"}).collect()[0][0]
print("Mean Absolute Error (MAE) on test data = %g" % mae)



Mean Absolute Error (MAE) on test data = 103.039


                                                                                

In [None]:
from pyspark.sql.functions import abs

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="totalFare", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
mae = predictions.select(abs(predictions["totalFare"] - predictions["prediction"]).alias("error")).agg({"error": "mean"}).collect()[0][0]

print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
print("Mean Absolute Error (MAE) on test data = %g" % mae)

In [None]:
predictions

In [None]:
# Save the pipeline
# pipeline_path = "pipeline_model2"
# pipeline_model.save(pipeline_path)

# Save the trained model
# model_path = "linear_regression_model"
# model.save(model_path)


In [None]:
# model_path = "linear_regression_model"
# model.save(model_path)

# print("Model saved to", model_path)

In [22]:
import random
print(random.uniform(-1, 1))

0.3798136523647071
