In [2]:
import os
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DoubleType, TimestampType
from pyspark.sql.functions import col, to_date, concat, lit
os.environ["SPARK_HOME"] = "/home/mate/.local/lib/python3.10/site-packages/pyspark/"
os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = "notebook"


In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local") \
    .appName("My Spark Application") \
    .config("spark.executor.memory", "6g") \
    .getOrCreate()


24/05/08 11:19:35 WARN Utils: Your hostname, ces-shrd-1 resolves to a loopback address: 127.0.1.1; using 192.168.1.25 instead (on interface wlp0s20f3)
24/05/08 11:19:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/08 11:19:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/08 11:19:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/05/08 11:19:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
from pyspark.sql.functions import col, dayofweek,to_date, month, count, avg
from pyspark.sql import Window
from pyspark.sql.functions import row_number,   sum, when


# Load the CSV file into a DataFrame
csv_file_path_flightdelay = "./full_data_flightdelay.csv"  # Replace with the path to your CSV file


df_flightdelay = spark.read.option("delimiter", ",").option("header", "true").csv(csv_file_path_flightdelay)


# Read the CSV file using the manually defined schema
csv_file_path_weather = "./airport_weather_2019.csv"  # Replace with your file path
df_weather = spark.read.option("delimiter", ",").option("header", "true").csv(csv_file_path_weather)




In [5]:
df_weather.select('DATE').distinct().count()

730

In [6]:
# ----------------------------
# Data cleanup and preparation
# ----------------------------

In [7]:
from pyspark.sql.functions import coalesce

# create new column for month and day_of_week values derived from date
df_day_column = df_weather.withColumn("DATE_NEW", to_date(col("DATE"), "M/d/yyyy"))
df_day_column = df_day_column.withColumn("DATE_NEW", coalesce(df_day_column["DATE_NEW"], to_date(df_day_column["DATE"], 'yyyy-MM-dd')))

df_day_column = df_day_column.withColumn("DAY_OF_WEEK", dayofweek(col("DATE_NEW").alias("DAY_OF_WEEK")))
df_day_column = df_day_column.withColumn("MONTH", month(col("DATE_NEW").alias("MONTH")))

df_day_column.createOrReplaceTempView("table1")
df_select = spark.sql("SELECT STATION, NAME,DAY_OF_WEEK,DATE, MONTH, AWND, PRCP, SNOW, SNWD, TAVG, TMAX, TMIN, WDF2 from table1")
#df_select.show(n=5)

grouped_df = df_select.groupBy("MONTH", "NAME").agg(
    avg("AWND").alias("AWND"),
    avg("PRCP").alias("PRCP"),
    avg("SNOW").alias("SNOW"),
    avg("SNWD").alias("SNWD"),
    avg("TAVG").alias("TAVG"),
    avg("TMAX").alias("TMAX"),
    avg("TMIN").alias("TMIN"),
    avg("WDF2").alias("WDF2")
).orderBy("NAME","MONTH")


#grouped_df.show(n=20)

24/05/08 11:19:40 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [8]:
from pyspark.sql.functions import lower, split, col, lit, monotonically_increasing_id


# Normalize joining columns
grouped_df = grouped_df.withColumn("normalized_name", lower(col("name")))
df_flightdelay = df_flightdelay.withColumn("normalized_name", lower(split(col("departing_airport"), " ").getItem(0)))

# Group by to investigate
grouped_df_nn = grouped_df.groupBy("normalized_name").agg(
    count('*').alias('count')
)

grouped_df_name = grouped_df.groupBy("NAME").agg(
    count('*').alias('count')
)

In [9]:
# For 'grouped_df', transforming 'NAME' to lowercase and dropping duplicates based on the 'name' column
grouped_df_lower = grouped_df.select(lower(col("NAME")).alias("name")).dropDuplicates(['name'])

# For 'df_flightdelay', transforming 'DEPARTING_AIRPORT' to lowercase, casting it to string, and dropping duplicates based on the 'departing_airport' column
df_flightdelay_lower = df_flightdelay.select(lower(col("DEPARTING_AIRPORT")).alias("departing_airport")).dropDuplicates(['departing_airport'])



#join providing table that contain in the name column all distinct airports 
#from weather dataset and under departing_flight all distinc airports from delay dataset

result_df = df_flightdelay_lower.alias("flight").join(
    grouped_df_lower.alias("grouped"),
    (col("grouped.name").contains(col("flight.departing_airport"))),
    "inner"
).select(
    col("flight.departing_airport").alias("departing_airport"),
    col("grouped.name").alias("name")
)

#result_df.show(n=2)




In [10]:
#modify dataframe such that df_result will contain the airports matched on join 
#and enhanced results will contain the df of unmatched airports for each dataset

# Identifying non-matched entries
non_matched_flight = df_flightdelay_lower.alias("flight").join(
    result_df.alias("result"),
    result_df.departing_airport == df_flightdelay_lower.departing_airport,
    "left_anti"
)

non_matched_grouped = grouped_df_lower.alias("grouped").join(
    result_df.alias("result"),
    result_df.name == grouped_df_lower.name,
    "left_anti"
)


# Add a unique ID to each DataFrame to facilitate the outer join
result_df = result_df.withColumn("id", monotonically_increasing_id())
non_matched_flight = non_matched_flight.withColumn("id", monotonically_increasing_id())
non_matched_grouped = non_matched_grouped.withColumn("id", monotonically_increasing_id())


# Perform the outer joins using the unique IDs, result_df is now composed of matched airports
enhanced_result_df = result_df.join(non_matched_flight, "id", "outer" ).join(non_matched_grouped, "id", "outer" )
enhanced_result_df = enhanced_result_df.drop("id")

# Show the enhanced DataFrame with additional columns
#enhanced_result_df.printSchema()

# Select columns, get rid of duplicates
selected_columns = [col for col in enhanced_result_df.columns if col != 'name' and col != 'departing_airport'] + ['grouped.name'] + ['flight.departing_airport']

#will contain unmatched airports for each dataset
enhanced_result_df = enhanced_result_df.select(selected_columns)
enhanced_result_df.drop('name','departing_airport')
#enhanced_result_df.show(n=2)


DataFrame[]

In [11]:
# create dataframe that contains airports matched and unmatched result from the join
# Rename columns in result_df
result_df = result_df.withColumnRenamed("name", "weather_matched") \
                     .withColumnRenamed("departing_airport", "delay_matched")

# Rename columns in enhanced_result_df
enhanced_result_df = enhanced_result_df.withColumnRenamed("name", "weather_unmatched") \
                                       .withColumnRenamed("departing_airport", "delay_unmatched")

# Optional: If you need to ensure the rows are matched by order, add an index column to each DataFrame
result_df = result_df.withColumn("index", monotonically_increasing_id())
enhanced_result_df = enhanced_result_df.withColumn("index", monotonically_increasing_id())

# Join DataFrames on the index column
matched_and_unmatched_airports = result_df.join(
    enhanced_result_df,
    on="index",
    how="outer"  # Use "outer" to include all rows from both DataFrames
)

# Drop the index column as it's no longer needed after joining
matched_and_unmatched_airports = matched_and_unmatched_airports.drop("index", 'name', 'departing_airport')


In [12]:
#count the amount of airports for each clumn in the enhanced dataset dataframe 

#non_null_name_count = result_df.filter(col("name").isNotNull()).count()
#non_null_name_count1 = result_df.filter(col("departing_airport").isNotNull()).count()
#non_null_name_count2 = enhanced_result_df.filter(col("weather_unmatched").isNotNull()).count()
#non_null_name_count3 = enhanced_result_df.filter(col("delay_unmatched").isNotNull()).count()
#print("Number of non-null strings in the 'name' column:", non_null_name_count, non_null_name_count1,non_null_name_count2, non_null_name_count3)

# Display the filtered DataFrame and print the counts



In [13]:
import pandas as pd

# Initialize a list to store the parsed data
data = []

# Open the text file and parse it line by line
with open('./airports.txt', 'r') as file:
    for line in file:
        # Split the line by comma to extract the needed parts
        parts = line.split(',')
        
        # Check if the line has enough parts to avoid index errors
        if len(parts) >= 4:
            # Extract and clean the desired parts
            # Remove quotation marks and extra spaces if present
            name = parts[1].strip('"').strip()
            city = parts[2].strip('"').strip()
            country = parts[3].strip('"').strip()
            
            # Combine the first two parts into one column, and keep the country as the second column
            combined = f"{name}, {city}, {country}"
            data.append(combined)

# Create a DataFrame from the list
df_airports = pd.DataFrame(data, columns=['Airport and City'])


In [14]:
#!pip install fuzzywuzzy
#!pip install python-Levenshtein

In [15]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Get all airport names into one table
df_delay_unique_airport = df_flightdelay.select('DEPARTING_AIRPORT').distinct().withColumnRenamed("DEPARTING_AIRPORT", "airport")
df_weather_unique_airport = df_weather.select('NAME').distinct().withColumnRenamed("NAME", "airport")

df_union = df_delay_unique_airport.union(df_weather_unique_airport)

def filter_out_useless_parts_of_string(airport_name):
    useless_words = ["international", "airport", "regional"]

    modified = airport_name.lower()
    for word in useless_words:
        modified = modified.replace(word,"")
        
    return modified
    
filter_string_udf = udf(filter_out_useless_parts_of_string, StringType())

df_union = df_union.select(filter_string_udf(col('airport'))).withColumnRenamed("filter_out_useless_parts_of_string(airport)", "airport")
#df_union.show(n=5)



In [16]:
from fuzzywuzzy import process, fuzz

def get_matches(df1, col1, df2, col2, threshold=40):
    # Convert each column to a list for processing, ensuring to drop NA values
    list1 = df1[col1].dropna().tolist()
    list2 = df2[col2].dropna().tolist()

    # Find best matches with a score above the threshold
    matches = []
    for item in list1:
        # Use process.extractOne to find the best match for each item from list1 in list2
        best_match = process.extractOne(item, list2, scorer=fuzz.token_set_ratio)
        if best_match and best_match[1] >= threshold:
            matches.append((item, best_match[0], best_match[1]))

    # Return matches as a DataFrame for better visualization
    return pd.DataFrame(matches, columns=[col1, col2 + '_match', 'Score'])

In [17]:
# Add official airports using fuzzywuzzy
# Assuming 'matched_and_unmatched_airports' is your PySpark DataFrame
pandas_df = df_union.toPandas()  # Convert to Pandas DataFrame

# Example usage (ensure df1 and df2 are already defined and loaded with your data)
df_matches_airports = get_matches(pandas_df, 'airport', df_airports, 'Airport and City')

spark_df_airports = spark.createDataFrame(df_matches_airports)
# Show the DataFrame to verify conversion


                                                                                

In [18]:
joining_table = spark_df_airports.select("airport", "Airport and City_match")
#joining_table.show(n=2)
delay_table = df_flightdelay.withColumn("DEPARTING_AIRPORT", filter_string_udf('DEPARTING_AIRPORT'))
#delay_table.show(n=2)
weather_table = df_day_column.withColumn("NAME", filter_string_udf('NAME'))
#weather_table.show(n=2)

#previous_arprt_table = df_flightdelay.withColumn("PREVIOUS_AIRPORT", filter_string_udf('PREVIOUS_AIRPORT'))
#previous_arprt_joined = previous_arprt_table.join(joining_table, joining_table.airport == previous_arprt_table.PREVIOUS_AIRPORT, 'inner')

delay_joined = delay_table.join(joining_table, joining_table.airport == delay_table.DEPARTING_AIRPORT, 'inner')
weather_joined = weather_table.join(joining_table, joining_table.airport == weather_table.NAME, 'inner')

In [19]:



#delay_joined.show(n=3)
#weather_joined.show(n=3)

# Convert integer columns in df1 to strings
weather_joined = weather_joined.withColumn("MONTH", col("MONTH").cast("string")) \
         .withColumn("DAY_OF_WEEK", col("DAY_OF_WEEK").cast("string"))

result_joined = delay_joined.join(weather_joined, ["MONTH","DAY_OF_WEEK","Airport and City_match"], 'inner')

#result_joined.show(n=2)



In [20]:
# Drop unnecessary columns
# LATITUDE, LONGITUDE, STATION, MONTH, airport, normalized_name, NAME, _c0
result_joined = result_joined.drop("LATITUDE", "LONGITUDE", "STATION", "MONTH", \
                                   "airport", "normalized_name", "NAME", "_c0", \
                                   "DATE", "AIRLINE_FLIGHTS_MONTH", "AVG_MONTHLY_PASS_AIRLINE" \
                                   "DEPARTING_AIRPORT", "PGTM", "WDF5", "WDF2", "WSF2", "WSF5", \
                                   "SN32", "SX32", "TOBS","WESD", "PSUN","TSUN")

df = result_joined
#result_joined.count()
#result_joined.select("CONCURRENT_FLIGHTS").filter(col("CONCURRENT_FLIGHTS").isNotNull()).count()

In [21]:
df = df.withColumn("TMIN", df["TMIN"].cast("float"))
df = df.withColumn("PRCP", df["PRCP"].cast("float"))


df = df.withColumn("SNOW", when(col("TMIN") > 3, 0).otherwise(col("SNOW")))
df = df.withColumn("SNOW", when( \
                    (col("SNOW").isNull() | (col("SNOW") == '')) & (col("PRCP") > 0), col("PRCP") \
                               ).otherwise(lit(0)))


In [22]:
# Simplify temperatures, create new flag column EXTREME_WEATHER based on TMIN and TMAX and drop all others


# Since TMAX and TMIN are strings, you need to convert them to integers before comparison
df = df.withColumn("TMAX", df["TMAX"].cast("integer"))
df = df.withColumn("TMIN", df["TMIN"].cast("integer"))

# Creating the EXTREME_WEATHER column based on the conditions provided
df = df.withColumn("EXTREME_WEATHER", 
                   when((col("TMAX") > 40) | (col("TMIN") < 0), 1)
                   .otherwise(0))

df = df.drop("TMIN", "TMAX", "TAVG")

In [23]:
from pyspark.sql.functions import expr

# Replace all WT** with column which adds these extreme weather conditions into one value
values_as_strings = [f"WT{i:02}" for i in range(1, 12)]
#print(values_as_strings)
#print("+".join(values_as_strings))

for column_name in values_as_strings:
    df = df.withColumn(column_name, df[column_name].cast("integer"))

In [24]:
#df.printSchema()


In [25]:
from functools import reduce

total_wt_column = reduce(lambda a, b: a + b, [coalesce(col(c), lit(0)) for c in values_as_strings])

df = df.withColumn('EXTREME_WEATHER_WT', total_wt_column)

df = df.drop('WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT07', 'WT08', 'WT09', 'WT10', 'WT11')



In [26]:
df.printSchema()

root
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- Airport and City_match: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- DISTANCE_GROUP: string (nullable = true)
 |-- SEGMENT_NUMBER: string (nullable = true)
 |-- CONCURRENT_FLIGHTS: string (nullable = true)
 |-- NUMBER_OF_SEATS: string (nullable = true)
 |-- CARRIER_NAME: string (nullable = true)
 |-- AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRPORT: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRLINE: string (nullable = true)
 |-- FLT_ATTENDANTS_PER_PASS: string (nullable = true)
 |-- GROUND_SERV_PER_PASS: string (nullable = true)
 |-- PLANE_AGE: string (nullable = true)
 |-- DEPARTING_AIRPORT: string (nullable = true)
 |-- PREVIOUS_AIRPORT: string (nullable = true)
 |-- AWND: string (nullable = true)
 |-- PRCP: float (nullable = true)
 |-- SNOW: float (nullable = true)
 |-

In [27]:
from pyspark.sql.functions import col, when, count

# Define a function to count nulls and empty strings
def count_nulls_and_empties(df):
    # Use aggregation to sum up each condition of being null or empty across all columns
    exprs = [count(when(df[c].isNull() | (df[c] == ""), c)).alias(c) for c in df.columns]
    return df.agg(*exprs)


In [28]:
# Fill in missing values, for example AWND and PRCP

from pyspark.sql.functions import avg, col, coalesce, month, median
from pyspark.sql.window import Window

# Define a window spec partitioned by month
window_spec = Window.partitionBy(month("DATE_NEW"))

# Assuming 'column_name' is the column with null values you want to fill
avg_column = avg(col("AWND")).over(window_spec)
avg_prcp = avg(col("PRCP")).over(window_spec)
med_column = median(col("FLT_ATTENDANTS_PER_PASS")).over(window_spec)


# Replace nulls with the average of that month
df = df.withColumn("AWND_filled", coalesce(col("AWND"), avg_column))
df = df.withColumn("PRCP_filled", coalesce(col("PRCP"), avg_prcp))
df = df.withColumn("DISTANCE_GROUP_filled", coalesce(col("DISTANCE_GROUP"), lit("1")))
df = df.withColumn("FLT_ATTENDANTS_PER_PASS_filled", coalesce(col("FLT_ATTENDANTS_PER_PASS"), med_column))


df = df.drop("AWND").withColumnRenamed("AWND_filled", "AWND")
df = df.drop("PRCP").withColumnRenamed("PRCP_filled", "PRCP")
df = df.drop("DISTANCE_GROUP").withColumnRenamed("DISTANCE_GROUP_filled", "DISTANCE_GROUP")
df = df.drop("FLT_ATTENDANTS_PER_PASS").withColumnRenamed("FLT_ATTENDANTS_PER_PASS_filled", "FLT_ATTENDANTS_PER_PASS")

missing_values_df = df


In [29]:
df.printSchema()

root
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- Airport and City_match: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- SEGMENT_NUMBER: string (nullable = true)
 |-- CONCURRENT_FLIGHTS: string (nullable = true)
 |-- NUMBER_OF_SEATS: string (nullable = true)
 |-- CARRIER_NAME: string (nullable = true)
 |-- AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRPORT: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRLINE: string (nullable = true)
 |-- GROUND_SERV_PER_PASS: string (nullable = true)
 |-- PLANE_AGE: string (nullable = true)
 |-- DEPARTING_AIRPORT: string (nullable = true)
 |-- PREVIOUS_AIRPORT: string (nullable = true)
 |-- SNOW: float (nullable = true)
 |-- SNWD: string (nullable = true)
 |-- DATE_NEW: date (nullable = true)
 |-- EXTREME_WEATHER: integer (nullable = false)
 |-- EXTREME_WEATHER_WT: integer (nullable = false)


In [30]:
from pyspark.sql.functions import when, col

# Replace null values in the 'PREVIOUS_AIRPORT' column with empty strings
df = df.withColumn('PREVIOUS_AIRPORT', when(col('PREVIOUS_AIRPORT').isNull(), '').otherwise(col('PREVIOUS_AIRPORT')))
#df.show(n=2)
#nulls_and_empties_count = count_nulls_and_empties(df)

#nulls_and_empties_count.show()

In [31]:
from pyspark.sql.functions import round

calculate_flights = df.select("CARRIER_NAME", "Airport and City_match", "DATE_NEW")

calculate_flights = calculate_flights.withColumn("MONTH", month(col("DATE_NEW").alias("MONTH")))

count = calculate_flights.groupBy("CARRIER_NAME", "Airport and City_match", "MONTH").count()
#print(count.show())

count = count.groupBy("CARRIER_NAME", "Airport and City_match") \
                   .agg(round(avg("count")).alias("monthly_avg_count"))
#print(count.show())

In [32]:
rsdf = df.join(
    count,
    ["CARRIER_NAME", "Airport and City_match"],
    "inner"
)

rsdf = rsdf.withColumn("AIRLINE_AIRPORT_FLIGHTS_MONTH",\
                       when(\
                           (col("AIRLINE_AIRPORT_FLIGHTS_MONTH").isNull() | (col("AIRLINE_AIRPORT_FLIGHTS_MONTH") == ''))
                           , col("monthly_avg_count")).otherwise(col("AIRLINE_AIRPORT_FLIGHTS_MONTH")))

rsdf = rsdf.drop("monthly_avg_count")

#rsdf.show()


In [33]:
df = rsdf

#nulls_and_empties_count = count_nulls_and_empties(df)

#nulls_and_empties_count.show(n=2)

In [34]:
# Use aggregation to sum up each condition of being null or empty across all columns
#exprs = [count(when(df[c].isNull() | (df[c] == ""), c)).alias(c) for c in df.columns]
#df.agg(*exprs).show()



In [35]:
# ------------------------------------
# Converting from numerical to nominal
# ------------------------------------

In [36]:
# Convert precipitation from numerical to nominal

# Calculate the quantile thresholds
#thresholds = result_df.approxQuantile("PRCP", [0.33, 0.67], 0.01)  # 0.01 is the relative error

# Categorize based on quantile thresholds
#result_df = result_df.withColumn(
#    "precip_category",
#    when(col("PRCP") <= thresholds[0], "low")
#    .when(col("PRCP") <= thresholds[1], "medium")
#    .otherwise("high")
#)

# Show the resulting DataFrame
#result_df.select("PRCP", "precip_category").show()

In [37]:
# Transform weekday from numerical to nominal

# Weekday mapping dictionary
month_dict = {
    '1': 'Monday', '2': 'Tuesday', '3': 'Wednesday', '4': 'Thursday', 
    '5': 'Friday', '6': 'Saturday', '7': 'Sunday'}

# Define the UDF to convert numerical months to names
def convert_weekday_to_name(weekday):
    return month_dict.get(str(weekday), "Unknown")

convert_weekday_udf = udf(convert_weekday_to_name, StringType())

# Apply the UDF to create a new column with month names
#df_with_months = result_df.withColumn("DAY_OF_WEEK_NAME", convert_weekday_udf(result_df["DAY_OF_WEEK"]))
#df_with_months.show(n=1)
#df_with_months = df_with_months.drop("DAY_OF_WEEK").withColumnRenamed("DAY_OF_WEEK_NAME", "DAY_OF_WEEK")

In [38]:
# NUMBER_OF_SEATS into nominal

# Categorize based on research
df = df.withColumn(
    "NUMBER_OF_SEATS_NOM",
    when(col("NUMBER_OF_SEATS") <= 100, "Small")
    .when(col("NUMBER_OF_SEATS") <= 200, "Medium")
    .when(col("NUMBER_OF_SEATS") <= 400, "Large")
    .otherwise("Jumbo")
)
# Replace NUMBER_OF_SEATS column with the nominal one
df = df.drop("NUMBER_OF_SEATS").withColumnRenamed("NUMBER_OF_SEATS_NOM", "NUMBER_OF_SEATS")

In [39]:
# Plane age into nominal

# Categorize based on research
df = df.withColumn(
    "PLANE_AGE_NOM",
    when(col("PLANE_AGE") <= 10, "New")
    .when(col("PLANE_AGE") <= 20, "Standard")
    .otherwise("Old")
)

# Replace PLANE_AGE column with the nominal one
df = df.drop("PLANE_AGE").withColumnRenamed("PLANE_AGE_NOM", "PLANE_AGE")


this_df = df

In [40]:

df.printSchema()

root
 |-- CARRIER_NAME: string (nullable = true)
 |-- Airport and City_match: string (nullable = true)
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- SEGMENT_NUMBER: string (nullable = true)
 |-- CONCURRENT_FLIGHTS: string (nullable = true)
 |-- AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRPORT: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRLINE: string (nullable = true)
 |-- GROUND_SERV_PER_PASS: string (nullable = true)
 |-- DEPARTING_AIRPORT: string (nullable = true)
 |-- PREVIOUS_AIRPORT: string (nullable = true)
 |-- SNOW: float (nullable = true)
 |-- SNWD: string (nullable = true)
 |-- DATE_NEW: date (nullable = true)
 |-- EXTREME_WEATHER: integer (nullable = false)
 |-- EXTREME_WEATHER_WT: integer (nullable = false)
 |-- AWND: string (nullable = true)
 |-- PRCP: double (nullable = true)
 |-- DISTANCE_GR

# Save cleaned and prepared data

In [41]:
# Save cleaned and prepared data file to a csv
df.write.csv('cleaned_flight_data.csv', header=True)