In [1]:
# findspark.init() is called before `import pyspark` so that the local Spark
# installation's Python bindings are on sys.path when pyspark is imported
# (see the findspark README).
import findspark
findspark.init()

import pyspark
import random

In [2]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import isnan, when, count, col

In [3]:
# Path of the input CSV file; it is handed to the Spark reader in the load cell.
filename = "2004.csv"

In [4]:
def _init_spark():
    """Get (or create) the Spark session used by this notebook.

    Returns:
        tuple: ``(spark, sc)`` -- the ``SparkSession`` built with the
        application name "Project", and that session's ``sparkContext``.
    """
    session = SparkSession.builder.appName("Project").getOrCreate()
    return session, session.sparkContext

In [5]:
# `spark` is the SparkSession and `sc` its SparkContext, as returned by _init_spark().
spark, sc = _init_spark()

In [6]:
# Kept only for backward compatibility in case other cells still reference
# `sqlContext`; the read below goes through the SparkSession instead.
sqlContext = SQLContext(sc)

# Load the CSV with Spark's built-in reader (available since Spark 2.0)
# instead of the legacy external "com.databricks.spark.csv" source name.
# The first line is the header and column types are inferred from the data.
# NOTE(review): the recorded schema shows several numeric-looking columns
# (e.g. DepTime, ArrDelay, DepDelay) inferred as string, presumably because
# the file uses a textual missing-value marker -- if confirmed, pass it via
# the `nullValue` option so those columns are inferred as numbers.
df = spark.read.csv(
    filename,
    sep=',',
    header=True,
    inferSchema=True,
)
# cache() only marks the DataFrame for caching; as the last expression of the
# cell it also displays the inferred schema.
df.cache()

DataFrame[Year: int, Month: int, DayofMonth: int, DayOfWeek: int, DepTime: string, CRSDepTime: int, ArrTime: string, CRSArrTime: int, UniqueCarrier: string, FlightNum: int, TailNum: string, ActualElapsedTime: string, CRSElapsedTime: int, AirTime: string, ArrDelay: string, DepDelay: string, Origin: string, Dest: string, Distance: int, TaxiIn: int, TaxiOut: int, Cancelled: int, CancellationCode: string, Diverted: int, CarrierDelay: int, WeatherDelay: int, NASDelay: int, SecurityDelay: int, LateAircraftDelay: int]

In [None]:
# Columns excluded from the analysis: the ones the task statement says to
# remove, plus 'Year'.
col_to_drop = [
    'ArrTime',
    'ActualElapsedTime',
    'AirTime',
    'TaxiIn',
    'Diverted',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
    'Year',
]
df = df.drop(*col_to_drop)

In [None]:
# Row count before any rows are filtered out; used later as the denominator
# when measuring how much of the data was kept.
previous_amount = df.count()

In [None]:
# Count the missing values in every column.
# Findings recorded by the author: "CancellationCode" is null in about 98% of
# the rows, so it will be removed too. The other columns have no missing
# values except "TailNum" (127 -- presumably the number of null entries;
# TODO confirm).
# isnan() is only meaningful for float/double columns, so it is applied to
# those alone; every other column is checked with isNull() only. This avoids
# relying on an implicit string -> double cast for string columns.
# NOTE(review): only real nulls/NaNs are counted here; a textual placeholder
# for missing data inside a string column would not be detected.
df.select([
    count(
        when(
            (isnan(c) | col(c).isNull()) if dtype in ('float', 'double') else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c, dtype in df.dtypes
]).show()

In [None]:
# Remove the "CancellationCode" column, then discard the rows whose "TailNum"
# is null.
df = df.drop('CancellationCode').filter(col('TailNum').isNotNull())

In [None]:
# Share of the data kept after dropping the rows with a null "TailNum":
# current row count divided by the count taken before filtering (a ratio,
# not multiplied by 100).
df.count()/previous_amount

In [None]:
# Inspect the column names and types of the cleaned DataFrame.
df.printSchema()

In [None]:
# "ArrDelay" and "DepDelay" were read as strings; convert both to integers
# and print the schema to confirm the new types.
df = (
    df.withColumn("ArrDelay", col("ArrDelay").cast("int"))
      .withColumn("DepDelay", col("DepDelay").cast("int"))
)
df.printSchema()

In [None]:
# NOTE(review): prints the same schema as the end of the previous cell; this
# cell looks redundant and can probably be removed.
df.printSchema()

In [None]:
# Number of rows in which either delay column is null.
# NOTE(review): the original cell was `df.filter(df)`, which raises a
# TypeError because filter() needs a Column or a SQL-string predicate, not a
# DataFrame. The intended condition is not recorded, so this predicate is a
# best guess -- replace it if something else was meant. The cell does not
# reassign `df`, so later cells see the same data as before.
df.filter(col("ArrDelay").isNull() | col("DepDelay").isNull()).count()

In [None]:
# DataFrame restricted to the columns whose Spark dtype is exactly 'int'.
# The dtype is compared explicitly: the previous `'int' in x` tested
# membership in the (name, dtype) tuple, so it would also have matched a
# column that happened to be *named* "int".
corr_matrix = df.select([name for name, dtype in df.dtypes if dtype == 'int'])

In [None]:
# Preview the first 5 rows of the integer-only columns.
corr_matrix.show(5)

In [None]:
# Pearson correlation between "ArrDelay" and every integer column.
# The result is a {column name: coefficient} mapping so that each value can
# be read without cross-referencing the column order.
# TODO: this is probably too "pythonic" -- df.corr() is called once per
# column from a Python comprehension; consider computing all the
# correlations in a single Spark aggregation instead.

{name: df.corr("ArrDelay", name) for name in corr_matrix.columns}

In [None]:
# DataFrame with the columns that are NOT of dtype 'int'.
# show() prints the rows and returns None, so it must not be part of the
# assignment -- otherwise NON_corr_matrix ends up as None.
NON_corr_matrix = df.select([name for name, dtype in df.dtypes if dtype != 'int'])
NON_corr_matrix.show(5)

In [None]:
# The data has to be converted to pandas before it can be visualized.
# The full dataset is too large for that, so only a sample is converted:
# a fraction of 0.25 (about 25% of the rows), drawn without replacement and
# with a fixed seed.

import pandas as pd

dfPanda = (
    df.sample(withReplacement=False, fraction=0.25, seed=42)
      .toPandas()
)



In [None]:
# Altair is used for the visualization; by default it accepts at most 5000
# observations, so the pandas frame is down-sampled first.
# The chart shows which airports have the longest trips.

import altair as alt

# Cap the sample size at the number of available rows: asking pandas for more
# rows than the frame holds (without replacement) raises a ValueError.
dfPanda = dfPanda.sample(n=min(5000, len(dfPanda)), random_state=1)
alt.Chart(dfPanda).mark_point().encode(
    x='Distance',
    y='Origin',
    color='DayOfWeek',
)

