In [1]:
!pip install pyspark
!pip install findspark



In [2]:
import findspark
# Initialise findspark before the pyspark imports in the next cell
findspark.init()

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [4]:
# Build the Spark configuration once so the context and the sessions share it.
# A bare SparkContext() starts with default settings, and a later getOrCreate()
# reuses that context, so master/appName set only on the builder had no effect.
conf = (SparkConf()
        .setMaster("local[7]")
        .setAppName("Aceleração PySpark - Capgemini"))

# Create the Spark context; getOrCreate keeps this cell re-runnable
# (SparkContext() raises when a context is already active)
sc = SparkContext.getOrCreate(conf)

# Session *builder* (not a session): later cells call spark.getOrCreate()
spark = SparkSession.builder.config(conf=conf)

In [5]:
def _read_parquet(path):
    """Load the Parquet dataset at `path` through the shared Spark session.

    `spark` is a session builder, so `getOrCreate()` hands back the active session.
    No "header" option is set: it is a CSV reader option, and Parquet files carry
    their own schema.
    """
    return spark.getOrCreate().read.parquet(path)


# Quality-flag (*_qa) datasets
df_airports_quality = _read_parquet("airports_qa.parquet")
df_planes_quality = _read_parquet("planes_qa.parquet")
df_flights_quality = _read_parquet("flights_qa.parquet")

# Processed (*_proc) datasets
df_airports_proc = _read_parquet("airports_proc.parquet")
df_planes_proc = _read_parquet("planes_proc.parquet")
df_flights_proc = _read_parquet("flights_proc.parquet")


# Perguntas para Qualidade

## Pergunta 1

In [6]:
# Rename the columns that would otherwise produce duplicated names after the joins

# Keep a handle on the un-renamed airports frame before suffixing its columns
df_airports_quality_original = df_airports_quality

# Every airport column gets the "_origin" suffix
df_airports_quality = df_airports_quality.toDF(
    *[nome + "_origin" for nome in df_airports_quality.columns]
)

# Tail-number columns exist in both planes and flights: tag each with its source
df_planes_quality = df_planes_quality.withColumnRenamed('qa_tailnum', 'qa_tailnum_planes')
df_planes_quality = df_planes_quality.withColumnRenamed('tailnum', 'tailnum_planes')

df_flights_quality = df_flights_quality.withColumnRenamed('qa_tailnum', 'qa_tailnum_flights')
df_flights_quality = df_flights_quality.withColumnRenamed('tailnum', 'tailnum_flights')


In [7]:
# First join: flights with the "_origin" airport columns (left join on the origin code)

df_quality = df_flights_quality.join(
    df_airports_quality,
    on=(df_airports_quality["faa_origin"] == df_flights_quality["origin"]),
    how="left",
)

df_quality.toPandas()

Unnamed: 0,tailnum_flights,dest,origin,qa_tailnum_flights,qa_year_month_day,qa_hour_minute,qa_dep_arr,qa_dep_arr_delay,qa_carrier,qa_flight,...,qa_distance,qa_distance_airtime,faa_origin,qa_faa_origin,qa_name_origin,qa_lat_origin,qa_lon_origin,qa_alt_origin,qa_tz_origin,qa_dst_origin
0,N846VA,LAX,SEA,,,,,,,,...,,TL,SEA,,,,,,,
1,N559AS,HNL,SEA,,,,,,,F,...,,TL,SEA,,,,,,,
2,N847VA,SFO,SEA,,,,,,,F,...,,TL,SEA,,,,,,,
3,N360SW,SJC,PDX,,,,,,,F,...,,TR,PDX,,,,,,,
4,N612AS,BUR,SEA,,,,,,,F,...,,TL,SEA,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,N225AG,SLC,SEA,,,,,,,,...,,TR,SEA,,,,,,,
9996,N3LEAA,DFW,SEA,F,,,,,,,...,,TR,SEA,,,,,,,
9997,N523AS,SMF,SEA,,,,,,,F,...,,TR,SEA,,,,,,,
9998,N8647A,ABQ,SEA,,,,,,,,...,,TR,SEA,,,,,,,


In [8]:
# Rebuild the airports frame from the un-renamed copy, this time with a "_dest" suffix

df_airports_quality = df_airports_quality_original

df_airports_quality = df_airports_quality.toDF(
    *[nome + "_dest" for nome in df_airports_quality.columns]
)

In [9]:
# Second join: add the "_dest" airport columns (left join on the destination code)

df_quality = df_quality.join(
    df_airports_quality,
    on=(df_airports_quality["faa_dest"] == df_flights_quality["dest"]),
    how="left",
)

df_quality.toPandas()

Unnamed: 0,tailnum_flights,dest,origin,qa_tailnum_flights,qa_year_month_day,qa_hour_minute,qa_dep_arr,qa_dep_arr_delay,qa_carrier,qa_flight,...,qa_tz_origin,qa_dst_origin,faa_dest,qa_faa_dest,qa_name_dest,qa_lat_dest,qa_lon_dest,qa_alt_dest,qa_tz_dest,qa_dst_dest
0,N846VA,LAX,SEA,,,,,,,,...,,,LAX,,,,,,,
1,N559AS,HNL,SEA,,,,,,,F,...,,,HNL,,,,,,,
2,N847VA,SFO,SEA,,,,,,,F,...,,,SFO,,,,,,,
3,N360SW,SJC,PDX,,,,,,,F,...,,,SJC,,,,,,,
4,N612AS,BUR,SEA,,,,,,,F,...,,,BUR,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,N225AG,SLC,SEA,,,,,,,,...,,,SLC,,,,,,,
9996,N3LEAA,DFW,SEA,F,,,,,,,...,,,DFW,,,,,,,
9997,N523AS,SMF,SEA,,,,,,,F,...,,,SMF,,,,,,,
9998,N8647A,ABQ,SEA,,,,,,,,...,,,ABQ,,,,,,,


In [10]:
# Third join: add the planes columns (left join on the tail number)

df_quality = df_quality.join(
    df_planes_quality,
    on=(df_planes_quality["tailnum_planes"] == df_quality["tailnum_flights"]),
    how="left",
)


df_quality.toPandas()



Unnamed: 0,tailnum_flights,dest,origin,qa_tailnum_flights,qa_year_month_day,qa_hour_minute,qa_dep_arr,qa_dep_arr_delay,qa_carrier,qa_flight,...,tailnum_planes,qa_tailnum_planes,qa_year,qa_type,qa_manufacturer,qa_model,qa_engines,qa_seats,qa_speed,qa_engine
0,N846VA,LAX,SEA,,,,,,,,...,N846VA,,,,,,,,M,
1,N559AS,HNL,SEA,,,,,,,F,...,N559AS,,,,,,,,M,
2,N847VA,SFO,SEA,,,,,,,F,...,N847VA,,,,,,,,M,
3,N360SW,SJC,PDX,,,,,,,F,...,N360SW,,,,,,,,M,
4,N612AS,BUR,SEA,,,,,,,F,...,N612AS,,,,,,,,M,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,N225AG,SLC,SEA,,,,,,,,...,N225AG,,,,C,,,,M,
9996,N3LEAA,DFW,SEA,F,,,,,,,...,,,,,,,,,,
9997,N523AS,SMF,SEA,,,,,,,F,...,N523AS,,,,,,,,M,
9998,N8647A,ABQ,SEA,,,,,,,,...,N8647A,,,,,,,,M,


## Pergunta 2



In [11]:
# Drop the key columns that are no longer needed

df_quality = df_quality.drop(
    'tailnum_flights',
    'origin',
    'dest',
    'faa_origin',
    'faa_dest',
    'tailnum_planes',
)

# Print the value counts of each remaining column, one table per column
for c in df_quality.columns:
    contagem = df_quality.groupBy(c).count()
    contagem.show()
    

+------------------+-----+
|qa_tailnum_flights|count|
+------------------+-----+
|                 F|  987|
|              null| 8997|
|                 M|   14|
|                FN|    2|
+------------------+-----+

+-----------------+-----+
|qa_year_month_day|count|
+-----------------+-----+
|             null|10000|
+-----------------+-----+

+--------------+-----+
|qa_hour_minute|count|
+--------------+-----+
|          null| 9952|
|            MH|   48|
+--------------+-----+

+----------+-----+
|qa_dep_arr|count|
+----------+-----+
|      null| 9704|
|        MD|   48|
|        FA|  151|
|        MA|    7|
|        FD|   90|
+----------+-----+

+----------------+-----+
|qa_dep_arr_delay|count|
+----------------+-----+
|            null| 9925|
|              MD|   48|
|              MA|   27|
+----------------+-----+

+----------+-----+
|qa_carrier|count|
+----------+-----+
|      null|10000|
+----------+-----+

+---------+-----+
|qa_flight|count|
+---------+-----+
|        F| 615

In [12]:
# Automated tally: for every column, count rows by the FIRST character of the value
# and file (column, count) under that character's key.

dicionario_colunas = {'F':[], 'M':[], 'I':[], 'S':[], 'T':[], 'C':[], 'Nulos':[]}

for c in df_quality.columns:
    linha = df_quality.groupBy(F.substring(c, 1, 1).alias(c)).count().collect()

    for clas, total in linha:
        # Spark nulls are collected as Python None (not the string 'None'),
        # so they must be detected with `is None` to land under 'Nulos'.
        chave = 'Nulos' if clas is None else clas

        # Prefixes without a bucket in the dictionary are ignored
        if chave in dicionario_colunas:
            dicionario_colunas[chave].append((c, total))
         


def maior_valor(chave, colunas=None):
    """Return the (column, count) pair with the highest count for one key.

    Args:
        chave: key into the tally dictionary (e.g. 'M', 'F', 'I').
        colunas: optional mapping of key -> list of (column, count) tuples.
            Defaults to the notebook-level `dicionario_colunas`.

    Returns:
        The (column, count) tuple with the largest positive count; the first one
        wins on ties. Returns (None, 0) when the key has no entry with a positive
        count (previously this case raised UnboundLocalError).

    Raises:
        KeyError: if `chave` is not a key of the mapping.
    """
    if colunas is None:
        colunas = dicionario_colunas

    # Only positive counts can win, matching the original "greater than 0" scan
    entradas = [(coluna, valor) for coluna, valor in colunas[chave] if valor > 0]
    if not entradas:
        return None, 0

    # max() returns the first maximal element, i.e. the same tie-break as a
    # left-to-right scan with a strict ">" comparison
    return max(entradas, key=lambda entrada: entrada[1])




## Pergunta 3


In [13]:
# Column with the highest count of values starting with 'M', and that count
maior_valor('M')

('qa_speed', 9443)

## Pergunta 4



In [14]:
# Column with the highest count of values starting with 'F', and that count
maior_valor('F')

('qa_flight', 6158)

## Pergunta 5

 

In [15]:
# Column with the highest count of values starting with 'I', and that count
maior_valor('I')

('qa_year', 8)

# Perguntas para negócios 

## Pergunta 1

In [16]:
# Keep a reference to the airports frame before its columns are renamed;
# a later cell resets df_airports_proc from it to build the "_dest" version
airports_original = df_airports_proc

airports_original.show()

+---+--------------------+---------+-----------+----+---+---+-------------+----+--------+--------------+
|faa|                name|      lat|        lon| alt| tz|dst|       region|type|military|administration|
+---+--------------------+---------+-----------+----+---+---+-------------+----+--------+--------------+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|MAINLAND-EAST|  AP|   false|           NaN|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|MAINLAND-EAST|  AP|   false|             M|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|MAINLAND-EAST| NaN|   false|             R|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|MAINLAND-EAST|  AP|   false|           NaN|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|MAINLAND-EAST|  AP|   false|           NaN|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|MAINLAND-EAST|  AP|   false|             M|
|0G6|Williams County A...|41.467304| -84.506775| 730| -

In [17]:
# Rename columns: suffix every airport column with "_origin"

df_airports_proc = df_airports_proc.toDF(
    *[nome + "_origin" for nome in df_airports_proc.columns]
)

df_airports_proc.show()

# Tag the tail-number column of flights and planes with its source frame
df_flights_proc = df_flights_proc.withColumnRenamed(
    'tailnum', 'tailnum_flights'
)

df_planes_proc = df_planes_proc.withColumnRenamed(
    'tailnum', 'tailnum_planes'
)

+----------+--------------------+----------+-----------+----------+---------+----------+-------------+-----------+---------------+---------------------+
|faa_origin|         name_origin|lat_origin| lon_origin|alt_origin|tz_origin|dst_origin|region_origin|type_origin|military_origin|administration_origin|
+----------+--------------------+----------+-----------+----------+---------+----------+-------------+-----------+---------------+---------------------+
|       04G|   Lansdowne Airport| 41.130474|  -80.61958|      1044|       -5|         A|MAINLAND-EAST|         AP|          false|                  NaN|
|       06A|Moton Field Munic...|  32.46057|  -85.68003|       264|       -5|         A|MAINLAND-EAST|         AP|          false|                    M|
|       06C| Schaumburg Regional|  41.98934|  -88.10124|       801|       -6|         A|MAINLAND-EAST|        NaN|          false|                    R|
|       06N|     Randall Airport|  41.43191|  -74.39156|       523|       -5|     

In [18]:
# First join: flights with the "_origin" airport columns (left join on the origin code)

df_transformation = df_flights_proc.join(
    df_airports_proc,
    on=(df_airports_proc["faa_origin"] == df_flights_proc["origin"]),
    how="left",
)

df_transformation.toPandas()

Unnamed: 0,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum_flights,flight,origin,dest,air_time,...,name_origin,lat_origin,lon_origin,alt_origin,tz_origin,dst_origin,region_origin,type_origin,military_origin,administration_origin
0,658,-7,935,-5,VX,N846VA,1780,SEA,LAX,132,...,Seattle Tacoma Intl,47.449001,-122.309303,433,-8,A,MAINLAND-WEST,,False,I
1,1040,5,1505,5,AS,N559AS,851,SEA,HNL,360,...,Seattle Tacoma Intl,47.449001,-122.309303,433,-8,A,MAINLAND-WEST,,False,I
2,1443,-2,1652,2,VX,N847VA,755,SEA,SFO,111,...,Seattle Tacoma Intl,47.449001,-122.309303,433,-8,A,MAINLAND-WEST,,False,I
3,1705,45,1839,34,WN,N360SW,344,PDX,SJC,83,...,Portland Intl,45.588722,-122.597504,30,-8,A,MAINLAND-WEST,,False,I
4,754,-1,1015,1,AS,N612AS,522,SEA,BUR,127,...,Seattle Tacoma Intl,47.449001,-122.309303,433,-8,A,MAINLAND-WEST,,False,I
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1806,-4,2104,-6,OO,N225AG,3458,SEA,SLC,89,...,Seattle Tacoma Intl,47.449001,-122.309303,433,-8,A,MAINLAND-WEST,,False,I
9996,2336,11,452,-13,AA,N3LEAA,1230,SEA,DFW,178,...,Seattle Tacoma Intl,47.449001,-122.309303,433,-8,A,MAINLAND-WEST,,False,I
9997,904,-1,1042,-5,AS,N523AS,360,SEA,SMF,81,...,Seattle Tacoma Intl,47.449001,-122.309303,433,-8,A,MAINLAND-WEST,,False,I
9998,1441,26,1820,10,WN,N8647A,2857,SEA,ABQ,133,...,Seattle Tacoma Intl,47.449001,-122.309303,433,-8,A,MAINLAND-WEST,,False,I


In [19]:
# Rename columns: restore the un-renamed airports frame and suffix it with "_dest"

df_airports_proc = airports_original

df_airports_proc = df_airports_proc.toDF(
    *[nome + "_dest" for nome in df_airports_proc.columns]
)

df_airports_proc.show()



+--------+--------------------+---------+-----------+--------+-------+--------+-------------+---------+-------------+-------------------+
|faa_dest|           name_dest| lat_dest|   lon_dest|alt_dest|tz_dest|dst_dest|  region_dest|type_dest|military_dest|administration_dest|
+--------+--------------------+---------+-----------+--------+-------+--------+-------------+---------+-------------+-------------------+
|     04G|   Lansdowne Airport|41.130474|  -80.61958|    1044|     -5|       A|MAINLAND-EAST|       AP|        false|                NaN|
|     06A|Moton Field Munic...| 32.46057|  -85.68003|     264|     -5|       A|MAINLAND-EAST|       AP|        false|                  M|
|     06C| Schaumburg Regional| 41.98934|  -88.10124|     801|     -6|       A|MAINLAND-EAST|      NaN|        false|                  R|
|     06N|     Randall Airport| 41.43191|  -74.39156|     523|     -5|       A|MAINLAND-EAST|       AP|        false|                NaN|
|     09J|Jekyll Island Air...|31.

In [20]:
# Second join: add the "_dest" airport columns (left join on the destination code)

df_transformation = df_transformation.join(
    df_airports_proc,
    on=(df_airports_proc["faa_dest"] == df_flights_proc["dest"]),
    how="left",
)

df_transformation.toPandas()


Unnamed: 0,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum_flights,flight,origin,dest,air_time,...,name_dest,lat_dest,lon_dest,alt_dest,tz_dest,dst_dest,region_dest,type_dest,military_dest,administration_dest
0,658,-7,935,-5,VX,N846VA,1780,SEA,LAX,132,...,Los Angeles Intl,33.942535,-118.408073,126,-8,A,MAINLAND-WEST,,False,I
1,1040,5,1505,5,AS,N559AS,851,SEA,HNL,360,...,Honolulu Intl,21.318682,-157.922424,13,-10,N,ALASKA,,False,I
2,1443,-2,1652,2,VX,N847VA,755,SEA,SFO,111,...,San Francisco Intl,37.618973,-122.374886,13,-8,A,MAINLAND-WEST,,False,I
3,1705,45,1839,34,WN,N360SW,344,PDX,SJC,83,...,Norman Y Mineta San Jose Intl,37.362598,-121.929024,62,-8,A,MAINLAND-WEST,,False,I
4,754,-1,1015,1,AS,N612AS,522,SEA,BUR,127,...,Bob Hope,34.200668,-118.358665,778,-8,A,MAINLAND-WEST,,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1806,-4,2104,-6,OO,N225AG,3458,SEA,SLC,89,...,Salt Lake City Intl,40.788387,-111.977776,4227,-7,A,MAINLAND-WEST,,False,I
9996,2336,11,452,-13,AA,N3LEAA,1230,SEA,DFW,178,...,Dallas Fort Worth Intl,32.896828,-97.037994,607,-6,A,MAINLAND-WEST,,False,I
9997,904,-1,1042,-5,AS,N523AS,360,SEA,SMF,81,...,Sacramento Intl,38.695415,-121.590775,27,-8,A,MAINLAND-WEST,,False,I
9998,1441,26,1820,10,WN,N8647A,2857,SEA,ABQ,133,...,Albuquerque International Sunport,35.040222,-106.609192,5355,-7,A,MAINLAND-WEST,,False,I


In [21]:
# Third join: add the planes columns (left join on the tail number)

df_transformation = df_transformation.join(
    df_planes_proc,
    on=(df_planes_proc["tailnum_planes"] == df_transformation["tailnum_flights"]),
    how="left",
)


df_transformation.toPandas()

Unnamed: 0,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum_flights,flight,origin,dest,air_time,...,type,manufacturer,model,engines,seats,speed,engine,tailchar,age,engine_type
0,658,-7,935,-5,VX,N846VA,1780,SEA,LAX,132,...,MULTI_ENG,AIRBUS,A320-214,2.0,182.0,0.0,Turbo-fan,VA,11.0,FAN
1,1040,5,1505,5,AS,N559AS,851,SEA,HNL,360,...,MULTI_ENG,BOEING,737-890,2.0,149.0,0.0,Turbo-fan,AS,16.0,FAN
2,1443,-2,1652,2,VX,N847VA,755,SEA,SFO,111,...,MULTI_ENG,AIRBUS,A320-214,2.0,182.0,0.0,Turbo-fan,VA,11.0,FAN
3,1705,45,1839,34,WN,N360SW,344,PDX,SJC,83,...,MULTI_ENG,BOEING,737-3H4,2.0,149.0,0.0,Turbo-fan,SW,30.0,FAN
4,754,-1,1015,1,AS,N612AS,522,SEA,BUR,127,...,MULTI_ENG,BOEING,737-790,2.0,151.0,0.0,Turbo-jet,AS,23.0,JET
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1806,-4,2104,-6,OO,N225AG,3458,SEA,SLC,89,...,MULTI_ENG,BOMBARDIER,CL-600-2C10,2.0,80.0,0.0,Turbo-fan,AG,21.0,FAN
9996,2336,11,452,-13,AA,N3LEAA,1230,SEA,DFW,178,...,,,,,,,,,,
9997,904,-1,1042,-5,AS,N523AS,360,SEA,SMF,81,...,MULTI_ENG,BOEING,737-890,2.0,149.0,0.0,Turbo-fan,AS,13.0,FAN
9998,1441,26,1820,10,WN,N8647A,2857,SEA,ABQ,133,...,MULTI_ENG,BOEING,737-8H4,2.0,140.0,0.0,Turbo-fan,A,8.0,FAN


In [22]:
# Print the column names and types of the joined frame
df_transformation.printSchema()

root
 |-- dep_time: string (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum_flights: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- dep_datetime: timestamp (nullable = true)
 |-- air_time_projected: integer (nullable = true)
 |-- air_time_expected: integer (nullable = true)
 |-- haul_duration: string (nullable = true)
 |-- dep_season: string (nullable = true)
 |-- dep_delay_category: string (nullable = true)
 |-- faa_origin: string (nullable = true)
 |-- name_origin: string (nullable = true)
 |-- lat_origin: float (nullable = true)
 |-- lon_origin: float (nullable = true)
 |-- alt_origin: integer (nullable = true)
 |-- tz_origin: integer (nullable = true)
 |-- dst_origi

## Pergunta 2



In [23]:
# Distinct (region_origin, name_origin) pairs per origin region
(
    df_transformation
    .groupBy('region_origin')
    .agg(F.countDistinct('region_origin', 'name_origin').alias('count'))
    .toPandas()
)

Unnamed: 0,region_origin,count
0,MAINLAND-WEST,2


In [24]:
# Distinct (region_dest, name_dest) pairs per destination region
(
    df_transformation
    .groupBy('region_dest')
    .agg(F.countDistinct('region_dest', 'name_dest').alias('count'))
    .toPandas()
)

Unnamed: 0,region_dest,count
0,ALASKA,9
1,MAINLAND-EAST,24
2,MAINLAND-WEST,36


## Pergunta 3



In [25]:
# Largest absolute altitude gap between origin and destination over all rows
(
    df_transformation
    .agg(
        F.max(F.abs(F.col('alt_origin') - F.col('alt_dest')))
         .alias("Maior diferença de altitude")
    )
    .show()
)

# Highest origin altitude and highest destination altitude, for reference
df_transformation.agg(F.max('alt_origin')).show()
df_transformation.agg(F.max('alt_dest')).show()

+---------------------------+
|Maior diferença de altitude|
+---------------------------+
|                       6169|
+---------------------------+

+---------------+
|max(alt_origin)|
+---------------+
|            433|
+---------------+

+-------------+
|max(alt_dest)|
+-------------+
|         6602|
+-------------+



## Pergunta 4



In [26]:
# Treat a flight's delay as the sum of its arrival and departure delays.
# Negative values are replaced by 0 before summing.
# NOTE: dep_delay and arr_delay are overwritten with the clamped values.

df_transformation = df_transformation.withColumn('dep_delay', F.when(F.col('dep_delay') < 0, 0)
                                     .otherwise(F.col('dep_delay')))

df_transformation = df_transformation.withColumn('arr_delay', F.when(F.col('arr_delay') < 0, 0)
                                     .otherwise(F.col('arr_delay')))

df_transformation = df_transformation.withColumn("total_delay",
                                     F.col('arr_delay')+ F.col('dep_delay'))

# Average over delayed flights only. The alias must be applied to the aggregated
# COLUMN (inside agg); applied to the DataFrame it left the header as
# "round(avg(total_delay), 0)".
(df_transformation.filter(F.col('total_delay')>0)
                  .agg(F.round(F.avg(F.col('total_delay')))
                  .alias('Média de atrasos'))
                  .show())

+--------------------------+
|round(avg(total_delay), 0)|
+--------------------------+
|                      38.0|
+--------------------------+



In [27]:
# Treating the two kinds of delay separately

# Rounded mean arrival delay over rows with arr_delay > 0
(
    df_transformation
    .where(F.col('arr_delay') > 0)
    .agg(F.round(F.avg('arr_delay')).alias('Média de atrasos'))
    .show()
)

# Rounded mean departure delay over rows with dep_delay > 0
(
    df_transformation
    .where(F.col('dep_delay') > 0)
    .agg(F.round(F.avg('dep_delay')).alias('Média de atrasos'))
    .show()
)

+----------------+
|Média de atrasos|
+----------------+
|            25.0|
+----------------+

+----------------+
|Média de atrasos|
+----------------+
|            26.0|
+----------------+



## Pergunta 5



In [28]:
# Summed delays: rounded mean total_delay per destination region (delayed rows only)
(
    df_transformation
    .where(F.col('total_delay') > 0)
    .groupBy('region_dest')
    .agg(F.round(F.avg('total_delay')).alias('Média de atrasos total por região'))
    .show()
)


+-------------+---------------------------------+
|  region_dest|Média de atrasos total por região|
+-------------+---------------------------------+
|       ALASKA|                             31.0|
|MAINLAND-EAST|                             40.0|
|MAINLAND-WEST|                             38.0|
+-------------+---------------------------------+



In [29]:
# Separate delays

# Rounded mean arrival delay per destination region (rows with arr_delay > 0)
(
    df_transformation
    .where(F.col('arr_delay') > 0)
    .groupBy('region_dest')
    .agg(F.round(F.avg('arr_delay')).alias('Média de atrasos de chegada'))
    .show()
)

# Rounded mean departure delay per destination region (rows with dep_delay > 0)
(
    df_transformation
    .where(F.col('dep_delay') > 0)
    .groupBy('region_dest')
    .agg(F.round(F.avg('dep_delay')).alias('Média de atrasos de saida'))
    .show()
)

+-------------+---------------------------+
|  region_dest|Média de atrasos de chegada|
+-------------+---------------------------+
|       ALASKA|                       22.0|
|MAINLAND-EAST|                       29.0|
|MAINLAND-WEST|                       24.0|
+-------------+---------------------------+

+-------------+-------------------------+
|  region_dest|Média de atrasos de saida|
+-------------+-------------------------+
|       ALASKA|                     21.0|
|MAINLAND-EAST|                     26.0|
|MAINLAND-WEST|                     26.0|
+-------------+-------------------------+



## Pergunta 6



In [30]:
# Derive the departure year (formatted as "yyyy") from the departure timestamp
df_transformation = df_transformation.withColumn(
    "dep_year",
    F.date_format(F.col("dep_datetime"), "yyyy"),
)


# Sum of total_delay per departure year (delayed rows only)
(
    df_transformation
    .where(F.col('total_delay') > 0)
    .groupBy('dep_year')
    .agg(F.sum('total_delay').alias('acumulado por ano'))
    .show()
)

+--------+-----------------+
|dep_year|acumulado por ano|
+--------+-----------------+
|    2014|           180134|
+--------+-----------------+



## Pergunta 7



In [31]:
# Summed delay: total_delay accumulated per departure year and destination region

(
    df_transformation
    .where(F.col('total_delay') > 0)
    .groupBy('dep_year', 'region_dest')
    .agg(F.sum('total_delay').alias('acumulado por ano'))
    .show()
)


+--------+-------------+-----------------+
|dep_year|  region_dest|acumulado por ano|
+--------+-------------+-----------------+
|    2014|       ALASKA|            16155|
|    2014|MAINLAND-EAST|            44476|
|    2014|MAINLAND-WEST|           119503|
+--------+-------------+-----------------+



In [32]:
# Separate delays

# Arrival delay accumulated per departure year and destination region
(
    df_transformation
    .where(F.col('arr_delay') > 0)
    .groupBy('dep_year', 'region_dest')
    .agg(F.sum('arr_delay').alias('acumulado por ano'))
    .show()
)

# Departure delay accumulated per departure year and destination region
(
    df_transformation
    .where(F.col('dep_delay') > 0)
    .groupBy('dep_year', 'region_dest')
    .agg(F.sum('dep_delay').alias('acumulado por ano'))
    .show()
)

+--------+-------------+-----------------+
|dep_year|  region_dest|acumulado por ano|
+--------+-------------+-----------------+
|    2014|       ALASKA|             8640|
|    2014|MAINLAND-EAST|            22938|
|    2014|MAINLAND-WEST|            60242|
+--------+-------------+-----------------+

+--------+-------------+-----------------+
|dep_year|  region_dest|acumulado por ano|
+--------+-------------+-----------------+
|    2014|       ALASKA|             7515|
|    2014|MAINLAND-EAST|            21538|
|    2014|MAINLAND-WEST|            59261|
+--------+-------------+-----------------+



## Pergunta 8



In [33]:
# Rounded mean of air_time over all rows
(
    df_transformation
    .agg(F.round(F.avg('air_time')).alias('Média de tempo de voo'))
    .show()
)

+---------------------+
|Média de tempo de voo|
+---------------------+
|                153.0|
+---------------------+



## Pergunta 9



In [34]:
# Rounded mean of air_time per destination region
(
    df_transformation
    .groupBy('region_dest')
    .agg(F.round(F.avg('air_time')).alias('Tempo de voo médio por região'))
    .show()
)

+-------------+-----------------------------+
|  region_dest|Tempo de voo médio por região|
+-------------+-----------------------------+
|       ALASKA|                        228.0|
|MAINLAND-EAST|                        237.0|
|MAINLAND-WEST|                        115.0|
+-------------+-----------------------------+



## Pergunta 10



In [35]:
# Rounded mean of air_time per (origin, dest) pair; print up to 100 rows
(
    df_transformation
    .groupBy('origin', 'dest')
    .agg(F.round(F.avg('air_time')).alias('Tempo de voo médio por rota'))
    .show(100)
)

+------+----+---------------------------+
|origin|dest|Tempo de voo médio por rota|
+------+----+---------------------------+
|   SEA| RNO|                       74.0|
|   SEA| DTW|                      220.0|
|   SEA| CLE|                      234.0|
|   SEA| LAX|                      127.0|
|   PDX| SEA|                       35.0|
|   SEA| BLI|                       23.0|
|   PDX| IAH|                      214.0|
|   PDX| PHX|                      130.0|
|   SEA| SLC|                       89.0|
|   SEA| SBA|                      118.0|
|   SEA| BWI|                      270.0|
|   PDX| IAD|                      268.0|
|   PDX| SFO|                       85.0|
|   SEA| KOA|                      347.0|
|   PDX| MCI|                      174.0|
|   SEA| SJC|                      103.0|
|   SEA| ABQ|                      143.0|
|   SEA| SAT|                      208.0|
|   PDX| ONT|                      112.0|
|   SEA| LAS|                      118.0|
|   SEA| GEG|                     

## Pergunta 11



In [36]:
# Sum of air_time per departure year
(
    df_transformation
    .groupBy('dep_year')
    .agg(F.sum('air_time').alias('Tempo de voo acomulado por ano'))
    .show()
)

+--------+------------------------------+
|dep_year|Tempo de voo acomulado por ano|
+--------+------------------------------+
|    2014|                       1528625|
+--------+------------------------------+



## Pergunta 12



In [37]:
# Sum of air_time per destination region
(
    df_transformation
    .groupBy('region_dest')
    .agg(F.sum('air_time').alias('Tempo de voo acomulado por região'))
    .show()
)

+-------------+---------------------------------+
|  region_dest|Tempo de voo acomulado por região|
+-------------+---------------------------------+
|       ALASKA|                           230602|
|MAINLAND-EAST|                           508344|
|MAINLAND-WEST|                           789679|
+-------------+---------------------------------+



## Pergunta 13



In [38]:
# Rounded mean of distance over all rows
(
    df_transformation
    .agg(F.round(F.avg('distance')).alias('Média de distancia'))
    .show()
)

+------------------+
|Média de distancia|
+------------------+
|            1208.0|
+------------------+



## Pergunta 14



In [39]:
# Rounded mean of distance per destination region
(
    df_transformation
    .groupBy('region_dest')
    .agg(F.round(F.avg('distance')).alias('Média de distancia'))
    .show()
)



+-------------+------------------+
|  region_dest|Média de distancia|
+-------------+------------------+
|       ALASKA|            1742.0|
|MAINLAND-EAST|            2042.0|
|MAINLAND-WEST|             868.0|
+-------------+------------------+



## Pergunta 15



In [40]:
# Mean of distance (not rounded) per (origin, dest) pair
(
    df_transformation
    .groupBy('origin', 'dest')
    .agg(F.avg('distance').alias('Média de distancia'))
    .show()
)

+------+----+------------------+
|origin|dest|Média de distancia|
+------+----+------------------+
|   SEA| RNO|             564.0|
|   SEA| DTW|            1927.0|
|   SEA| CLE|            2021.0|
|   SEA| LAX|             954.0|
|   PDX| SEA|             129.0|
|   SEA| BLI|              93.0|
|   PDX| IAH|            1825.0|
|   PDX| PHX|            1009.0|
|   SEA| SLC|             689.0|
|   SEA| SBA|             908.0|
|   SEA| BWI|            2335.0|
|   PDX| IAD|            2327.0|
|   PDX| SFO|             550.0|
|   SEA| KOA|            2688.0|
|   PDX| MCI|            1482.0|
|   SEA| SJC|             697.0|
|   SEA| ABQ|            1180.0|
|   SEA| SAT|            1774.0|
|   PDX| ONT|             838.0|
|   SEA| LAS|             867.0|
+------+----+------------------+
only showing top 20 rows



## Pergunta 16



In [41]:
# Sum of distance per departure year
(
    df_transformation
    .groupBy('dep_year')
    .agg(F.sum('distance').alias('distancia de voo acumulada'))
    .show()
)

+--------+--------------------------+
|dep_year|distancia de voo acumulada|
+--------+--------------------------+
|    2014|                  12081516|
+--------+--------------------------+



## Pergunta 17



In [42]:
# Sum of distance per destination region
(
    df_transformation
    .groupBy('region_dest')
    .agg(F.sum('distance').alias('distancia de voo acumulada'))
    .show()
)



+-------------+--------------------------+
|  region_dest|distancia de voo acumulada|
+-------------+--------------------------+
|       ALASKA|                   1762553|
|MAINLAND-EAST|                   4378902|
|MAINLAND-WEST|                   5940061|
+-------------+--------------------------+



## Pergunta 18



In [43]:
# Mean of seats per (origin, dest) pair, rounded up to the next integer
(
    df_transformation
    .groupBy('origin', 'dest')
    .agg(F.ceil(F.avg('seats')).alias('Número médio de passageiros por rota'))
    .show()
)

+------+----+------------------------------------+
|origin|dest|Número médio de passageiros por rota|
+------+----+------------------------------------+
|   SEA| RNO|                                 142|
|   SEA| DTW|                                 213|
|   SEA| CLE|                                 182|
|   SEA| LAX|                                 155|
|   PDX| SEA|                                  65|
|   SEA| BLI|                                 164|
|   PDX| IAH|                                 183|
|   PDX| PHX|                                 196|
|   SEA| SLC|                                 166|
|   SEA| SBA|                                  80|
|   SEA| BWI|                                 152|
|   PDX| IAD|                                 188|
|   PDX| SFO|                                 139|
|   SEA| KOA|                                 171|
|   PDX| MCI|                                 147|
|   SEA| SJC|                                 137|
|   SEA| ABQ|                  

## Pergunta 19



In [44]:
# Sum of seats per departure year
(
    df_transformation
    .groupBy('dep_year')
    .agg(F.sum('seats').alias('Passegeiros acumulados por ano'))
    .show()
)

+--------+------------------------------+
|dep_year|Passegeiros acumulados por ano|
+--------+------------------------------+
|    2014|                       1509544|
+--------+------------------------------+



## Pergunta 20



In [45]:
# Destination with the most rows: count per dest, highest count first, top row only
(
    df_transformation
    .groupBy('dest')
    .count()
    .sort(F.col('count').desc())
    .show(1)
)

+----+-----+
|dest|count|
+----+-----+
| SFO|  787|
+----+-----+
only showing top 1 row



## Pergunta 21



In [46]:
# Destination with the largest sum of seats: highest sum first, top row only
(
    df_transformation
    .groupBy('dest')
    .agg(F.sum('seats'))
    .sort(F.col('sum(seats)').desc())
    .show(1)
)

+----+----------+
|dest|sum(seats)|
+----+----------+
| SFO|    119635|
+----+----------+
only showing top 1 row



## Pergunta 22



In [47]:
# Question 22: among rows whose `origin` is 'PDX', print the one with the
# greatest `distance`.
(df_transformation
    .select('origin', 'dest', 'distance')
    .where(F.col('origin') == 'PDX')
    .sort(F.desc('distance'))
    .show(1))


+------+----+--------+
|origin|dest|distance|
+------+----+--------+
|   PDX| LIH|    2631|
+------+----+--------+
only showing top 1 row



## Pergunta 23 



In [48]:
# Question 23: derive a two-digit month string ("MM") from `dep_datetime`.
# The column is stored back on `df_transformation`, so re-running this
# cell simply overwrites `dep_month` with the same values.
df_transformation = df_transformation.withColumn(
    "dep_month",
    F.date_format("dep_datetime", "MM"),
)

# Count rows per (dest, dep_month) pair and print the pair with the
# highest count.
(df_transformation
    .groupBy('dest', 'dep_month')
    .count()
    .sort(F.desc('count'))
    .show(1))

+----+---------+-----+
|dest|dep_month|count|
+----+---------+-----+
| LAX|       05|   77|
+----+---------+-----+
only showing top 1 row



## Pergunta 24



In [49]:
# Question 24: print the aircraft `model` that appears in the most rows.
# Rows with a null `model` are dropped before grouping: groupBy keeps
# nulls as their own group, and that group must never be reported as the
# "most used model". The model/dest ranking in the next question applies
# the same isNotNull guard.
(df_transformation.filter(F.col('model').isNotNull())
                  .groupBy('model')
                  .count()
                  .orderBy(F.col('count').desc())
                  .show(1))

+-------+-----+
|  model|count|
+-------+-----+
|737-890| 1463|
+-------+-----+
only showing top 1 row



## Pergunta 25



In [50]:
# Question 25: rank every (model, dest) pair by number of rows, ignoring
# rows whose `model` is null. Up to 1000 pairs are printed, highest
# count first.
(df_transformation
    .where(F.col('model').isNotNull())
    .groupBy('model', 'dest')
    .count()
    .sort(F.desc('count'))
    .show(1000))

+-------------+----+-----+
|        model|dest|count|
+-------------+----+-----+
|      737-7H4| OAK|  141|
|      737-890| ANC|  138|
|      737-790| SNA|  122|
|      737-7H4| SMF|  114|
|      737-890| LAX|  110|
|      737-890| LAS|  108|
|      757-232| SLC|  104|
|      737-890| SAN|  102|
|     A320-214| DEN|  102|
|     A320-232| LGB|  102|
|      737-7H4| LAS|   97|
|      737-7H4| DEN|   93|
|     A320-232| PHX|   91|
|      737-7H4| SJC|   89|
|      737-490| SFO|   87|
|    EMB-120ER| PDX|   87|
|     A319-131| SFO|   86|
|     A320-232| SFO|   85|
|      737-7H4| PHX|   80|
|    737-932ER| ATL|   80|
|      737-890| SMF|   79|
|    737-990ER| ANC|   78|
|      737-3H4| OAK|   76|
|     A320-214| SFO|   75|
|    737-990ER| LAX|   73|
|  CL-600-2B19| SFO|   72|
|  CL-600-2C10| BUR|   70|
|      737-890| SJC|   70|
|      737-890| DCA|   69|
|      737-890| SNA|   68|
|      737-890| OGG|   67|
|     A320-232| IAH|   67|
|    EMB-120ER| SEA|   67|
|  CL-600-2D24| LAS|   67|
|

## Pergunta 26



In [51]:
# Question 26: for each `haul_duration`, take the mean of `engines` and
# round it up to the next integer with ceil before printing.
(df_transformation
    .groupBy('haul_duration')
    .agg(F.ceil(F.avg('engines')).alias("Número médio de motores"))
    .show())


+-------------+-----------------------+
|haul_duration|Número médio de motores|
+-------------+-----------------------+
|    LONG-HAUL|                      2|
|  MEDIUM-HAUL|                      2|
|   SHORT-HAUL|                      2|
+-------------+-----------------------+



## Pergunta 27



In [52]:
# Question 27: count rows per `dep_season`, highest count first
# (prints at most 200 rows).
(df_transformation
    .groupBy('dep_season')
    .count()
    .sort(F.desc('count'))
    .show(200))

+----------+-----+
|dep_season|count|
+----------+-----+
|    SUMMER| 2918|
|      FALL| 2639|
|    SPRING| 2560|
|    WINTER| 1883|
+----------+-----+



## Pergunta 28



In [53]:
# Question 28: count rows per (dest, dep_season) pair and print the pair
# with the highest count.
(df_transformation
    .groupBy('dest', 'dep_season')
    .count()
    .sort(F.desc('count'))
    .show(1))

+----+----------+-----+
|dest|dep_season|count|
+----+----------+-----+
| SFO|      FALL|  228|
+----+----------+-----+
only showing top 1 row



## Pergunta 29



In [54]:
# Question 29: discard rows whose `dep_delay_category` is 'ANTECIPATED'
# or 'INTIME', then print the remaining category with the most rows.
# Rows with a null category are dropped by the predicate as well, since
# the comparison evaluates to null for them.
(df_transformation
    .where(~F.col('dep_delay_category').isin('ANTECIPATED', 'INTIME'))
    .groupBy('dep_delay_category')
    .count()
    .sort(F.desc('count'))
    .show(1))


+------------------+-----+
|dep_delay_category|count|
+------------------+-----+
|             MINOR| 3065|
+------------------+-----+
only showing top 1 row



## Pergunta 30



In [55]:
# Question 30: discard rows whose `dep_delay_category` is 'ANTECIPATED'
# or 'INTIME', then rank every (origin, dest, dep_delay_category)
# combination by number of rows. Up to 200 combinations are printed,
# highest count first.
(df_transformation
    .where(~F.col('dep_delay_category').isin('ANTECIPATED', 'INTIME'))
    .groupBy('origin', 'dest', 'dep_delay_category')
    .count()
    .sort(F.desc('count'))
    .show(200))

+------+----+------------------+-----+
|origin|dest|dep_delay_category|count|
+------+----+------------------+-----+
|   SEA| SFO|             MINOR|  159|
|   SEA| ANC|             MINOR|  145|
|   SEA| DEN|             MINOR|  145|
|   SEA| LAX|             MINOR|  118|
|   SEA| LAS|             MINOR|  114|
|   SEA| OAK|             MINOR|   97|
|   SEA| PHX|             MINOR|   97|
|   SEA| DFW|             MINOR|   90|
|   SEA| ORD|             MINOR|   88|
|   PDX| SFO|             MINOR|   80|
|   PDX| DEN|             MINOR|   77|
|   SEA| SLC|             MINOR|   72|
|   SEA| IAH|             MINOR|   68|
|   PDX| OAK|             MINOR|   64|
|   SEA| SJC|             MINOR|   63|
|   SEA| ATL|             MINOR|   62|
|   PDX| PHX|             MINOR|   61|
|   SEA| SMF|             MINOR|   60|
|   SEA| MSP|             MINOR|   55|
|   PDX| ORD|             MINOR|   54|
|   PDX| LAS|             MINOR|   52|
|   PDX| SJC|             MINOR|   52|
|   SEA| SAN|            