## Flights Null Treatment

Notebook for exploring the null values in the flights data and applying the null-value treatment, one carrier file at a time.

---
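- **INPUT:** raw per-carrier flights data (`carrier_<CODE>.csv`)
- **OUTPUT:** the same flights data with null values treated (`carrier_<CODE>_nullsremoved.csv`)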

---

In [78]:
import pandas as pd
import math

In [126]:
# Load the raw flights file for carrier WN; the cells below treat its nulls step by step.
WN = pd.read_csv("carrier_WN.csv")

In [9]:
# Show the column names of the loaded frame.
WN.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime'],
      dtype='object')

In [127]:
# Missing tail numbers get the placeholder "EMPTY".
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the attribute-accessed Series: under pandas Copy-on-Write that chained
# in-place call only updates a temporary object and leaves WN unchanged.
WN["tail_num"] = WN["tail_num"].fillna("EMPTY")

In [128]:
# Missing departure delays become 0.
# Assign the result back: a chained fillna(inplace=True) on WN.dep_delay does
# not modify WN when pandas Copy-on-Write is active.
WN["dep_delay"] = WN["dep_delay"].fillna(0)

In [129]:
# Missing taxi-out values become 0.
# Assign the result back: a chained fillna(inplace=True) on WN.taxi_out does
# not modify WN when pandas Copy-on-Write is active.
WN["taxi_out"] = WN["taxi_out"].fillna(0)

In [130]:
# Missing wheels-off values become 0.
# Assign the result back: a chained fillna(inplace=True) on WN.wheels_off does
# not modify WN when pandas Copy-on-Write is active.
WN["wheels_off"] = WN["wheels_off"].fillna(0)

In [81]:
def check_collection_drop(col1, col2):
    """Return 0 for a NaN value whose flag equals 1; otherwise return the value.

    Parameters
    ----------
    col1 : number
        The value to inspect (must be accepted by ``math.isnan``).
    col2 : number
        The flag compared against 1. In this notebook it is always the row's
        ``cancelled`` value.

    Returns
    -------
    ``0`` when ``col1`` is NaN and ``col2 == 1``; ``col1`` unchanged in every
    other case, so a NaN with a different flag stays NaN and the caller can
    filter that row out afterwards.
    """
    # `and` short-circuits, so the flag is only compared when col1 is NaN.
    return 0 if math.isnan(col1) and col2 == 1 else col1

In [131]:
# wheels_on: rows with cancelled == 1 get 0 in place of a missing value
# (check_collection_drop); rows still missing wheels_on are then dropped.
WN["wheels_on"] = WN.apply(
    lambda row: check_collection_drop(row["wheels_on"], row["cancelled"]),
    axis=1,
)
WN = WN.loc[WN["wheels_on"].notna()]

In [132]:
# taxi_in: rows with cancelled == 1 get 0 in place of a missing value
# (check_collection_drop); rows still missing taxi_in are then dropped.
WN["taxi_in"] = WN.apply(
    lambda row: check_collection_drop(row["taxi_in"], row["cancelled"]),
    axis=1,
)
WN = WN.loc[WN["taxi_in"].notna()]

In [133]:
# arr_time: rows with cancelled == 1 get 0 in place of a missing value
# (check_collection_drop); rows still missing arr_time are then dropped.
WN["arr_time"] = WN.apply(
    lambda row: check_collection_drop(row["arr_time"], row["cancelled"]),
    axis=1,
)
WN = WN.loc[WN["arr_time"].notna()]

In [134]:
# arr_delay: rows with cancelled == 1 get 0 in place of a missing value
# (check_collection_drop); rows still missing arr_delay are then dropped.
WN["arr_delay"] = WN.apply(
    lambda row: check_collection_drop(row["arr_delay"], row["cancelled"]),
    axis=1,
)
WN = WN.loc[WN["arr_delay"].notna()]

In [135]:
# Rows without a cancellation code get 0.
# Assign the result back: a chained fillna(inplace=True) on
# WN.cancellation_code does not modify WN when pandas Copy-on-Write is active.
WN["cancellation_code"] = WN["cancellation_code"].fillna(0)

In [136]:
# actual_elapsed_time: rows with cancelled == 1 get 0 in place of a missing
# value (check_collection_drop); rows still missing it are then dropped.
WN["actual_elapsed_time"] = WN.apply(
    lambda row: check_collection_drop(row["actual_elapsed_time"], row["cancelled"]),
    axis=1,
)
WN = WN.loc[WN["actual_elapsed_time"].notna()]

In [137]:
# air_time: rows with cancelled == 1 get 0 in place of a missing value
# (check_collection_drop); rows still missing air_time are then dropped.
WN["air_time"] = WN.apply(
    lambda row: check_collection_drop(row["air_time"], row["cancelled"]),
    axis=1,
)
WN = WN.loc[WN["air_time"].notna()]

In [138]:
def check_arr_delay(col1, col2):
    """Return 0 for a NaN value whose companion value is below 15; else the value.

    Parameters
    ----------
    col1 : number
        The value to inspect (must be accepted by ``math.isnan``). In this
        notebook it is one of the ``*_delay`` cause columns.
    col2 : number
        The value compared against 15. In this notebook it is the row's
        ``arr_delay``.
        NOTE(review): 15 is presumably the on-time threshold in minutes - confirm.

    Returns
    -------
    ``0`` when ``col1`` is NaN and ``col2 < 15``; ``col1`` unchanged otherwise
    (a NaN ``col2`` never satisfies the comparison, so ``col1`` stays NaN).
    """
    # Anything that is already a real number passes straight through.
    if not math.isnan(col1):
        return col1
    if col2 < 15:
        return 0
    return col1

In [139]:
# Shape before the carrier_delay treatment in the next cell (to see how many rows it drops).
WN.shape

(2708601, 41)

In [140]:
# carrier_delay: a missing value becomes 0 when arr_delay < 15 (check_arr_delay);
# rows still missing carrier_delay are then dropped.
WN["carrier_delay"] = WN.apply(
    lambda row: check_arr_delay(row["carrier_delay"], row["arr_delay"]),
    axis=1,
)
WN = WN.loc[WN["carrier_delay"].notna()]

In [141]:
# Shape after the carrier_delay treatment; compare with the shape shown before it.
WN.shape

(2708601, 41)

In [142]:
# Remaining delay-cause columns: a missing value becomes 0 when arr_delay < 15
# (check_arr_delay). No rows are filtered out in this cell.
WN["weather_delay"] = WN.apply(
    lambda row: check_arr_delay(row["weather_delay"], row["arr_delay"]),
    axis=1,
)
WN["nas_delay"] = WN.apply(
    lambda row: check_arr_delay(row["nas_delay"], row["arr_delay"]),
    axis=1,
)
WN["security_delay"] = WN.apply(
    lambda row: check_arr_delay(row["security_delay"], row["arr_delay"]),
    axis=1,
)
WN["late_aircraft_delay"] = WN.apply(
    lambda row: check_arr_delay(row["late_aircraft_delay"], row["arr_delay"]),
    axis=1,
)

In [144]:
def replace_with_column(col1, col2):
    """Return ``col2`` when ``col1`` is NaN, otherwise ``col1``.

    Parameters
    ----------
    col1 : number
        The preferred value (must be accepted by ``math.isnan``).
    col2 : any
        The fallback used only when ``col1`` is NaN; it is returned as-is,
        even if it is itself NaN.
    """
    return col2 if math.isnan(col1) else col1

In [145]:
# A missing dep_time falls back to the row's crs_dep_time.
WN["dep_time"] = WN.apply(
    lambda row: replace_with_column(row["dep_time"], row["crs_dep_time"]),
    axis=1,
)

In [146]:
# A missing first_dep_time falls back to the row's dep_time.
WN["first_dep_time"] = WN.apply(
    lambda row: replace_with_column(row["first_dep_time"], row["dep_time"]),
    axis=1,
)

In [147]:
# Missing additional-ground-time values become 0.
# Assign the results back: chained fillna(inplace=True) calls on
# attribute-accessed columns do not modify WN when pandas Copy-on-Write is active.
WN["total_add_gtime"] = WN["total_add_gtime"].fillna(0)
WN["longest_add_gtime"] = WN["longest_add_gtime"].fillna(0)

In [160]:
# Write the treated WN frame to disk; index=False keeps the row index out of the CSV.
WN.to_csv("carrier_WN_nullsremoved.csv", index=False)

## Repeat for Remaining Carriers

In [161]:
def remove_nulls (WN):
    """Apply the notebook's null-value treatment to one carrier's flights frame.

    The input frame is left untouched; all work happens on a copy. The whole
    treatment is vectorised instead of row-wise ``apply`` calls, and rows are
    dropped once at the end. Every fill rule only looks at values in the same
    row, so this yields the same rows and values as filling and filtering one
    column at a time.

    Steps
    -----
    1. Constant fills: ``tail_num`` -> ``"EMPTY"``; ``dep_delay``, ``taxi_out``,
       ``wheels_off``, ``cancellation_code``, ``total_add_gtime`` and
       ``longest_add_gtime`` -> ``0``.
    2. ``wheels_on``, ``taxi_in``, ``arr_time``, ``arr_delay``,
       ``actual_elapsed_time``, ``air_time``: a missing value becomes ``0``
       when ``cancelled == 1``.
    3. ``carrier_delay``, ``weather_delay``, ``nas_delay``, ``security_delay``,
       ``late_aircraft_delay``: a missing value becomes ``0`` when the
       (already treated) ``arr_delay`` is below 15.
    4. Missing ``dep_time`` falls back to ``crs_dep_time``; missing
       ``first_dep_time`` falls back to the treated ``dep_time``.
    5. Rows still missing any column from step 2, or ``carrier_delay``, are
       dropped. The other delay-cause columns may keep missing values.

    Parameters
    ----------
    WN : pandas.DataFrame
        Flights data for a single carrier containing the columns named above
        plus ``cancelled`` and ``crs_dep_time``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the treatment applied. Surviving rows keep their
        original index labels.
    """
    cleaned = WN.copy()

    # Step 1: plain constant fills, assigned back column by column so they
    # also take effect under pandas Copy-on-Write.
    constant_fills = {
        "tail_num": "EMPTY",
        "dep_delay": 0,
        "taxi_out": 0,
        "wheels_off": 0,
        "cancellation_code": 0,
        "total_add_gtime": 0,
        "longest_add_gtime": 0,
    }
    for column, fill_value in constant_fills.items():
        cleaned[column] = cleaned[column].fillna(fill_value)

    # Step 2: zero-fill the arrival-side measurements of cancelled flights.
    arrival_columns = [
        "wheels_on",
        "taxi_in",
        "arr_time",
        "arr_delay",
        "actual_elapsed_time",
        "air_time",
    ]
    was_cancelled = cleaned["cancelled"] == 1
    for column in arrival_columns:
        cleaned[column] = cleaned[column].mask(
            cleaned[column].isna() & was_cancelled, 0
        )

    # Step 3: zero-fill delay causes when the arrival delay is below 15.
    # Computed after step 2 so cancelled flights (arr_delay now 0) qualify.
    delay_cause_columns = [
        "carrier_delay",
        "weather_delay",
        "nas_delay",
        "security_delay",
        "late_aircraft_delay",
    ]
    small_arr_delay = cleaned["arr_delay"] < 15
    for column in delay_cause_columns:
        cleaned[column] = cleaned[column].mask(
            cleaned[column].isna() & small_arr_delay, 0
        )

    # Step 4: departure-time fallbacks; order matters because first_dep_time
    # falls back to the already-treated dep_time.
    cleaned["dep_time"] = cleaned["dep_time"].fillna(cleaned["crs_dep_time"])
    cleaned["first_dep_time"] = cleaned["first_dep_time"].fillna(cleaned["dep_time"])

    # Step 5: drop the rows the fills above could not repair.
    return cleaned.dropna(subset=arrival_columns + ["carrier_delay"])

In [162]:
# Load the AA carrier file, apply the null treatment, and report the shape
# before and after so the number of dropped rows is visible.
AA = pd.read_csv("carrier_AA.csv")
print(AA.shape)
AA = remove_nulls(AA)
print(AA.shape)
# Save the cleaned AA frame itself; writing WN here would put WN's rows into
# the AA output file.
AA.to_csv("carrier_AA_nullsremoved.csv", index=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(4138991, 41)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


(4128169, 41)


In [163]:
# Write the cleaned AA frame, overwriting any existing carrier_AA_nullsremoved.csv.
AA.to_csv("carrier_AA_nullsremoved.csv", index=False)

In [164]:
# Carrier AS: read the raw file, apply the null treatment, write the result.
AS = remove_nulls(pd.read_csv("carrier_AS.csv"))
AS.to_csv("carrier_AS_nullsremoved.csv", index=False)

In [165]:
# Carrier B6: read the raw file, apply the null treatment, write the result.
B6 = remove_nulls(pd.read_csv("carrier_B6.csv"))
B6.to_csv("carrier_B6_nullsremoved.csv", index=False)

In [166]:
# Carrier DL: read the raw file, apply the null treatment, write the result.
# NOTE(review): the input name is lower-case ("carrier_dl.csv") unlike the other
# carriers; confirm it matches the file on disk (matters on case-sensitive filesystems).
DL = remove_nulls(pd.read_csv("carrier_dl.csv"))
DL.to_csv("carrier_DL_nullsremoved.csv", index=False)

In [167]:
# Carrier F9: read the raw file, apply the null treatment, write the result.
F9 = remove_nulls(pd.read_csv("carrier_F9.csv"))
F9.to_csv("carrier_F9_nullsremoved.csv", index=False)

In [168]:
# Carrier G4: read the raw file, apply the null treatment, write the result.
G4 = remove_nulls(pd.read_csv("carrier_G4.csv"))
G4.to_csv("carrier_G4_nullsremoved.csv", index=False)

In [169]:
# Carrier HA: read the raw file, apply the null treatment, write the result.
HA = remove_nulls(pd.read_csv("carrier_HA.csv"))
HA.to_csv("carrier_HA_nullsremoved.csv", index=False)

In [170]:
# Carrier NK: read the raw file, apply the null treatment, write the result.
NK = remove_nulls(pd.read_csv("carrier_NK.csv"))
NK.to_csv("carrier_NK_nullsremoved.csv", index=False)

In [171]:
# Carrier UA: read the raw file, apply the null treatment, write the result.
UA = remove_nulls(pd.read_csv("carrier_UA.csv"))
UA.to_csv("carrier_UA_nullsremoved.csv", index=False)

In [172]:
# Carrier VX: read the raw file, apply the null treatment, write the result.
VX = remove_nulls(pd.read_csv("carrier_VX.csv"))
VX.to_csv("carrier_VX_nullsremoved.csv", index=False)