# Load Dataset

In [26]:
import pandas as pd
import os


curr_path = os.getcwd()
data_dir = os.path.join(curr_path, "Data")
file_name = 'flights_with_delay_data.csv'
file_path = os.path.join(data_dir, file_name)

df = pd.read_csv(file_path)
df

  df = pd.read_csv(file_path)


Unnamed: 0,TotalDelayDuration,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,...,Month,NASDelay,Origin,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,Year
0,9.0,154.0,122.0,90.0,1850.0,1720,1455,145.0,,0,...,6,23.0,ORD,0.0,N293AA,8.0,24.0,AA,0.0,2006
1,-1.0,293.0,272.0,-1.0,1543.0,1544,750,294.0,,0,...,5,0.0,LAX,0.0,N788UA,8.0,13.0,UA,0.0,2007
2,1.0,121.0,101.0,-6.0,809.0,815,715,120.0,,0,...,3,0.0,HOU,0.0,N725SW,14.0,6.0,WN,0.0,2007
3,6.0,162.0,142.0,5.0,1001.0,956,820,156.0,,0,...,10,0.0,MEM,0.0,N970SW,9.0,11.0,OO,0.0,2006
4,-21.0,113.0,92.0,-25.0,1534.0,1559,1345,134.0,,0,...,8,0.0,ATL,0.0,N17620,11.0,10.0,CO,0.0,2004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2690083,7.0,127.0,111.0,14.0,1624.0,1610,1310,120.0,,0,...,12,0.0,ORD,0.0,N611AE,3.0,13.0,MQ,0.0,2006
2690084,12.0,228.0,205.0,20.0,1651.0,1631,955,216.0,,0,...,10,12.0,PHX,0.0,N803AW,10.0,13.0,US,0.0,2007
2690085,-19.0,183.0,164.0,-23.0,1604.0,1627,1405,202.0,,0,...,4,0.0,BWI,0.0,N26215,9.0,10.0,CO,0.0,2004
2690086,1.0,124.0,98.0,-6.0,1042.0,1048,745,123.0,,0,...,8,0.0,RNO,0.0,N492UA,16.0,10.0,UA,0.0,2006


# Data Preprocessing

### Delay Times

In [27]:
len(df.loc[df['ArrTime'].isna(), ['ArrTime', 'DepTime']])

5397

In [28]:
5397 / len(df)

0.002006254070498809

All NA rows consist of NA values in `ActualElapsedTime`, `AirTime`, `ArrDelay`, `ArrTime`

In [29]:
df.dropna(subset=['ActualElapsedTime', 'AirTime', 'ArrDelay', 'ArrTime'], inplace=True)
# test_df = df.dropna(subset=['ActualElapsedTime', 'AirTime', 'ArrDelay', 'ArrTime'])

In [30]:
df.isna().sum()

TotalDelayDuration          0
ActualElapsedTime           0
AirTime                     0
ArrDelay                    0
ArrTime                     0
CRSArrTime                  0
CRSDepTime                  0
CRSElapsedTime              0
CancellationCode      2684691
Cancelled                   0
CarrierDelay                0
DayOfWeek                   0
DayofMonth                  0
DepDelay                    0
DepTime                     0
Dest                        0
Distance                    0
Diverted                    0
FlightNum                   0
LateAircraftDelay           0
Month                       0
NASDelay                    0
Origin                      0
SecurityDelay               0
TailNum                     0
TaxiIn                      0
TaxiOut                     0
UniqueCarrier               0
WeatherDelay                0
Year                        0
dtype: int64

## Process Time Data

Add the following new columns to capture arr and dep times
- DayOfMonthArr
- DayOfMonthDep
- CRSDayOfMonthArr
- CRSDayOfMonthDep

</br>

The following would the the conditions / changes to be made
1. ARR and DEP Timings that are more than 2400, minus 24 and add 1 to arrival / dep date
2. CRSArrTime less than equal to 2359 and CRSDepTime < 1000 --> Add 1 day to CRS day of month Arr
3. Create datetime column

In [31]:
df.rename(columns={'DayofMonth':'DayOfMonthDep', 'Month':'MonthDep', 'Year':'YearDep'}, inplace=True)

In [32]:
df["DayOfMonthArr"] = df["DayOfMonthDep"]
df["MonthArr"] = df["MonthDep"]
df['YearArr'] = df['YearDep']

df['CRSDayOfMonthArr'] = df["DayOfMonthDep"]
df["CRSDayOfMonthDep"] = df["DayOfMonthDep"]
df["CRSMonthArr"] = df["MonthDep"]
df["CRSMonthDep"] = df["MonthDep"]
df['CRSYearArr'] = df['YearDep']
df['CRSYearDep'] = df['YearDep']

In [33]:
df_format = df[['DepTime', 'ArrTime', 'CRSDepTime', 'CRSArrTime', 'DayOfMonthDep', 'MonthDep', 'YearDep', 'DayOfMonthArr', 'MonthArr', 'YearArr',
       'CRSDayOfMonthArr', 'CRSMonthArr', 'CRSYearArr', 'CRSDayOfMonthDep', 'CRSMonthDep', 'CRSYearDep']]

df_format

Unnamed: 0,DepTime,ArrTime,CRSDepTime,CRSArrTime,DayOfMonthDep,MonthDep,YearDep,DayOfMonthArr,MonthArr,YearArr,CRSDayOfMonthArr,CRSMonthArr,CRSYearArr,CRSDayOfMonthDep,CRSMonthDep,CRSYearDep
0,1616.0,1850.0,1455,1720,21,6,2006,21,6,2006,21,6,2006,21,6,2006
1,750.0,1543.0,750,1544,8,5,2007,8,5,2007,8,5,2007,8,5,2007
2,708.0,809.0,715,815,16,3,2007,16,3,2007,16,3,2007,16,3,2007
3,819.0,1001.0,820,956,22,10,2006,22,10,2006,22,10,2006,22,10,2006
4,1341.0,1534.0,1345,1559,15,8,2004,15,8,2004,15,8,2004,15,8,2004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2690083,1317.0,1624.0,1310,1610,12,12,2006,12,12,2006,12,12,2006,12,12,2006
2690084,1003.0,1651.0,955,1631,24,10,2007,24,10,2007,24,10,2007,24,10,2007
2690085,1401.0,1604.0,1405,1627,2,4,2004,2,4,2004,2,4,2004,2,4,2004
2690086,738.0,1042.0,745,1048,30,8,2006,30,8,2006,30,8,2006,30,8,2006


In [34]:
df_format.loc[df_format["ArrTime"] >2400]

Unnamed: 0,DepTime,ArrTime,CRSDepTime,CRSArrTime,DayOfMonthDep,MonthDep,YearDep,DayOfMonthArr,MonthArr,YearArr,CRSDayOfMonthArr,CRSMonthArr,CRSYearArr,CRSDayOfMonthDep,CRSMonthDep,CRSYearDep
652,2348.0,2415.0,2150,2220,27,9,2004,27,9,2004,27,9,2004,27,9,2004
4926,2149.0,2401.0,2139,2335,2,4,2005,2,4,2005,2,4,2005,2,4,2005
10683,2350.0,2410.0,2150,2220,21,11,2005,21,11,2005,21,11,2005,21,11,2005
13148,2200.0,2414.0,1925,2133,16,2,2006,16,2,2006,16,2,2006,16,2,2006
14079,2330.0,2417.0,2340,38,13,9,2003,13,9,2003,13,9,2003,13,9,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683533,2240.0,2409.0,2240,2317,5,2,2004,5,2,2004,5,2,2004,5,2,2004
2683793,2345.0,2454.0,2345,115,20,12,2003,20,12,2003,20,12,2003,20,12,2003
2686809,2234.0,2403.0,2234,2359,15,5,2005,15,5,2005,15,5,2005,15,5,2005
2689090,2336.0,2445.0,2255,16,19,3,2005,19,3,2005,19,3,2005,19,3,2005


In [35]:
df_format.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2684691 entries, 0 to 2690087
Data columns (total 16 columns):
 #   Column            Dtype  
---  ------            -----  
 0   DepTime           float64
 1   ArrTime           float64
 2   CRSDepTime        int64  
 3   CRSArrTime        int64  
 4   DayOfMonthDep     int64  
 5   MonthDep          int64  
 6   YearDep           int64  
 7   DayOfMonthArr     int64  
 8   MonthArr          int64  
 9   YearArr           int64  
 10  CRSDayOfMonthArr  int64  
 11  CRSMonthArr       int64  
 12  CRSYearArr        int64  
 13  CRSDayOfMonthDep  int64  
 14  CRSMonthDep       int64  
 15  CRSYearDep        int64  
dtypes: float64(2), int64(14)
memory usage: 348.2 MB


In [36]:
df_format[['DepTime', 'ArrTime', 'CRSDepTime', 'CRSArrTime']] = df_format[['DepTime', 'ArrTime', 'CRSDepTime', 'CRSArrTime']].astype(int).astype(str)
df_format['DepTime'] = df_format['DepTime'].str.zfill(4)
df_format['ArrTime'] = df_format['ArrTime'].str.zfill(4)
df_format['CRSDepTime'] = df_format['CRSDepTime'].str.zfill(4)
df_format['CRSArrTime'] = df_format['CRSArrTime'].str.zfill(4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_format[['DepTime', 'ArrTime', 'CRSDepTime', 'CRSArrTime']] = df_format[['DepTime', 'ArrTime', 'CRSDepTime', 'CRSArrTime']].astype(int).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_format['DepTime'] = df_format['DepTime'].str.zfill(4)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [37]:
df.loc[df['CRSArrTime'] == df['CRSDepTime']]

Unnamed: 0,TotalDelayDuration,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,...,YearDep,DayOfMonthArr,MonthArr,YearArr,CRSDayOfMonthArr,CRSDayOfMonthDep,CRSMonthArr,CRSMonthDep,CRSYearArr,CRSYearDep
71,-3.0,57.0,48.0,0.0,800.0,800,800,60.0,,0,...,2007,10,2,2007,10,10,2,2,2007,2007
397,-3.0,57.0,41.0,-4.0,716.0,720,720,60.0,,0,...,2005,9,11,2005,9,9,11,11,2005,2005
2943,0.0,60.0,45.0,25.0,1525.0,1500,1500,60.0,,0,...,2006,2,1,2006,2,2,1,1,2006,2006
3000,-10.0,50.0,42.0,-10.0,1505.0,1515,1515,60.0,,0,...,2006,1,6,2006,1,1,6,6,2006,2006
3104,8.0,68.0,55.0,8.0,1028.0,1020,1020,60.0,,0,...,2004,3,9,2004,3,3,9,9,2004,2004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2689022,26.0,86.0,50.0,22.0,1942.0,1920,1920,60.0,,0,...,2005,6,4,2005,6,6,4,4,2005,2005
2689501,-4.0,56.0,48.0,-4.0,756.0,800,800,60.0,,0,...,2003,3,6,2003,3,3,6,6,2003,2003
2689614,18.0,78.0,44.0,28.0,1733.0,1705,1705,60.0,,0,...,2004,14,10,2004,14,14,10,10,2004,2004
2689624,-4.0,56.0,46.0,-9.0,706.0,715,715,60.0,,0,...,2007,7,5,2007,7,7,5,5,2007,2007


In [38]:
df.drop(df[df['CRSArrTime'] == df['CRSDepTime']].index, inplace=True)

In [39]:
df_format = df_format.drop(df_format[df_format['CRSArrTime'] == df_format['CRSDepTime']].index)

### Begin Cleaning! 

In [40]:
import time
import numpy as np

start_time = time.time()

#### Dep time

In [41]:
def adjust_dep_time(DepTime, DayOfMonthDep, MonthDep, YearDep):    
    month_day_dict = {
        1 : 31,
        2 : 28,
        3 : 31,
        4 : 30,
        5 : 31,
        6 : 30,
        7 : 31,
        8 : 31,
        9 : 30,
        10 : 31,
        11 : 30,
        12 : 31
    }            
            
    # Actual Dep
    if int(DepTime) >= 2400:
        DepTime = str(int(DepTime) - 2400)
        # If day of dep is the last day of the month
        if DayOfMonthDep == month_day_dict[MonthDep]:
            DayOfMonthDep = 1
            
            # If day of dep is on last day of the year
            if MonthDep == 12:
                MonthDep = 1
                YearDep += 1
            else:
                MonthDep += 1
        else:
            DayOfMonthDep += 1
    
    return f"{DayOfMonthDep}-{MonthDep}-{YearDep} {DepTime.zfill(4)}"

v_adjust_time = np.vectorize(adjust_dep_time, otypes=[object])

df_format['DepDateTime'] = v_adjust_time(df_format['DepTime'], df_format['DayOfMonthDep'], df_format['MonthDep'], df_format['YearDep'])

#### Arr time

In [42]:
def add_extra_time(ArrTime, DepTime):
    ArrTime1=ArrTime
    if int(ArrTime) < int(DepTime):
        ArrTime = str(int(ArrTime) + 2400)
    return ArrTime

v_adjust_time = np.vectorize(add_extra_time, otypes=[object])

df_format['ArrTimeAdded'] = v_adjust_time(df_format['ArrTime'], df_format['DepTime'])

In [43]:
def adjust_arr_time(ArrTime, DepTime, DayOfMonthArr, MonthArr, YearArr):    
    month_day_dict = {
        1 : 31,
        2 : 28,
        3 : 31,
        4 : 30,
        5 : 31,
        6 : 30,
        7 : 31,
        8 : 31,
        9 : 30,
        10 : 31,
        11 : 30,
        12 : 31
    }
        
    # Account for 2400 and above times
    if int(ArrTime) >= 2400:
        ArrTime = str(int(ArrTime) - 2400)
        
        if (int(ArrTime) >= 2400):
            return f"ERROR {ArrTime}"
        
        # If day of dep is the last day of the month
        if DayOfMonthArr == month_day_dict[MonthArr]:
            DayOfMonthArr = 1
            
            # If day of dep is on last day of the year
            if MonthArr == 12:
                MonthArr = 1
                YearArr += 1
            else:
                MonthArr += 1
        else:
            DayOfMonthArr += 1
    
    
    return f"{DayOfMonthArr}-{MonthArr}-{YearArr} {ArrTime.zfill(4)}"

v_adjust_time = np.vectorize(adjust_arr_time, otypes=[object])

df_format['ArrDateTime'] = v_adjust_time(df_format['ArrTimeAdded'], df_format['DepTime'], df_format['DayOfMonthArr'], df_format['MonthArr'], df_format['YearArr'])

#### CRS dep

In [44]:
def adjust_CRS_dep_time(CRSDepTime, CRSDayOfMonthDep, CRSMonthDep, CRSYearDep):    
    
    return f"{CRSDayOfMonthDep}-{CRSMonthDep}-{CRSYearDep} {CRSDepTime.zfill(4)}"

v_adjust_time = np.vectorize(adjust_CRS_dep_time, otypes=[object])

df_format['CRSDepDateTime'] = v_adjust_time(df_format['CRSDepTime'], df_format['CRSDayOfMonthDep'], df_format['CRSMonthDep'], df_format['CRSYearDep'])

#### CRS arr

In [45]:
def adjust_CRS_arr_time(CRSDepTime, CRSArrTime, CRSMonthDep, CRSDayOfMonthArr, CRSMonthArr, CRSYearArr):    
    month_day_dict = {
        1 : 31,
        2 : 28,
        3 : 31,
        4 : 30,
        5 : 31,
        6 : 30,
        7 : 31,
        8 : 31,
        9 : 30,
        10 : 31,
        11 : 30,
        12 : 31
    }

    # Change CRS
    # If CRS dep bfr 12am and land bfr 2359 the next day -> Adjust the CRS date
#     2359 -> 519
#     -2400
#     -41 -> -1881
    
#     1830 -> 2340
#     -570 -> -60
    if int(CRSDepTime) > int(CRSArrTime):
        
        # Check if day is invalid for given month (EG 32 aug etc)
        if CRSDayOfMonthArr >= month_day_dict[CRSMonthDep]: 
            CRSDayOfMonthArr = 1
            
            if CRSMonthArr == 12:
                CRSMonthArr = 1
                CRSYearArr += 1
                
            else:
                CRSMonthArr += 1
        else:
            CRSDayOfMonthArr += 1
    
    return f"{CRSDayOfMonthArr}-{CRSMonthArr}-{CRSYearArr} {CRSArrTime.zfill(4)}"

v_adjust_time = np.vectorize(adjust_CRS_arr_time, otypes=[object])

df_format['CRSArrDateTime'] = v_adjust_time(df_format['CRSDepTime'], df_format['CRSArrTime'], df_format['CRSMonthDep'], df_format['CRSDayOfMonthArr'], df_format['CRSMonthArr'], df_format['CRSYearArr'])

In [46]:
print(f"TOTAL TIME TAKEN: {(time.time() - start_time)}")

TOTAL TIME TAKEN: 18.455135345458984


In [58]:
df_to_add = df_format[['DepDateTime', 'ArrDateTime', 'CRSArrDateTime', 'CRSDepDateTime']]

df_cleaned = pd.concat([df, df_to_add], axis=1)
df_cleaned

Unnamed: 0,TotalDelayDuration,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,...,CRSDayOfMonthArr,CRSDayOfMonthDep,CRSMonthArr,CRSMonthDep,CRSYearArr,CRSYearDep,DepDateTime,ArrDateTime,CRSArrDateTime,CRSDepDateTime
0,9.0,154.0,122.0,90.0,1850.0,1720,1455,145.0,,0,...,21,21,6,6,2006,2006,21-6-2006 1616,21-6-2006 1850,21-6-2006 1720,21-6-2006 1455
1,-1.0,293.0,272.0,-1.0,1543.0,1544,750,294.0,,0,...,8,8,5,5,2007,2007,8-5-2007 0750,8-5-2007 1543,8-5-2007 1544,8-5-2007 0750
2,1.0,121.0,101.0,-6.0,809.0,815,715,120.0,,0,...,16,16,3,3,2007,2007,16-3-2007 0708,16-3-2007 0809,16-3-2007 0815,16-3-2007 0715
3,6.0,162.0,142.0,5.0,1001.0,956,820,156.0,,0,...,22,22,10,10,2006,2006,22-10-2006 0819,22-10-2006 1001,22-10-2006 0956,22-10-2006 0820
4,-21.0,113.0,92.0,-25.0,1534.0,1559,1345,134.0,,0,...,15,15,8,8,2004,2004,15-8-2004 1341,15-8-2004 1534,15-8-2004 1559,15-8-2004 1345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2690083,7.0,127.0,111.0,14.0,1624.0,1610,1310,120.0,,0,...,12,12,12,12,2006,2006,12-12-2006 1317,12-12-2006 1624,12-12-2006 1610,12-12-2006 1310
2690084,12.0,228.0,205.0,20.0,1651.0,1631,955,216.0,,0,...,24,24,10,10,2007,2007,24-10-2007 1003,24-10-2007 1651,24-10-2007 1631,24-10-2007 0955
2690085,-19.0,183.0,164.0,-23.0,1604.0,1627,1405,202.0,,0,...,2,2,4,4,2004,2004,2-4-2004 1401,2-4-2004 1604,2-4-2004 1627,2-4-2004 1405
2690086,1.0,124.0,98.0,-6.0,1042.0,1048,745,123.0,,0,...,30,30,8,8,2006,2006,30-8-2006 0738,30-8-2006 1042,30-8-2006 1048,30-8-2006 0745


In [59]:
df_cleaned.drop(columns=[
    'ArrTime',
    'DepTime',
    'DayOfMonthDep',
    'DayOfMonthArr',
    'MonthDep',
    'MonthArr',
    'YearDep',
    'YearArr',
    'CRSDayOfMonthArr',
    'CRSDayOfMonthDep',
    'CRSMonthArr',
    'CRSMonthDep',
    'CRSYearArr',
    'CRSYearDep'
], inplace = True)





In [60]:
df_cleaned

Unnamed: 0,TotalDelayDuration,ActualElapsedTime,AirTime,ArrDelay,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,DepDateTime,ArrDateTime,CRSArrDateTime,CRSDepDateTime
0,9.0,154.0,122.0,90.0,1720,1455,145.0,,0,0.0,...,0.0,N293AA,8.0,24.0,AA,0.0,21-6-2006 1616,21-6-2006 1850,21-6-2006 1720,21-6-2006 1455
1,-1.0,293.0,272.0,-1.0,1544,750,294.0,,0,0.0,...,0.0,N788UA,8.0,13.0,UA,0.0,8-5-2007 0750,8-5-2007 1543,8-5-2007 1544,8-5-2007 0750
2,1.0,121.0,101.0,-6.0,815,715,120.0,,0,0.0,...,0.0,N725SW,14.0,6.0,WN,0.0,16-3-2007 0708,16-3-2007 0809,16-3-2007 0815,16-3-2007 0715
3,6.0,162.0,142.0,5.0,956,820,156.0,,0,0.0,...,0.0,N970SW,9.0,11.0,OO,0.0,22-10-2006 0819,22-10-2006 1001,22-10-2006 0956,22-10-2006 0820
4,-21.0,113.0,92.0,-25.0,1559,1345,134.0,,0,0.0,...,0.0,N17620,11.0,10.0,CO,0.0,15-8-2004 1341,15-8-2004 1534,15-8-2004 1559,15-8-2004 1345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2690083,7.0,127.0,111.0,14.0,1610,1310,120.0,,0,0.0,...,0.0,N611AE,3.0,13.0,MQ,0.0,12-12-2006 1317,12-12-2006 1624,12-12-2006 1610,12-12-2006 1310
2690084,12.0,228.0,205.0,20.0,1631,955,216.0,,0,0.0,...,0.0,N803AW,10.0,13.0,US,0.0,24-10-2007 1003,24-10-2007 1651,24-10-2007 1631,24-10-2007 0955
2690085,-19.0,183.0,164.0,-23.0,1627,1405,202.0,,0,0.0,...,0.0,N26215,9.0,10.0,CO,0.0,2-4-2004 1401,2-4-2004 1604,2-4-2004 1627,2-4-2004 1405
2690086,1.0,124.0,98.0,-6.0,1048,745,123.0,,0,0.0,...,0.0,N492UA,16.0,10.0,UA,0.0,30-8-2006 0738,30-8-2006 1042,30-8-2006 1048,30-8-2006 0745


#### Remove anomalous data where Arr time is before Dep time

In [61]:
df_format.loc[df_format['ArrDateTime'].str.startswith("ERROR")]

Unnamed: 0,DepTime,ArrTime,CRSDepTime,CRSArrTime,DayOfMonthDep,MonthDep,YearDep,DayOfMonthArr,MonthArr,YearArr,...,CRSMonthArr,CRSYearArr,CRSDayOfMonthDep,CRSMonthDep,CRSYearDep,DepDateTime,ArrTimeAdded,ArrDateTime,CRSDepDateTime,CRSArrDateTime
188326,2416,2410,2228,2239,9,2,2005,9,2,2005,...,2,2005,9,2,2005,10-2-2005 0016,4810,ERROR 2410,9-2-2005 2228,9-2-2005 2239
258295,2419,2400,2332,2318,1,7,2005,1,7,2005,...,7,2005,1,7,2005,2-7-2005 0019,4800,ERROR 2400,1-7-2005 2332,2-7-2005 2318
296918,2440,2432,2332,2318,29,8,2005,29,8,2005,...,8,2005,29,8,2005,30-8-2005 0040,4832,ERROR 2432,29-8-2005 2332,30-8-2005 2318
984967,2440,2432,2250,2257,20,4,2006,20,4,2006,...,4,2006,20,4,2006,21-4-2006 0040,4832,ERROR 2432,20-4-2006 2250,20-4-2006 2257
1090918,2515,2502,2336,2328,27,3,2005,27,3,2005,...,3,2005,27,3,2005,28-3-2005 0115,4902,ERROR 2502,27-3-2005 2336,28-3-2005 2328
1186157,2537,2526,2200,2157,6,7,2005,6,7,2005,...,7,2005,6,7,2005,7-7-2005 0137,4926,ERROR 2526,6-7-2005 2200,7-7-2005 2157
1264957,2455,2444,2202,2205,1,7,2004,1,7,2004,...,7,2004,1,7,2004,2-7-2004 0055,4844,ERROR 2444,1-7-2004 2202,1-7-2004 2205
1304016,2435,2421,2339,2329,20,7,2005,20,7,2005,...,7,2005,20,7,2005,21-7-2005 0035,4821,ERROR 2421,20-7-2005 2339,21-7-2005 2329
1470269,2453,2436,2300,2248,18,9,2006,18,9,2006,...,9,2006,18,9,2006,19-9-2006 0053,4836,ERROR 2436,18-9-2006 2300,19-9-2006 2248
1520822,2412,2405,2250,2300,6,2,2006,6,2,2006,...,2,2006,6,2,2006,7-2-2006 0012,4805,ERROR 2405,6-2-2006 2250,6-2-2006 2300


In [62]:
df_cleaned.loc[df_cleaned['ArrDateTime'].str.startswith('ERROR')]

Unnamed: 0,TotalDelayDuration,ActualElapsedTime,AirTime,ArrDelay,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,DepDateTime,ArrDateTime,CRSArrDateTime,CRSDepDateTime
188326,-17.0,54.0,-1397.0,91.0,2239,2228,71.0,,0,91.0,...,0.0,N912AS,1442.0,9.0,EV,0.0,10-2-2005 0016,ERROR 2410,9-2-2005 2239,9-2-2005 2228
258295,-5.0,41.0,30.0,42.0,2318,2332,46.0,,0,0.0,...,0.0,N861AS,3.0,8.0,EV,42.0,2-7-2005 0019,ERROR 2400,2-7-2005 2318,1-7-2005 2332
296918,6.0,52.0,-1402.0,74.0,2318,2332,46.0,,0,0.0,...,0.0,N833AS,1443.0,11.0,EV,68.0,30-8-2005 0040,ERROR 2432,30-8-2005 2318,29-8-2005 2332
984967,-15.0,52.0,-1399.0,95.0,2257,2250,67.0,,0,0.0,...,0.0,N857AS,1445.0,6.0,EV,95.0,21-4-2006 0040,ERROR 2432,20-4-2006 2257,20-4-2006 2250
1090918,-5.0,47.0,-1405.0,94.0,2328,2336,52.0,,0,0.0,...,0.0,N843AS,1442.0,10.0,EV,64.0,28-3-2005 0115,ERROR 2502,28-3-2005 2328,27-3-2005 2336
1186157,-8.0,49.0,-1407.0,209.0,2157,2200,57.0,,0,0.0,...,0.0,N906EV,1442.0,14.0,EV,209.0,7-7-2005 0137,ERROR 2526,7-7-2005 2157,6-7-2005 2200
1264957,-14.0,49.0,-1405.0,159.0,2205,2202,63.0,,0,0.0,...,0.0,N532AS,1444.0,10.0,EV,159.0,2-7-2004 0055,ERROR 2444,1-7-2004 2205,1-7-2004 2202
1304016,-4.0,46.0,-1411.0,52.0,2329,2339,50.0,,0,0.0,...,0.0,N929EV,1444.0,13.0,EV,52.0,21-7-2005 0035,ERROR 2421,21-7-2005 2329,20-7-2005 2339
1470269,-5.0,43.0,-1414.0,108.0,2248,2300,48.0,,0,0.0,...,0.0,N848AS,1445.0,12.0,EV,108.0,19-9-2006 0053,ERROR 2436,19-9-2006 2248,18-9-2006 2300
1520822,-17.0,53.0,-1396.0,65.0,2300,2250,70.0,,0,0.0,...,0.0,N820AS,1442.0,7.0,EV,65.0,7-2-2006 0012,ERROR 2405,6-2-2006 2300,6-2-2006 2250


In [64]:
df_cleaned

Unnamed: 0,TotalDelayDuration,ActualElapsedTime,AirTime,ArrDelay,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,DepDateTime,ArrDateTime,CRSArrDateTime,CRSDepDateTime
0,9.0,154.0,122.0,90.0,1720,1455,145.0,,0,0.0,...,0.0,N293AA,8.0,24.0,AA,0.0,21-6-2006 1616,21-6-2006 1850,21-6-2006 1720,21-6-2006 1455
1,-1.0,293.0,272.0,-1.0,1544,750,294.0,,0,0.0,...,0.0,N788UA,8.0,13.0,UA,0.0,8-5-2007 0750,8-5-2007 1543,8-5-2007 1544,8-5-2007 0750
2,1.0,121.0,101.0,-6.0,815,715,120.0,,0,0.0,...,0.0,N725SW,14.0,6.0,WN,0.0,16-3-2007 0708,16-3-2007 0809,16-3-2007 0815,16-3-2007 0715
3,6.0,162.0,142.0,5.0,956,820,156.0,,0,0.0,...,0.0,N970SW,9.0,11.0,OO,0.0,22-10-2006 0819,22-10-2006 1001,22-10-2006 0956,22-10-2006 0820
4,-21.0,113.0,92.0,-25.0,1559,1345,134.0,,0,0.0,...,0.0,N17620,11.0,10.0,CO,0.0,15-8-2004 1341,15-8-2004 1534,15-8-2004 1559,15-8-2004 1345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2690083,7.0,127.0,111.0,14.0,1610,1310,120.0,,0,0.0,...,0.0,N611AE,3.0,13.0,MQ,0.0,12-12-2006 1317,12-12-2006 1624,12-12-2006 1610,12-12-2006 1310
2690084,12.0,228.0,205.0,20.0,1631,955,216.0,,0,0.0,...,0.0,N803AW,10.0,13.0,US,0.0,24-10-2007 1003,24-10-2007 1651,24-10-2007 1631,24-10-2007 0955
2690085,-19.0,183.0,164.0,-23.0,1627,1405,202.0,,0,0.0,...,0.0,N26215,9.0,10.0,CO,0.0,2-4-2004 1401,2-4-2004 1604,2-4-2004 1627,2-4-2004 1405
2690086,1.0,124.0,98.0,-6.0,1048,745,123.0,,0,0.0,...,0.0,N492UA,16.0,10.0,UA,0.0,30-8-2006 0738,30-8-2006 1042,30-8-2006 1048,30-8-2006 0745


In [65]:
df_cleaned.drop(df_cleaned.loc[df_cleaned['ArrDateTime'].str.startswith('ERROR')].index, inplace=True)

#### Save to CSV

In [66]:
df_cleaned

Unnamed: 0,TotalDelayDuration,ActualElapsedTime,AirTime,ArrDelay,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,DepDateTime,ArrDateTime,CRSArrDateTime,CRSDepDateTime
0,9.0,154.0,122.0,90.0,1720,1455,145.0,,0,0.0,...,0.0,N293AA,8.0,24.0,AA,0.0,21-6-2006 1616,21-6-2006 1850,21-6-2006 1720,21-6-2006 1455
1,-1.0,293.0,272.0,-1.0,1544,750,294.0,,0,0.0,...,0.0,N788UA,8.0,13.0,UA,0.0,8-5-2007 0750,8-5-2007 1543,8-5-2007 1544,8-5-2007 0750
2,1.0,121.0,101.0,-6.0,815,715,120.0,,0,0.0,...,0.0,N725SW,14.0,6.0,WN,0.0,16-3-2007 0708,16-3-2007 0809,16-3-2007 0815,16-3-2007 0715
3,6.0,162.0,142.0,5.0,956,820,156.0,,0,0.0,...,0.0,N970SW,9.0,11.0,OO,0.0,22-10-2006 0819,22-10-2006 1001,22-10-2006 0956,22-10-2006 0820
4,-21.0,113.0,92.0,-25.0,1559,1345,134.0,,0,0.0,...,0.0,N17620,11.0,10.0,CO,0.0,15-8-2004 1341,15-8-2004 1534,15-8-2004 1559,15-8-2004 1345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2690083,7.0,127.0,111.0,14.0,1610,1310,120.0,,0,0.0,...,0.0,N611AE,3.0,13.0,MQ,0.0,12-12-2006 1317,12-12-2006 1624,12-12-2006 1610,12-12-2006 1310
2690084,12.0,228.0,205.0,20.0,1631,955,216.0,,0,0.0,...,0.0,N803AW,10.0,13.0,US,0.0,24-10-2007 1003,24-10-2007 1651,24-10-2007 1631,24-10-2007 0955
2690085,-19.0,183.0,164.0,-23.0,1627,1405,202.0,,0,0.0,...,0.0,N26215,9.0,10.0,CO,0.0,2-4-2004 1401,2-4-2004 1604,2-4-2004 1627,2-4-2004 1405
2690086,1.0,124.0,98.0,-6.0,1048,745,123.0,,0,0.0,...,0.0,N492UA,16.0,10.0,UA,0.0,30-8-2006 0738,30-8-2006 1042,30-8-2006 1048,30-8-2006 0745


In [67]:
df_cleaned.to_csv('df_cleaned_non_cancelled.csv')

### End of cleaning ^^^

In [71]:
(df.YearArr).unique()

array([2006, 2007, 2004, 2003, 2005, 2008], dtype=int64)