## Identify existing anomalies from Rotary Wing dataset

In [1]:
import pandas as pd
import os

In [2]:
os.chdir('D:/GMU/1 Spring-2021/DAEN-690/Project/Dataset')

In [3]:
# Read rotary wing .csv file
rw_df = pd.read_csv('RW_with_airports(29Nov2020-2Mar2021 revised).csv', header='infer')
rw_df.head(5)

Unnamed: 0,tail_number,date,aircraft,origin,origin_location,destination,destination_location,departure,arrival,duration,...,STATE_NAME_origin,origin_state,ICAO_code_destination,iata_code_destination,AIRPORT_CODE_destination,AIRPORT_NAME_destination,CITY_NAME_destination,COUNTRY_NAME_destination,STATE_NAME_destination,destination_state
0,N101AE,20-Feb-21,B06,Indy South Greenwood (KHFY),"Indy South Greenwood (Indianapolis, IN) - KHFY",Sanders Gyroport (IN88),"Sanders Gyroport (Cloverdale, IN) - IN88",03:56PM EST,04:15PM EST (?),0:18,...,Indiana,IN,IN88,IN7,IN7,Greenwood Municipal,Indianapolis - IN,United States,Indiana,IN
1,N101AE,20-Feb-21,B06,Owens Field (II29),"Owens Field (Greencastle, IN) - II29",Indy South Greenwood (KHFY),"Indy South Greenwood (Indianapolis, IN) - KHFY",03:01PM EST,03:21PM EST,0:20,...,Indiana,IN,KHFY,IN7,IN7,Indy South Greenwood,Indianapolis - IN,United States,Indiana,IN
2,N101AE,19-Feb-21,B06,Eagle Creek Airpark (KEYE),"Eagle Creek Airpark (Indianapolis, IN) - KEYE","Greencastle, IN",L 39.61981 -86.70046,02:38PM EST,02:50PM EST,0:11,...,Indiana,IN,,IN7,IN7,Greenwood Municipal,Indianapolis - IN,United States,Indiana,IN
3,N101AE,19-Feb-21,B06,"Terre Haute, IN",L 39.24110 -87.12544,Eagle Creek Airpark (KEYE),"Eagle Creek Airpark (Indianapolis, IN) - KEYE",12:30PM EST,02:11PM EST,1:40,...,Indiana,IN,KEYE,IN4,IN4,Eagle Creek Airpark,Indianapolis - IN,United States,Indiana,IN
4,N101AE,16-Feb-21,B06,"Terre Haute, IN",L 39.26091 -87.10094,"Indianapolis, IN",L 39.71667 -86.38333,01:51PM EST,03:48PM EST,1:56,...,Indiana,IN,KIND,IND,IND,Indianapolis International,Indianapolis - IN,United States,Indiana,IN


In [4]:
rw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116292 entries, 0 to 116291
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   tail_number               116292 non-null  object 
 1   date                      116292 non-null  object 
 2   aircraft                  110178 non-null  object 
 3   origin                    116291 non-null  object 
 4   origin_location           116291 non-null  object 
 5   destination               116209 non-null  object 
 6   destination_location      116209 non-null  object 
 7   departure                 116292 non-null  object 
 8   arrival                   116096 non-null  object 
 9   duration                  116060 non-null  object 
 10  NAME                      116292 non-null  object 
 11  origin_Latitude           107183 non-null  float64
 12  origin_Longitude          107183 non-null  float64
 13  destination_Latitude      107173 non-null  f

In [5]:
# Check count of duplicate values for rotary wing dataset
rw_df.isna().sum()

tail_number                    0
date                           0
aircraft                    6114
origin                         1
origin_location                1
destination                   83
destination_location          83
departure                      0
arrival                      196
duration                     232
NAME                           0
origin_Latitude             9109
origin_Longitude            9109
destination_Latitude        9119
destination_Longitude       9119
ICAO_code_origin            8503
iata_code_origin            4496
AIRPORT_CODE_origin         5175
AIRPORT_NAME_origin         5175
CITY_NAME_origin            5175
COUNTRY_NAME_origin         5175
STATE_NAME_origin           5175
origin_state                 756
ICAO_code_destination       7581
iata_code_destination       4504
AIRPORT_CODE_destination    5352
AIRPORT_NAME_destination    5352
CITY_NAME_destination       5352
COUNTRY_NAME_destination    5352
STATE_NAME_destination      5352
destinatio

In [6]:
# Create a new dataframe with attributes tail_number, ICAO_code_origin, ICAO_code_destination
new_df = rw_df[['tail_number','ICAO_code_origin', 'ICAO_code_destination']]

In [7]:
# Check count of null values
new_df.isna().sum()

tail_number                 0
ICAO_code_origin         8503
ICAO_code_destination    7581
dtype: int64

In [8]:
# Drop null values
new_df = new_df.dropna()

In [9]:
# Concat values and store in a new column Path
new_df['Path'] = new_df['ICAO_code_origin'] + new_df['ICAO_code_destination']

In [10]:
new_df

Unnamed: 0,tail_number,ICAO_code_origin,ICAO_code_destination,Path
0,N101AE,KHFY,IN88,KHFYIN88
1,N101AE,II29,KHFY,II29KHFY
3,N101AE,KHUF,KEYE,KHUFKEYE
4,N101AE,KHUF,KIND,KHUFKIND
6,N101AE,05IN,KIND,05INKIND
...,...,...,...,...
116285,N135MH,KFME,KDCA,KFMEKDCA
116286,N135MH,2W5,KFME,2W5KFME
116287,N135MH,2W5,2W5,2W52W5
116288,N135MH,2W5,2W5,2W52W5


In [11]:
# Get total count of each Path 
res = pd.DataFrame(new_df.Path.value_counts().sort_values(ascending=True)).rename(columns={"Path":"PathFreq"}).reset_index()
res

Unnamed: 0,index,PathFreq
0,29DKBFD,1
1,KAKRKCLE,1
2,KGMUKSPA,1
3,N45KDXR,1
4,KDUGKCGZ,1
...,...,...
16285,KGLSKGLS,474
16286,KABQKABQ,519
16287,KBADKHUM,573
16288,KHUMKBAD,582


In [12]:
# Check if duplicates
res.duplicated().sum()

0

In [13]:
# Join 'res' dataframe with 'new_df' dataframe based on Path and index
new_df2 = new_df.merge(res, how='left', left_on='Path', right_on='index')
new_df2

Unnamed: 0,tail_number,ICAO_code_origin,ICAO_code_destination,Path,index,PathFreq
0,N101AE,KHFY,IN88,KHFYIN88,KHFYIN88,1
1,N101AE,II29,KHFY,II29KHFY,II29KHFY,1
2,N101AE,KHUF,KEYE,KHUFKEYE,KHUFKEYE,1
3,N101AE,KHUF,KIND,KHUFKIND,KHUFKIND,3
4,N101AE,05IN,KIND,05INKIND,05INKIND,1
...,...,...,...,...,...,...
102259,N135MH,KFME,KDCA,KFMEKDCA,KFMEKDCA,22
102260,N135MH,2W5,KFME,2W5KFME,2W5KFME,4
102261,N135MH,2W5,2W5,2W52W5,2W52W5,6
102262,N135MH,2W5,2W5,2W52W5,2W52W5,6


In [14]:
# Store dataframe in a csv file
new_df2.to_csv('D:/GMU/1 Spring-2021/DAEN-690/Project/Dataset/RW_anomaly.csv')