## Identify existing anomalies from Fixed Wing dataset

In [1]:
import pandas as pd
import os

In [2]:
# Work from the dataset folder so later cells can open files by bare name.
# The location can be overridden per machine through the FW_DATASET_DIR
# environment variable; when it is unset the original project path is used.
DATASET_DIR = os.environ.get('FW_DATASET_DIR', 'D:/GMU/1 Spring-2021/DAEN-690/Project/Dataset')
os.chdir(DATASET_DIR)

In [3]:
# Load the fixed wing flights export; the first line of the file supplies the
# column names (pandas' default header handling).
fw_csv_name = 'FW_with_airports(29Nov20-28Feb21 revised).csv'
fw_df = pd.read_csv(fw_csv_name)
# Preview the first five rows
fw_df.head()

Unnamed: 0,tail_number,date,aircraft,origin,origin_location,destination,destination_location,departure,arrival,duration,...,COUNTRY_NAME_origin,STATE_NAME_origin,origin_state,ICAO_code_destination,iata_code_destination,AIRPORT_NAME_destination,CITY_NAME_destination,COUNTRY_NAME_destination,STATE_NAME_destination,destination_state
0,N7025P,28-Feb-21,,"Bend, OR",L 43.95194 -121.28694,,,04:59PM PST,,,...,United States,Oregon,OR,,,,,,,
1,N7025P,28-Feb-21,,Tews Field (CA53),"Tews Field (Redding, CA) - CA53","Bend, OR",L 43.81944 -121.37583,01:36PM PST,03:01PM PST,1:24,...,United States,California,CA,,OR4,Bend Municipal,Bend - OR,United States,Oregon,OR
2,N7025P,21-Feb-21,,"Lincoln, CA",L 38.93333 -121.41667,Redding Municipal (KRDD),"Redding Municipal (Redding, CA) - KRDD",03:02PM PST,03:49PM PST,0:47,...,United States,California,CA,KRDD,RDD,Redding Municipal,Redding - CA,United States,California,CA
3,N7025P,21-Feb-21,,"Red Bluff, CA",L 40.24000 -122.19083,"Marysville, CA",L 38.94750 -121.52250,12:46PM PST,01:20PM PST,0:33,...,United States,California,CA,KSMF,SMF,Sacramento International,Sacramento - CA,United States,California,CA
4,N7025P,24-Dec-20,,"Red Bluff, CA",L 40.28861 -122.05861,"Emigrant Gap, CA",L 39.77972 -120.52333,09:25AM PST,10:02AM PST,0:36,...,United States,California,CA,,,,,,,CA


In [4]:
# Summarise the frame: row count, column names, non-null counts and dtypes
fw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11241 entries, 0 to 11240
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tail_number               11241 non-null  object 
 1   date                      11241 non-null  object 
 2   aircraft                  11215 non-null  object 
 3   origin                    11241 non-null  object 
 4   origin_location           11241 non-null  object 
 5   destination               11239 non-null  object 
 6   destination_location      11239 non-null  object 
 7   departure                 11241 non-null  object 
 8   arrival                   11012 non-null  object 
 9   duration                  11009 non-null  object 
 10  medical_service           11241 non-null  object 
 11  origin_Latitude           10344 non-null  float64
 12  origin_Longitude          10344 non-null  float64
 13  destination_Latitude      10346 non-null  float64
 14  destin

In [5]:
# Count rows of the fixed wing dataset that exactly repeat an earlier row
duplicate_mask = fw_df.duplicated()
duplicate_mask.sum()

0

In [6]:
# Tally the missing values in each column of the fixed wing dataset
fw_df.isnull().sum()

tail_number                   0
date                          0
aircraft                     26
origin                        0
origin_location               0
destination                   2
destination_location          2
departure                     0
arrival                     229
duration                    232
medical_service               0
origin_Latitude             897
origin_Longitude            897
destination_Latitude        895
destination_Longitude       895
ICAO_code_origin             24
iata_code_origin             80
AIRPORT_NAME_origin          97
CITY_NAME_origin             97
COUNTRY_NAME_origin          97
STATE_NAME_origin           145
origin_state                 28
ICAO_code_destination        29
iata_code_destination        79
AIRPORT_NAME_destination     96
CITY_NAME_destination        96
COUNTRY_NAME_destination     96
STATE_NAME_destination      145
destination_state            29
dtype: int64

In [7]:
# Narrow the data to the aircraft identifier plus the origin and destination
# ICAO codes; the result is stored as new_df.
path_columns = ['tail_number', 'ICAO_code_origin', 'ICAO_code_destination']
new_df = fw_df[path_columns]

In [8]:
# Tally the missing values in each column of the reduced frame
new_df.isnull().sum()

tail_number               0
ICAO_code_origin         24
ICAO_code_destination    29
dtype: int64

In [9]:
# Discard every row that has a missing value in any column
new_df = new_df.dropna(axis=0, how='any')

In [10]:
# Build a route key for every flight: the origin code immediately followed by
# the destination code, stored in a new column Path.
# NOTE(review): the codes are joined without a separator and the resulting keys
# are not all the same length (e.g. 'PHNLOGG / PHOG' appears in the counts), so
# distinct routes could in principle produce the same key - confirm this is
# acceptable.
origin_codes = new_df['ICAO_code_origin']
destination_codes = new_df['ICAO_code_destination']
new_df['Path'] = origin_codes + destination_codes

In [11]:
# Display the frame with its new Path column (the rendered table elides the
# middle rows)
new_df

Unnamed: 0,tail_number,ICAO_code_origin,ICAO_code_destination,Path
2,N7025P,KSMF,KRDD,KSMFKRDD
3,N7025P,KRBL,KSMF,KRBLKSMF
5,N269GJ,KARR,KFXE,KARRKFXE
6,N888CP,KPIA,KARR,KPIAKARR
7,N888CP,KMDW,KPIA,KMDWKPIA
...,...,...,...,...
11236,N977TC,KELP,KHOU,KELPKHOU
11237,N977TC,KHOU,KELP,KHOUKELP
11238,N102WK,KMQY,KSHV,KMQYKSHV
11239,N102WK,KSHV,KMQY,KSHVKMQY


In [12]:
# Get total count of each Path, rarest routes first.
# rename_axis / reset_index(name=...) pin the output columns to 'index' and
# 'PathFreq' explicitly. Relying on the default labels of value_counts() breaks
# on pandas >= 2.0, where the counts are named 'count' and the index inherits
# the name 'Path', which leaves no 'index' column for the later merge.
res = (
    new_df['Path']
    .value_counts()
    .sort_values(ascending=True)
    .rename_axis('index')
    .reset_index(name='PathFreq')
)
res

Unnamed: 0,index,PathFreq
0,KMYFKBIH,1
1,KUNUKVNC,1
2,KHIOKACV,1
3,KFATKBIH,1
4,KSLNKDDC,1
...,...,...
3604,PHNLOGG / PHOG,72
3605,KAPAKGJT,84
3606,KABQKFMN,93
3607,KFMNKABQ,97


In [16]:
# Confirm the frequency table contains no repeated rows
repeated_rows = res.duplicated()
repeated_rows.sum()

0

In [13]:
# Attach each route's frequency to every flight row: left join of new_df
# (key: Path) against res (key: index), so every flight row is kept.
new_df2 = pd.merge(
    new_df,
    res,
    how='left',
    left_on='Path',
    right_on='index',
)
new_df2

Unnamed: 0,tail_number,ICAO_code_origin,ICAO_code_destination,Path,index,PathFreq
0,N7025P,KSMF,KRDD,KSMFKRDD,KSMFKRDD,1
1,N7025P,KRBL,KSMF,KRBLKSMF,KRBLKSMF,1
2,N269GJ,KARR,KFXE,KARRKFXE,KARRKFXE,1
3,N888CP,KPIA,KARR,KPIAKARR,KPIAKARR,1
4,N888CP,KMDW,KPIA,KMDWKPIA,KMDWKPIA,1
...,...,...,...,...,...,...
11188,N977TC,KELP,KHOU,KELPKHOU,KELPKHOU,9
11189,N977TC,KHOU,KELP,KHOUKELP,KHOUKELP,8
11190,N102WK,KMQY,KSHV,KMQYKSHV,KMQYKSHV,1
11191,N102WK,KSHV,KMQY,KSHVKMQY,KSHVKMQY,1


In [15]:
# Store dataframe in a csv file.
# The working directory was already switched to the dataset folder near the top
# of the notebook, so a bare file name lands next to the input CSV without
# repeating the machine-specific absolute path. The row index is still written
# as the first column, as before.
new_df2.to_csv('FW_anomaly.csv')