# Combine sources

## Setup

In [1]:
import pathlib 

import pandas as pd
import numpy as np

## Reading raw data

### 1st source (https://www.kaggle.com/datasets/robikscube/flight-delay-dataset-20182022)

In [2]:
source_1_dir: pathlib.Path = pathlib.Path('../datasets/raw/source_1/')

parquet_files: list[pathlib.Path] = list(source_1_dir.glob("Combined_Flights_[0-9]*.parquet"))

source_1_df: pd.DataFrame = pd.concat((pd.read_parquet(parquet_file) for parquet_file in parquet_files), ignore_index=True)

#### Cleaning

City name is in the format "city,state", we remove the state part because it is given by another feature.

In [3]:
source_1_df['DestCityName'] = source_1_df['DestCityName'].apply(lambda x: x.split(',')[0])
source_1_df['OriginCityName'] = source_1_df['OriginCityName'].apply(lambda x: x.split(',')[0])

### 2nd source (https://community.amstat.org/jointscsg-section/dataexpo/dataexpo2009)


In [4]:
source_2_dir: pathlib.Path = pathlib.Path('../datasets/raw/source_2/')

csv_files: list[pathlib.Path] = list(source_2_dir.glob("[0-9]*.csv"))

source_2_df: pd.DataFrame = pd.concat((pd.read_csv(csv_file, encoding='latin-1') for csv_file in csv_files), ignore_index=True)

  source_2_df: pd.DataFrame = pd.concat((pd.read_csv(csv_file, encoding='latin-1') for csv_file in csv_files), ignore_index=True)


#### Joining with airport information

In [5]:
source_2_airports_df: pd.DataFrame = pd.read_csv(source_2_dir / 'airports.csv', encoding='latin-1')
source_2_df = source_2_df.merge(source_2_airports_df[['iata', 'airport', 'city', 'state']], left_on='Origin', right_on='iata', how='left', suffixes=('', '_origin'))
source_2_df = source_2_df.rename({
    'iata': 'OriginIata', 
    'airport': 'OriginAirport', 
    'city': 'OriginCityName', 
    'state': 'OriginState'}, axis=1)
source_2_df = source_2_df.merge(source_2_airports_df[['iata', 'airport', 'city', 'state']], left_on='Dest', right_on='iata', how='left', suffixes=('', '_dest'))
source_2_df = source_2_df.rename({
    'iata': 'DestIata',
    'airport': 'DestAirport',
    'city': 'DestCityName',
    'state': 'DestState'}, axis=1)

### Joining with airline information

In [6]:
source_2_carriers_df: pd.DataFrame = pd.read_csv(source_2_dir / 'carriers.csv', encoding='latin-1')
source_2_carriers_df.set_index('Code', inplace=True)

source_2_df['Airline'] = source_2_df['UniqueCarrier'].apply(lambda x: source_2_carriers_df.loc[x, 'Description'])

#### Renaming features to match the first source

In [7]:
source_2_df.rename({'TailNum': 'Tail_Number'}, axis=1, inplace=True)

## Features unique to sources
The following features are unique to the sources, and therefore will not be in the final dataset.

In [8]:
non_overlapping_features: set[str] = set(source_1_df.columns) ^ set(source_2_df.columns)
print("\n".join(non_overlapping_features))

DestStateFips
SecurityDelay
DepartureDelayGroups
ArrTimeBlk
OriginCityMarketID
UniqueCarrier
LateAircraftDelay
DepDelayMinutes
OriginAirportID
DepDel15
Marketing_Airline_Network
WheelsOn
DestIata
OriginStateName
OriginAirport
DistanceGroup
NASDelay
FlightNum
CarrierDelay
OriginWac
FlightDate
WeatherDelay
DestCityMarketID
DestWac
DestStateName
ArrDelayMinutes
Flight_Number_Operating_Airline
OriginStateFips
DepTimeBlk
ArrivalDelayGroups
IATA_Code_Operating_Airline
DOT_ID_Marketing_Airline
Operating_Airline
Flight_Number_Marketing_Airline
IATA_Code_Marketing_Airline
DestAirportSeqID
DestAirportID
DOT_ID_Operating_Airline
ArrDel15
CancellationCode
OriginIata
Quarter
DivAirportLandings
Operated_or_Branded_Code_Share_Partners
OriginAirportSeqID
WheelsOff
DestAirport


## Features present in both sources

In [9]:
overlapping_features: set[str] = set(source_1_df.columns) & set(source_2_df.columns)
print("\n".join(overlapping_features))

DayofMonth
ArrTime
ArrDelay
OriginCityName
Dest
Month
DepTime
CRSDepTime
TaxiOut
AirTime
Origin
Year
CRSArrTime
TaxiIn
ActualElapsedTime
Airline
OriginState
Tail_Number
DestCityName
DestState
Diverted
DepDelay
CRSElapsedTime
Distance
Cancelled
DayOfWeek


## Combining sources

In [10]:
data_df: pd.DataFrame = pd.concat([source_1_df[list(overlapping_features)], source_2_df[list(overlapping_features)]], ignore_index=True)

data_df

Unnamed: 0,DayofMonth,ArrTime,ArrDelay,OriginCityName,Dest,Month,DepTime,CRSDepTime,TaxiOut,AirTime,...,OriginState,Tail_Number,DestCityName,DestState,Diverted,DepDelay,CRSElapsedTime,Distance,Cancelled,DayOfWeek
0,23,1256.0,-8.0,Albany,ATL,1,1157.0,1202,14.0,38.0,...,GA,N8928A,Atlanta,GA,False,-5.0,62.0,145.0,False,2
1,24,1258.0,-6.0,Albany,ATL,1,1157.0,1202,13.0,36.0,...,GA,N800AY,Atlanta,GA,False,-5.0,62.0,145.0,False,3
2,25,1302.0,-2.0,Albany,ATL,1,1153.0,1202,18.0,40.0,...,GA,N8836A,Atlanta,GA,False,-9.0,62.0,145.0,False,4
3,26,1253.0,-11.0,Albany,ATL,1,1150.0,1202,17.0,35.0,...,GA,N800AY,Atlanta,GA,False,-12.0,62.0,145.0,False,5
4,27,1459.0,-1.0,Albany,ATL,1,1355.0,1400,17.0,36.0,...,GA,N8839E,Atlanta,GA,False,-5.0,60.0,145.0,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148108235,12,2035.0,63.0,Covington,PHL,12,1858.0,1758,,,...,KY,,Philadelphia,PA,0,60.0,94.0,507.0,0,2
148108236,13,1945.0,13.0,Covington,PHL,12,1807.0,1758,,,...,KY,,Philadelphia,PA,0,9.0,94.0,507.0,0,3
148108237,14,1923.0,-9.0,Covington,PHL,12,1758.0,1758,,,...,KY,,Philadelphia,PA,0,0.0,94.0,507.0,0,4
148108238,15,1937.0,7.0,Covington,PHL,12,1810.0,1800,,,...,KY,,Philadelphia,PA,0,10.0,90.0,507.0,0,5


## Convert column types

In [11]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148108240 entries, 0 to 148108239
Data columns (total 26 columns):
 #   Column             Dtype  
---  ------             -----  
 0   DayofMonth         int64  
 1   ArrTime            float64
 2   ArrDelay           float64
 3   OriginCityName     object 
 4   Dest               object 
 5   Month              int64  
 6   DepTime            float64
 7   CRSDepTime         int64  
 8   TaxiOut            float64
 9   AirTime            float64
 10  Origin             object 
 11  Year               int64  
 12  CRSArrTime         int64  
 13  TaxiIn             float64
 14  ActualElapsedTime  float64
 15  Airline            object 
 16  OriginState        object 
 17  Tail_Number        object 
 18  DestCityName       object 
 19  DestState          object 
 20  Diverted           object 
 21  DepDelay           float64
 22  CRSElapsedTime     float64
 23  Distance           float64
 24  Cancelled          object 
 25  DayOfWeek     

### Boolean columns

We suspect that the columns Diverted and Cancelled only contain boolean values.
We confirm it by using `value_counts()` and convert them to boolean type.

In [12]:
data_df['Diverted'].value_counts()

False    147767298
True        340942
Name: Diverted, dtype: int64

In [13]:
data_df['Cancelled'].value_counts()

False    145100641
True       3007599
Name: Cancelled, dtype: int64

In [14]:
boolean_columns: list[str] = ['Diverted', 'Cancelled']

for column in boolean_columns:
    data_df[column] = data_df[column].astype("bool")

In [15]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148108240 entries, 0 to 148108239
Data columns (total 26 columns):
 #   Column             Dtype  
---  ------             -----  
 0   DayofMonth         int64  
 1   ArrTime            float64
 2   ArrDelay           float64
 3   OriginCityName     object 
 4   Dest               object 
 5   Month              int64  
 6   DepTime            float64
 7   CRSDepTime         int64  
 8   TaxiOut            float64
 9   AirTime            float64
 10  Origin             object 
 11  Year               int64  
 12  CRSArrTime         int64  
 13  TaxiIn             float64
 14  ActualElapsedTime  float64
 15  Airline            object 
 16  OriginState        object 
 17  Tail_Number        object 
 18  DestCityName       object 
 19  DestState          object 
 20  Diverted           bool   
 21  DepDelay           float64
 22  CRSElapsedTime     float64
 23  Distance           float64
 24  Cancelled          bool   
 25  DayOfWeek     

### Categorical columns

In [16]:
categorical_columns: list[str] = ['DayOfWeek', 'OriginCityName', 'OriginState', 
                                  'DestState', 'Tail_Number', 'Dest', 'DayofMonth', 
                                  'Airline', 'DestCityName', 'Origin', 'Year', 'Month']
                                
for column in categorical_columns:
    data_df[column] = data_df[column].astype("category")

In [17]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148108240 entries, 0 to 148108239
Data columns (total 26 columns):
 #   Column             Dtype   
---  ------             -----   
 0   DayofMonth         category
 1   ArrTime            float64 
 2   ArrDelay           float64 
 3   OriginCityName     category
 4   Dest               category
 5   Month              category
 6   DepTime            float64 
 7   CRSDepTime         int64   
 8   TaxiOut            float64 
 9   AirTime            float64 
 10  Origin             category
 11  Year               category
 12  CRSArrTime         int64   
 13  TaxiIn             float64 
 14  ActualElapsedTime  float64 
 15  Airline            category
 16  OriginState        category
 17  Tail_Number        category
 18  DestCityName       category
 19  DestState          category
 20  Diverted           bool    
 21  DepDelay           float64 
 22  CRSElapsedTime     float64 
 23  Distance           float64 
 24  Cancelled          b

In [None]:
data_df.info()

In [20]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148108240 entries, 0 to 148108239
Data columns (total 26 columns):
 #   Column             Dtype   
---  ------             -----   
 0   DayofMonth         category
 1   ArrTime            float64 
 2   ArrDelay           float64 
 3   OriginCityName     category
 4   Dest               category
 5   Month              category
 6   DepTime            float64 
 7   CRSDepTime         uint16  
 8   TaxiOut            float64 
 9   AirTime            float64 
 10  Origin             category
 11  Year               category
 12  CRSArrTime         uint16  
 13  TaxiIn             float64 
 14  ActualElapsedTime  float64 
 15  Airline            category
 16  OriginState        category
 17  Tail_Number        category
 18  DestCityName       category
 19  DestState          category
 20  Diverted           bool    
 21  DepDelay           float64 
 22  CRSElapsedTime     float64 
 23  Distance           float64 
 24  Cancelled          b

## Save the combined dataset

In [21]:
data_df.to_parquet('../datasets/raw/combined.parquet')