In [35]:
import pandas as pd

In [36]:
#install: pip install pivottablejs

from pivottablejs import pivot_ui

In [37]:
# Load the wide net-income table: one row per airline, one column per year.
df = pd.read_csv('net_income.csv')

# Keep 'Airline' as the leading column and discard the 'Unnamed: N' columns
# that read_csv generates for blank header cells.
valid_columns = ['Airline'] + [
    col for col in df.columns if col != 'Airline' and 'Unnamed' not in col
]
df = df[valid_columns]

# Relabel: the airline column becomes 'CARRIER'; year headers stay as strings.
df.columns = ['CARRIER'] + [str(year) for year in valid_columns[1:]]

# Reshape wide -> long (one row per carrier-year) in a single chain that never
# assigns into a sliced frame, so pandas has no reason to emit
# SettingWithCopyWarning. The row labels produced by melt are kept as-is.
income_df = (
    df.melt(id_vars=['CARRIER'], var_name='YEAR', value_name='NET_INCOME')
    .assign(
        # Values that cannot be parsed as numbers become NaN (errors='coerce').
        NET_INCOME=lambda long: pd.to_numeric(long['NET_INCOME'], errors='coerce'),
        # Year headers were strings; compare them as integers below.
        YEAR=lambda long: long['YEAR'].astype(int),
    )
    # Keep 2009-2019 inclusive and put YEAR first.
    .loc[lambda long: long['YEAR'].between(2009, 2019), ['YEAR', 'CARRIER', 'NET_INCOME']]
)

# Display the resulting DataFrame
display(income_df)


Unnamed: 0,YEAR,CARRIER,NET_INCOME
90,2009,American,1.470000e+09
91,2009,Delta,1.240000e+09
92,2009,United,-6.510000e+08
93,2009,Southwest,9.900000e+07
94,2009,JetBlue,5.800000e+07
...,...,...,...
195,2019,Alaska,7.690000e+08
196,2019,Hawaiian,2.240000e+08
197,2019,Frontier,2.510000e+08
198,2019,Spirit,3.350000e+08


In [38]:
# Replace missing NET_INCOME values with 0. The result is reassigned instead of
# filled in place: income_df is a filtered subset of the melted table, and
# in-place edits on such a subset may trigger SettingWithCopyWarning.
# NOTE(review): a filled 0 is indistinguishable from a real zero in later
# statistics — confirm that 0 is the intended stand-in for "missing".
income_df = income_df.fillna({'NET_INCOME': 0})

In [39]:
# Display income_df as it stands after the fillna cell above.
income_df

Unnamed: 0,YEAR,CARRIER,NET_INCOME
90,2009,American,1.470000e+09
91,2009,Delta,1.240000e+09
92,2009,United,-6.510000e+08
93,2009,Southwest,9.900000e+07
94,2009,JetBlue,5.800000e+07
...,...,...,...
195,2019,Alaska,7.690000e+08
196,2019,Hawaiian,2.240000e+08
197,2019,Frontier,2.510000e+08
198,2019,Spirit,3.350000e+08


In [40]:
import pandas as pd

# Load the wide passenger table: one row per airline, one column per year.
df = pd.read_csv('passengers.csv')

# Keep 'Airlines' (note the plural in this file) as the leading column and
# discard the 'Unnamed: N' columns that read_csv generates for blank headers.
valid_columns = ['Airlines'] + [
    col for col in df.columns if col != 'Airlines' and 'Unnamed' not in col
]
df = df[valid_columns]

# Relabel: the airline column becomes 'CARRIER'; year headers stay as strings.
df.columns = ['CARRIER'] + [str(year) for year in valid_columns[1:]]

# Reshape wide -> long (one row per carrier-year) in a single chain that never
# assigns into a sliced frame, so pandas has no reason to emit
# SettingWithCopyWarning. The row labels produced by melt are kept as-is.
peeps_df = (
    df.melt(id_vars=['CARRIER'], var_name='YEAR', value_name='TOTAL_PASSENGERS')
    # Year headers were strings; compare them as integers below.
    .assign(YEAR=lambda long: long['YEAR'].astype(int))
    # Keep 2009-2019 inclusive and put YEAR first.
    .loc[lambda long: long['YEAR'].between(2009, 2019), ['YEAR', 'CARRIER', 'TOTAL_PASSENGERS']]
)

# Display the resulting DataFrame
display(peeps_df)


Unnamed: 0,YEAR,CARRIER,TOTAL_PASSENGERS
140,2009,American,85783000
141,2009,Delta,67852000
142,2009,United,56083000
143,2009,Southwest,101430000
144,2009,JetBlue,22392000
...,...,...,...
245,2019,Alaska,106590000
246,2019,Hawaiian,34594000
247,2019,Allegiant,45366000
248,2019,Frontier,68061000


In [41]:
# Replace missing TOTAL_PASSENGERS values with 0. The result is reassigned
# instead of filled in place: peeps_df is a filtered subset of the melted
# table, and in-place edits on such a subset may trigger SettingWithCopyWarning.
# NOTE(review): a filled 0 is indistinguishable from a real zero in later
# statistics — confirm that 0 is the intended stand-in for "missing".
peeps_df = peeps_df.fillna({'TOTAL_PASSENGERS': 0})

In [42]:
# Display peeps_df as it stands after the fillna cell above.
peeps_df

Unnamed: 0,YEAR,CARRIER,TOTAL_PASSENGERS
140,2009,American,85783000
141,2009,Delta,67852000
142,2009,United,56083000
143,2009,Southwest,101430000
144,2009,JetBlue,22392000
...,...,...,...
245,2019,Alaska,106590000
246,2019,Hawaiian,34594000
247,2019,Allegiant,45366000
248,2019,Frontier,68061000


In [43]:
import pandas as pd

# Load the wide RPM table: one row per airline, one column per year.
df = pd.read_csv('rpms.csv')

# Keep 'Airline' as the leading column and discard the 'Unnamed: N' columns
# that read_csv generates for blank header cells.
valid_columns = ['Airline'] + [
    col for col in df.columns if col != 'Airline' and 'Unnamed' not in col
]
df = df[valid_columns]

# Relabel: the airline column becomes 'CARRIER'; year headers stay as strings.
df.columns = ['CARRIER'] + [str(year) for year in valid_columns[1:]]

# Reshape wide -> long (one row per carrier-year) in a single chain that never
# assigns into a sliced frame, so pandas has no reason to emit
# SettingWithCopyWarning. The row labels produced by melt are kept as-is.
rpms_df = (
    df.melt(id_vars=['CARRIER'], var_name='YEAR', value_name='TOTAL_RPMS')
    .assign(
        # Values that cannot be parsed as numbers become NaN (errors='coerce').
        TOTAL_RPMS=lambda long: pd.to_numeric(long['TOTAL_RPMS'], errors='coerce'),
        # Year headers were strings; compare them as integers below.
        YEAR=lambda long: long['YEAR'].astype(int),
    )
    # Keep 2009-2019 inclusive and put YEAR first.
    .loc[lambda long: long['YEAR'].between(2009, 2019), ['YEAR', 'CARRIER', 'TOTAL_RPMS']]
)

# Display the resulting DataFrame
display(rpms_df)


Unnamed: 0,YEAR,CARRIER,TOTAL_RPMS
140,2009,American,1.224230e+11
141,2009,Delta,1.007360e+11
142,2009,United,1.004750e+11
143,2009,Southwest,7.457600e+10
144,2009,JetBlue,2.596200e+10
...,...,...,...
245,2019,Alaska,5.041600e+10
246,2019,Hawaiian,1.781000e+10
247,2019,Allegiant,1.334000e+10
248,2019,Spirit,3.524500e+10


In [44]:
# Replace missing TOTAL_RPMS values with 0. The result is reassigned instead of
# filled in place: rpms_df is a filtered subset of the melted table, and
# in-place edits on such a subset may trigger SettingWithCopyWarning.
# NOTE(review): a filled 0 is indistinguishable from a real zero in later
# statistics — confirm that 0 is the intended stand-in for "missing".
rpms_df = rpms_df.fillna({'TOTAL_RPMS': 0})

In [45]:
# Display rpms_df as it stands after the fillna cell above.
rpms_df

Unnamed: 0,YEAR,CARRIER,TOTAL_RPMS
140,2009,American,1.224230e+11
141,2009,Delta,1.007360e+11
142,2009,United,1.004750e+11
143,2009,Southwest,7.457600e+10
144,2009,JetBlue,2.596200e+10
...,...,...,...
245,2019,Alaska,5.041600e+10
246,2019,Hawaiian,1.781000e+10
247,2019,Allegiant,1.334000e+10
248,2019,Spirit,3.524500e+10


In [46]:
# Combine the three long tables into one row per (YEAR, CARRIER).
# income_df, peeps_df and rpms_df are each built with YEAR and CARRIER as their
# first two columns, so no column re-selection is needed before merging.
merge_keys = ['YEAR', 'CARRIER']

# The default inner join keeps only carrier-years present in every table.
# validate='one_to_one' makes pandas raise MergeError if either side repeats a
# (YEAR, CARRIER) pair, instead of silently multiplying rows.
combined_df = (
    income_df
    .merge(peeps_df, on=merge_keys, validate='one_to_one')
    .merge(rpms_df, on=merge_keys, validate='one_to_one')
)

# Display the dataframe
display(combined_df)



Unnamed: 0,YEAR,CARRIER,NET_INCOME,TOTAL_PASSENGERS,TOTAL_RPMS
0,2009,American,1.470000e+09,85783000,1.224230e+11
1,2009,Delta,1.240000e+09,67852000,1.007360e+11
2,2009,United,-6.510000e+08,56083000,1.004750e+11
3,2009,Southwest,9.900000e+07,101430000,7.457600e+10
4,2009,JetBlue,5.800000e+07,22392000,2.596200e+10
...,...,...,...,...,...
105,2019,Alaska,7.690000e+08,106590000,5.041600e+10
106,2019,Hawaiian,2.240000e+08,34594000,1.781000e+10
107,2019,Frontier,2.510000e+08,68061000,2.419200e+10
108,2019,Spirit,3.350000e+08,101604000,3.524500e+10


In [47]:
# Print column dtypes and non-null counts for the merged table.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110 entries, 0 to 109
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   YEAR              110 non-null    int32  
 1   CARRIER           110 non-null    object 
 2   NET_INCOME        110 non-null    float64
 3   TOTAL_PASSENGERS  110 non-null    int64  
 4   TOTAL_RPMS        110 non-null    float64
dtypes: float64(2), int32(1), int64(1), object(1)
memory usage: 4.7+ KB


In [48]:
# Summary statistics for the numeric columns of the merged table.
# Missing values were replaced with 0 in the fillna cells above, so those
# zeros are included in these statistics.
combined_df.describe()

Unnamed: 0,YEAR,NET_INCOME,TOTAL_PASSENGERS,TOTAL_RPMS
count,110.0,110.0,110.0,110.0
mean,2014.0,919590900.0,162617300.0,72947080000.0
std,3.17675,1744065000.0,157897500.0,70980810000.0
min,2009.0,-1970000000.0,5320000.0,4738000000.0
25%,2011.0,111750000.0,34072750.0,13735250000.0
50%,2014.0,252000000.0,78725500.0,34411000000.0
75%,2017.0,1025000000.0,314397000.0,127930500000.0
max,2019.0,10500000000.0,491026000.0,217905000000.0


In [49]:
# Preview the wage summary in its wide layout (one column per year).
# NOTE(review): wages_df is not referenced again in the cells visible here; the
# reshaping cell below re-reads the same file into df.
wages_df = pd.read_csv('Wages_Summary_Cleaned.csv')
wages_df

Unnamed: 0,WAGE_CAT,CARRIER,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,All,Alaska,71780,71588,71745,74091,77415,76413,75289,76338,75390,85641,89280
1,All,Allegiant,41068,49889,55237,53630,54670,65064,59214,64869,75645,73152,72363
2,All,American,62961,63916,63953,64576,68523,70486,73959,82236,89002,90791,93247
3,All,Delta,59629,61814,61021,67021,76485,86138,93441,97950,99416,101149,110039
4,All,Frontier,38379,44403,52251,46083,54795,52667,78445,70675,80136,88137,88133
5,All,Hawaiian,53235,56194,54236,53896,57069,60477,63762,71387,74007,69706,73574
6,All,JetBlue,58563,62811,64531,65676,68676,72742,73328,73834,81896,86896,94167
7,All,Southwest,77996,80408,82847,81313,86087,88416,94107,94063,93623,95599,98878
8,All,Spirit,55786,52094,57360,46475,54040,58384,58852,62951,57010,66103,73019
9,All,United,58239,65011,61214,63215,70075,74445,82277,86371,98980,102069,103852


In [50]:
import pandas as pd

# Read the wide wage summary: one row per (WAGE_CAT, CARRIER), one column per year.
df = pd.read_csv('Wages_Summary_Cleaned.csv')

# Year columns to unpivot, 2009-2019 inclusive (headers are strings in the CSV).
year_columns = [str(year) for year in range(2009, 2020)]

# Unpivot with melt instead of a Python-level iterrows loop.
modified_df = (
    df.melt(
        id_vars=['CARRIER', 'WAGE_CAT'],
        value_vars=year_columns,
        var_name='YEAR',
        value_name='PAY',
        # Keep each source row's label so the original row order can be restored.
        ignore_index=False,
    )
    # melt emits all rows for 2009, then all rows for 2010, ...; a stable sort
    # on the source-row label regroups the output row by row with years ascending.
    .sort_index(kind='stable')
    .reset_index(drop=True)
    # Year labels come out of melt as strings; store them as integers.
    .astype({'YEAR': 'int64'})
    .loc[:, ['YEAR', 'CARRIER', 'WAGE_CAT', 'PAY']]
)

# Save the long-format table; a later cell reloads it as wage_df.
modified_df.to_csv('modified_wage_data.csv', index=False)

# Display the first few rows of the modified DataFrame
print(modified_df.head())


   YEAR CARRIER WAGE_CAT    PAY
0  2009  Alaska      All  71780
1  2010  Alaska      All  71588
2  2011  Alaska      All  71745
3  2012  Alaska      All  74091
4  2013  Alaska      All  77415


In [51]:
# Reload the long-format wage table written by the previous cell. wage_df is
# the name the reorder, sort and merge cells below operate on.
wage_df = pd.read_csv('modified_wage_data.csv')
wage_df

Unnamed: 0,YEAR,CARRIER,WAGE_CAT,PAY
0,2009,Alaska,All,71780
1,2010,Alaska,All,71588
2,2011,Alaska,All,71745
3,2012,Alaska,All,74091
4,2013,Alaska,All,77415
...,...,...,...,...
545,2015,United,Pilot,185102
546,2016,United,Pilot,214090
547,2017,United,Pilot,240514
548,2018,United,Pilot,250319


In [52]:
# Load the monthly baggage records and keep 2009-2019 inclusive.
baggage_df = pd.read_csv('Baggage_Base_Data_Cleaned.csv')
in_window = baggage_df['YEAR'].between(2009, 2019)
baggage_df = baggage_df[in_window]

# Total every numeric column per (YEAR, CARRIER).
yearly_totals = baggage_df.groupby(['YEAR', 'CARRIER']).sum(numeric_only=True)

# A sum of month numbers carries no meaning, so MONTH is discarded before the
# group keys are turned back into ordinary columns.
combined_baggage_df = yearly_totals.drop(columns=['MONTH']).reset_index()

combined_baggage_df

Unnamed: 0,YEAR,CARRIER,MISHANDLED_BAG,TOTAL_BAG
0,2016,Alaska,8133,7348268
1,2016,American,85293,38418243
2,2016,Delta,50487,39457676
3,2016,Frontier,18159,4810390
4,2016,Hawaiian,6752,3235035
5,2016,JetBlue,11897,10551943
6,2016,Southwest,108851,50206095
7,2016,Spirit,12762,8435079
8,2016,United,53548,26115301
9,2017,Alaska,44997,32262578


In [53]:
# Build the cleaned delay table in one pass:
#   1. read the file with its first column as the index, then move that index
#      straight back into a regular column;
#   2. upper-case every header;
#   3. drop the original CARRIER column and let CARRIER_NAME take over the
#      CARRIER label;
#   4. keep 2009-2019 inclusive.
delay_df = (
    pd.read_csv('delays_cleaned_MB.csv', index_col=0)
    .reset_index()
    .rename(columns=str.upper)
    .drop(columns=['CARRIER'])
    .rename(columns={'CARRIER_NAME': 'CARRIER'})
    .loc[lambda frame: frame['YEAR'].between(2009, 2019)]
)

# Display the modified DataFrame
display(delay_df)


Unnamed: 0,YEAR,MONTH,CARRIER,AIRPORT,AIRPORT_NAME,ARR_FLIGHTS,ARR_DEL15,CARRIER_CT,WEATHER_CT,NAS_CT,...,SECURITY_CT_FRAC,LATE_AIRCRAFT_CT_FRAC,WEATHER_CT_FRAC,NAS_DELAY_AVG,CARRIER_DELAY_AVG,SECURITY_DELAY_AVG,LATE_AIRCRAFT_DELAY_AVG,WEATHER_DELAY_AVG,STATE_CODE,REGION
805,2018,1,Southwest,PHX,"Phoenix, AZ: Phoenix Sky Harbor International",5067.0,1640.0,207.38,8.22,1061.83,...,0.000266,0.071287,0.001622,37.630317,51.755232,106.666667,48.564547,90.145985,AZ,West
806,2017,5,Southwest,LAX,"Los Angeles, CA: Los Angeles International",3671.0,1711.0,234.32,50.63,1030.82,...,0.000207,0.107453,0.013792,42.164490,50.734039,21.052632,58.317700,48.153269,CA,West
807,2018,2,Southwest,PHX,"Phoenix, AZ: Phoenix Sky Harbor International",4534.0,1907.0,366.00,15.67,1004.73,...,0.000986,0.113835,0.003456,41.789336,52.297814,33.109620,50.690717,93.618379,AZ,West
809,2016,10,Southwest,PHX,"Phoenix, AZ: Phoenix Sky Harbor International",4963.0,1533.0,191.84,21.19,921.51,...,0.000000,0.080286,0.004270,39.687035,48.175563,0.000000,49.194398,54.082114,AZ,West
811,2014,10,Southwest,MDW,"Chicago, IL: Chicago Midway International",7125.0,1742.0,296.49,41.53,840.73,...,0.000000,0.079054,0.005829,37.749337,52.308678,0.000000,56.004332,47.146641,IL,Midwest
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123119,2012,11,Southwest,CRP,"Corpus Christi, TX: Corpus Christi International",130.0,13.0,6.59,1.00,0.00,...,0.000000,0.041615,0.007692,0.000000,30.955994,0.000000,37.523105,23.000000,TX,South
123120,2012,10,Southwest,DAY,"Dayton, OH: James M Cox/Dayton International",31.0,1.0,1.00,0.00,0.00,...,0.000000,0.000000,0.000000,0.000000,44.000000,0.000000,0.000000,0.000000,OH,Midwest
123121,2012,9,Southwest,DSM,"Des Moines, IA: Des Moines International",2.0,0.0,0.00,0.00,0.00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,IA,Midwest
123122,2011,12,Southwest,CHS,"Charleston, SC: Charleston AFB/International",217.0,16.0,6.87,0.00,0.00,...,0.000000,0.042074,0.000000,0.000000,48.908297,0.000000,50.273823,0.000000,SC,South


In [54]:
# Average every numeric column per (YEAR, CARRIER, REGION).
# NOTE(review): this is an unweighted mean over the input rows — confirm that
# is appropriate for the count and fraction columns.
region_means = delay_df.groupby(['YEAR', 'CARRIER', 'REGION']).mean(numeric_only=True)

# An averaged month number carries no meaning, so MONTH is discarded before
# the group keys are turned back into ordinary columns.
combined_delay_df = region_means.drop(columns=['MONTH']).reset_index()

combined_delay_df

Unnamed: 0,YEAR,CARRIER,REGION,ARR_FLIGHTS,ARR_DEL15,CARRIER_CT,WEATHER_CT,NAS_CT,SECURITY_CT,LATE_AIRCRAFT_CT,...,NAS_CT_FRAC,CARRIER_CT_FRAC,SECURITY_CT_FRAC,LATE_AIRCRAFT_CT_FRAC,WEATHER_CT_FRAC,NAS_DELAY_AVG,CARRIER_DELAY_AVG,SECURITY_DELAY_AVG,LATE_AIRCRAFT_DELAY_AVG,WEATHER_DELAY_AVG
0,2009,Alaska,Midwest,93.833333,23.208333,4.392500,0.571667,17.144583,0.049583,1.050417,...,0.178275,0.049838,0.000371,0.010948,0.005766,36.917222,46.616879,1.785375,16.630962,36.643273
1,2009,Alaska,Northeast,68.291667,14.000000,2.579583,0.135417,10.765000,0.000000,0.520417,...,0.163275,0.037775,0.000000,0.008275,0.002148,44.997065,49.745766,0.000000,17.388794,9.377493
2,2009,Alaska,South,60.783333,9.450000,2.554833,0.229833,5.715833,0.026833,0.922833,...,0.090190,0.040608,0.000488,0.013161,0.003356,24.342403,37.919719,1.491968,20.385473,12.964425
3,2009,Alaska,West,275.549894,43.280255,11.678875,1.194522,16.988811,0.183609,13.234607,...,0.056466,0.049306,0.000508,0.047745,0.007640,30.031137,50.483057,3.920992,41.440563,30.180495
4,2009,American,Midwest,791.783333,147.375000,40.529750,6.243750,57.668000,0.126083,42.808000,...,0.047314,0.081850,0.000034,0.057720,0.012164,41.846255,53.046283,5.461556,64.871274,69.656644
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,2019,United,Midwest,457.792035,93.942478,18.359735,3.736593,36.757389,0.010398,35.077876,...,0.054534,0.053849,0.000009,0.108022,0.011398,44.433518,61.742445,0.485771,79.204355,81.250498
419,2019,United,Northeast,594.753165,155.588608,24.630696,4.404810,81.992152,0.022975,44.538671,...,0.082636,0.058825,0.000033,0.102035,0.010353,52.113391,60.218261,2.098103,84.505184,103.573889
420,2019,United,Other,70.633333,12.783333,4.613833,0.502167,4.024333,0.000000,3.644167,...,0.053264,0.070763,0.000000,0.061060,0.006754,35.712539,62.568469,0.000000,58.611747,34.239360
421,2019,United,South,479.496000,94.741333,22.030853,4.026480,31.741147,0.024480,36.918373,...,0.054914,0.050664,0.000018,0.091886,0.010134,42.480042,60.998319,1.602660,78.675699,88.670380


In [55]:
# Load the employee table. The first CSV column is read as the index, moved
# back into a column named 'index', and then discarded; OTHER and TOTAL get
# more descriptive labels.
# NOTE(review): the file is named passenger_employee_ratio_cleaned.csv —
# confirm whether these columns hold ratios rather than headcounts before
# relying on the *_EMPLOYEES names.
employee_df = (
    pd.read_csv('passenger_employee_ratio_cleaned.csv', index_col=0)
    .reset_index()
    .drop(columns=['index'])
    .rename(columns={'OTHER': 'OTHER_EMPLOYEES', 'TOTAL': 'TOTAL_EMPLOYEES'})
)

# One placeholder row per year for Hawaiian: only YEAR and CARRIER are set,
# every other column is NaN after the reindex.
# (presumably the source file has no Hawaiian rows — TODO confirm)
years = employee_df['YEAR'].unique()
hawaiian_df = pd.DataFrame({'YEAR': years, 'CARRIER': 'Hawaiian'}).reindex(
    columns=employee_df.columns
)

# Append the placeholders, normalise the 'Jetblue' spelling to 'JetBlue', and
# order the rows by YEAR then CARRIER (row labels are left as assigned by concat).
employee_df = (
    pd.concat([employee_df, hawaiian_df], ignore_index=True)
    .assign(CARRIER=lambda frame: frame['CARRIER'].replace('Jetblue', 'JetBlue'))
    .sort_values(by=['YEAR', 'CARRIER'])
)

display(employee_df)

Unnamed: 0,YEAR,PILOTS_CO-PILOTS,FLIGHT_ATTENDANTS,MAINTENANCE,VARIOUS_HANDLERS,OTHER_EMPLOYEES,TOTAL_EMPLOYEES,CARRIER
76,2009,12590.0,7022.0,22198.0,4594.0,11417.0,1746.0,Alaska
65,2009,16470.0,12975.0,33884.0,11083.0,34770.0,3470.0,Allegiant
0,2009,10601.0,5709.0,6819.0,3574.0,12684.0,1290.0,American
11,2009,10310.0,6038.0,12771.0,4415.0,9046.0,1463.0,Delta
54,2009,14839.0,10631.0,39697.0,3669.0,23230.0,1971.0,Frontier
...,...,...,...,...,...,...,...,...
108,2019,,,,,,,Hawaiian
53,2019,35977.0,26459.0,141689.0,19284.0,54732.0,6985.0,JetBlue
42,2019,52914.0,30735.0,191650.0,21470.0,79630.0,8378.0,Southwest
97,2019,42673.0,25262.0,229873.0,274605.0,252119.0,11282.0,Spirit


In [62]:
def _keys_first(frame):
    """Return ``frame`` with 'YEAR' and 'CARRIER' as its two leading columns.

    All remaining columns follow in their existing relative order.
    """
    key_columns = ['YEAR', 'CARRIER']
    trailing = [col for col in frame.columns if col not in key_columns]
    return frame[key_columns + trailing]


# Apply the same column layout to every table that takes part in the final merge.
combined_df = _keys_first(combined_df)
wage_df = _keys_first(wage_df)
employee_df = _keys_first(employee_df)
combined_delay_df = _keys_first(combined_delay_df)
combined_baggage_df = _keys_first(combined_baggage_df)

In [57]:
# Order every table by YEAR, then CARRIER alphabetically (ascending is the
# sort_values default for both keys).
sort_keys = ['YEAR', 'CARRIER']

combined_df = combined_df.sort_values(by=sort_keys)
wage_df = wage_df.sort_values(by=sort_keys)
employee_df = employee_df.sort_values(by=sort_keys)
combined_delay_df = combined_delay_df.sort_values(by=sort_keys)
combined_baggage_df = combined_baggage_df.sort_values(by=sort_keys)

In [58]:
# Alphabetically sorted distinct CARRIER labels, keyed by the variable name of
# the frame they come from.
carrier_names = {
    label: sorted(set(frame['CARRIER'].unique()))
    for label, frame in {
        'employee_df': employee_df,
        'wage_df': wage_df,
        'combined_df': combined_df,
        'combined_delay_df': combined_delay_df,
        'combined_baggage_df': combined_baggage_df,
    }.items()
}

# The frames agree only when every sorted list is identical.
name_lists = list(carrier_names.values())
if all(names == name_lists[0] for names in name_lists):
    print("All CARRIER names match across the dataframes.")
else:
    print("CARRIER names do not match across the dataframes.")

print()
print()
# List what each frame contains.
for dataframe, carriers in carrier_names.items():
    print(f"Unique CARRIER names in {dataframe}: {carriers}")
print()
print()
# For each frame, report labels that appear in none of the other frames.
# A label missing from only some of the other frames is not reported here.
for dataframe, carriers in carrier_names.items():
    seen_elsewhere = set().union(
        *(names for other, names in carrier_names.items() if other != dataframe)
    )
    print(f"Unique carrier names in {dataframe} but not in other dataframes:", set(carriers) - seen_elsewhere)


All CARRIER names match across the dataframes.


Unique CARRIER names in employee_df: ['Alaska', 'Allegiant', 'American', 'Delta', 'Frontier', 'Hawaiian', 'JetBlue', 'Southwest', 'Spirit', 'United']
Unique CARRIER names in wage_df: ['Alaska', 'Allegiant', 'American', 'Delta', 'Frontier', 'Hawaiian', 'JetBlue', 'Southwest', 'Spirit', 'United']
Unique CARRIER names in combined_df: ['Alaska', 'Allegiant', 'American', 'Delta', 'Frontier', 'Hawaiian', 'JetBlue', 'Southwest', 'Spirit', 'United']
Unique CARRIER names in combined_delay_df: ['Alaska', 'Allegiant', 'American', 'Delta', 'Frontier', 'Hawaiian', 'JetBlue', 'Southwest', 'Spirit', 'United']
Unique CARRIER names in combined_baggage_df: ['Alaska', 'Allegiant', 'American', 'Delta', 'Frontier', 'Hawaiian', 'JetBlue', 'Southwest', 'Spirit', 'United']


Unique carrier names in employee_df but not in other dataframes: set()
Unique carrier names in wage_df but not in other dataframes: set()
Unique carrier names in combined_df but not in othe

In [59]:
# Every join below is an outer join on the same two keys, followed by a
# re-sort on those keys.
join_keys = ['YEAR', 'CARRIER']

# NOTE(review): wage_df holds several WAGE_CAT rows per carrier-year and
# combined_delay_df holds one row per REGION, so the result repeats each wage
# row once per region (and repeats the carrier-year metrics on every such row).
# Do not sum columns such as NET_INCOME over final_merge_df without
# de-duplicating first.

# wages + employees
merged_df = wage_df.merge(employee_df, on=join_keys, how='outer').sort_values(by=join_keys)

# + income / passengers / RPMs
more_merged_df = merged_df.merge(combined_df, on=join_keys, how='outer').sort_values(by=join_keys)

# + regional delay averages
even_more_merged_df = more_merged_df.merge(
    combined_delay_df, on=join_keys, how='outer'
).sort_values(by=join_keys)

# + yearly baggage totals
final_merge_df = even_more_merged_df.merge(
    combined_baggage_df, on=join_keys, how='outer'
).sort_values(by=join_keys)

# Persist the combined table without the row index.
final_merge_df.to_csv('omni_data.csv', index=False)


In [60]:
# Display the fully merged table that was just written to omni_data.csv.
final_merge_df

Unnamed: 0,YEAR,CARRIER,WAGE_CAT,PAY,PILOTS_CO-PILOTS,FLIGHT_ATTENDANTS,MAINTENANCE,VARIOUS_HANDLERS,OTHER_EMPLOYEES,TOTAL_EMPLOYEES,...,SECURITY_CT_FRAC,LATE_AIRCRAFT_CT_FRAC,WEATHER_CT_FRAC,NAS_DELAY_AVG,CARRIER_DELAY_AVG,SECURITY_DELAY_AVG,LATE_AIRCRAFT_DELAY_AVG,WEATHER_DELAY_AVG,MISHANDLED_BAG,TOTAL_BAG
0,2009,Alaska,All,71780,12590.0,7022.0,22198.0,4594.0,11417.0,1746.0,...,0.000371,0.010948,0.005766,36.917222,46.616879,1.785375,16.630962,36.643273,,
1,2009,Alaska,All,71780,12590.0,7022.0,22198.0,4594.0,11417.0,1746.0,...,0.000000,0.008275,0.002148,44.997065,49.745766,0.000000,17.388794,9.377493,,
2,2009,Alaska,All,71780,12590.0,7022.0,22198.0,4594.0,11417.0,1746.0,...,0.000488,0.013161,0.003356,24.342403,37.919719,1.491968,20.385473,12.964425,,
3,2009,Alaska,All,71780,12590.0,7022.0,22198.0,4594.0,11417.0,1746.0,...,0.000508,0.047745,0.007640,30.031137,50.483057,3.920992,41.440563,30.180495,,
4,2009,Alaska,Attendant,35007,12590.0,7022.0,22198.0,4594.0,11417.0,1746.0,...,0.000371,0.010948,0.005766,36.917222,46.616879,1.785375,16.630962,36.643273,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,2019,United,Pilot,257376,29817.0,14945.0,45626.0,8444.0,64582.0,3874.0,...,0.000009,0.108022,0.011398,44.433518,61.742445,0.485771,79.204355,81.250498,544338.0,80044379.0
2186,2019,United,Pilot,257376,29817.0,14945.0,45626.0,8444.0,64582.0,3874.0,...,0.000033,0.102035,0.010353,52.113391,60.218261,2.098103,84.505184,103.573889,544338.0,80044379.0
2187,2019,United,Pilot,257376,29817.0,14945.0,45626.0,8444.0,64582.0,3874.0,...,0.000000,0.061060,0.006754,35.712539,62.568469,0.000000,58.611747,34.239360,544338.0,80044379.0
2188,2019,United,Pilot,257376,29817.0,14945.0,45626.0,8444.0,64582.0,3874.0,...,0.000018,0.091886,0.010134,42.480042,60.998319,1.602660,78.675699,88.670380,544338.0,80044379.0


In [61]:
# Print column dtypes and non-null counts for the fully merged table.
final_merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2190 entries, 0 to 2189
Data columns (total 46 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   YEAR                     2190 non-null   int64  
 1   CARRIER                  2190 non-null   object 
 2   WAGE_CAT                 2190 non-null   object 
 3   PAY                      2190 non-null   int64  
 4   PILOTS_CO-PILOTS         2035 non-null   float64
 5   FLIGHT_ATTENDANTS        2035 non-null   float64
 6   MAINTENANCE              2035 non-null   float64
 7   VARIOUS_HANDLERS         2035 non-null   float64
 8   OTHER_EMPLOYEES          2035 non-null   float64
 9   TOTAL_EMPLOYEES          2035 non-null   float64
 10  NET_INCOME               2190 non-null   float64
 11  TOTAL_PASSENGERS         2190 non-null   int64  
 12  TOTAL_RPMS               2190 non-null   float64
 13  REGION                   2115 non-null   object 
 14  ARR_FLIGHTS             

In [63]:
# Re-read the source files from disk. This rebinds wage_df, employee_df,
# delay_df and baggage_df, replacing the versions prepared in the earlier cells;
# none of the clean-up steps applied there (column renames, Hawaiian placeholder
# rows, header upper-casing, 2009-2019 filtering) are repeated here.
wage_df = pd.read_csv('modified_wage_data.csv')

# First CSV column becomes the index (no reset_index here, unlike the earlier cell).
employee_df = pd.read_csv('passenger_employee_ratio_cleaned.csv', index_col=0)

# First CSV column becomes the index (no reset_index here, unlike the earlier cell).
delay_df = pd.read_csv('delays_cleaned_MB.csv', index_col=0)

baggage_df = pd.read_csv('Baggage_Base_Data_Cleaned.csv')


