In [4]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import regex as re
import json

In [85]:
# Load the consumer airfare table from the CSV file in the working directory.
airfares = pd.read_csv("ConsumerAirfares.csv")
# Print the per-column dtype / non-null summary to spot columns that need cleaning.
airfares.info()
# Last expression of the cell: render the first rows as the cell output.
airfares.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118035 entries, 0 to 118034
Data columns (total 26 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Year                      118035 non-null  int64  
 1   quarter                   118035 non-null  int64  
 2   citymarketid_1            118035 non-null  object 
 3   citymarketid_2            118035 non-null  int64  
 4   city1                     118035 non-null  object 
 5   city2                     118035 non-null  object 
 6   nsmiles                   118035 non-null  int64  
 7   passengers                118035 non-null  object 
 8   fare                      118035 non-null  object 
 9   carrier_lg                118035 non-null  object 
 10  large_ms                  118035 non-null  float64
 11  fare_lg                   118035 non-null  object 
 12  carrier_low               118032 non-null  object 
 13  lf_ms                     118032 non-null  f

Unnamed: 0,Year,quarter,citymarketid_1,citymarketid_2,city1,city2,nsmiles,passengers,fare,carrier_lg,...,Geocoded_City1,Geocoded_City1 (address),Geocoded_City1 (city),Geocoded_City1 (state),Geocoded_City1 (zip),Geocoded_City2,Geocoded_City2 (address),Geocoded_City2 (city),Geocoded_City2 (state),Geocoded_City2 (zip)
0,2025,2,32467,31703,"Miami, FL (Metropolitan Area)","New York City, NY (Metropolitan Area)",1118,17955,$208.52,B6,...,,,,,,,,,,
1,2025,2,32575,32457,"Los Angeles, CA (Metropolitan Area)","San Francisco, CA (Metropolitan Area)",372,17310,$157.68,WN,...,,,,,,,,,,
2,2025,2,32575,31703,"Los Angeles, CA (Metropolitan Area)","New York City, NY (Metropolitan Area)",2510,13648,$430.38,DL,...,,,,,,,,,,
3,2025,2,31703,31454,"New York City, NY (Metropolitan Area)","Orlando, FL",989,12627,$186.50,B6,...,,,,,,,,,,
4,2025,2,30977,31703,"Chicago, IL","New York City, NY (Metropolitan Area)",773,11284,$221.33,UA,...,,,,,,,,,,


In [86]:
# Drop columns that are duplicates or unnecessary.
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
airfares = airfares.drop(
    columns=[
        "table_1_flag",
        "Geocoded_City1",
        "Geocoded_City2",
        "Geocoded_City1 (city)",
        "Geocoded_City2 (city)",
    ],
    errors="ignore",
)

### Inconsistency: lf_ms, carrier_low and fare_low are needed but contain a few NaN values;
### fill those with 0 so the columns survive the column-wise dropna below.
# NOTE(review): carrier_low and fare_low are string (object) columns, so the filled
# rows hold the integer 0 as a sentinel -- confirm downstream code expects that.
airfares["lf_ms"] = airfares["lf_ms"].fillna(0)
airfares["carrier_low"] = airfares["carrier_low"].fillna(0)
airfares["fare_low"] = airfares["fare_low"].fillna(0)

# Remove every column that still contains at least one NaN. In the recorded run this
# is what takes the frame from the remaining columns down to the 15 shown by info().
airfares = airfares.dropna(axis=1)

### Inconsistency: citymarketid_1 was an object column with thousands separators (e.g. "32,467")
### whereas citymarketid_2 was a column of ints; convert citymarketid_1 to ints as well.
# astype(str) first so the .str accessor also works when the column is already int
# (i.e. when this cell is executed again); regex=False makes the literal match explicit.
airfares["citymarketid_1"] = (
    airfares["citymarketid_1"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype(int)
)
airfares.info()
airfares

    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118035 entries, 0 to 118034
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Year            118035 non-null  int64  
 1   quarter         118035 non-null  int64  
 2   citymarketid_1  118035 non-null  int64  
 3   citymarketid_2  118035 non-null  int64  
 4   city1           118035 non-null  object 
 5   city2           118035 non-null  object 
 6   nsmiles         118035 non-null  int64  
 7   passengers      118035 non-null  object 
 8   fare            118035 non-null  object 
 9   carrier_lg      118035 non-null  object 
 10  large_ms        118035 non-null  float64
 11  fare_lg         118035 non-null  object 
 12  carrier_low     118035 non-null  object 
 13  lf_ms           118035 non-null  float64
 14  fare_low        118035 non-null  object 
dtypes: float64(2), int64(5), object(8)
memory usage: 13.5+ MB


Unnamed: 0,Year,quarter,citymarketid_1,citymarketid_2,city1,city2,nsmiles,passengers,fare,carrier_lg,large_ms,fare_lg,carrier_low,lf_ms,fare_low
0,2025,2,32467,31703,"Miami, FL (Metropolitan Area)","New York City, NY (Metropolitan Area)",1118,17955,$208.52,B6,0.2551,$191.48,B6,0.2551,$191.48
1,2025,2,32575,32457,"Los Angeles, CA (Metropolitan Area)","San Francisco, CA (Metropolitan Area)",372,17310,$157.68,WN,0.5006,$169.03,AS,0.1193,$140.59
2,2025,2,32575,31703,"Los Angeles, CA (Metropolitan Area)","New York City, NY (Metropolitan Area)",2510,13648,$430.38,DL,0.2535,$526.21,B6,0.2272,$365.63
3,2025,2,31703,31454,"New York City, NY (Metropolitan Area)","Orlando, FL",989,12627,$186.50,B6,0.3735,$186.10,B6,0.3735,$186.10
4,2025,2,30977,31703,"Chicago, IL","New York City, NY (Metropolitan Area)",773,11284,$221.33,UA,0.4328,$238.62,AA,0.2426,$217.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118030,1996,1,33495,31454,"New Orleans, LA","Orlando, FL",550,111,$138.88,DL,0.2600,$174.49,NW,0.1400,$112.40
118031,1996,1,30647,31995,"Cleveland, OH (Metropolitan Area)","Greensboro/High Point, NC",381,111,$199.92,CO,0.8400,$193.76,CO,0.8400,$193.76
118032,1996,1,30158,31454,"Atlantic City, NJ","Orlando, FL",852,111,$95.23,NK,0.9300,$91.49,NK,0.9300,$91.49
118033,1996,1,33244,33495,"Memphis, TN","New Orleans, LA",349,110,$201.52,NW,0.8800,$204.78,J7,0.0400,$121.56


In [103]:
### Minor inconsistencies: passengers, fare, fare_lg and fare_low were stored as text;
### convert passengers to int and the fare columns to floats rounded to two decimals
### so later calculations are straightforward.

# Passenger counts use "," as a thousands separator. astype(str) keeps the cell
# re-runnable once the column is numeric; regex=False forces a literal match.
airfares["passengers"] = (
    airfares["passengers"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype(int)
)

# The three fare columns share one format ("$208.52"), so parse them the same way.
# "$" is a regex metacharacter and the default of `regex` differs between pandas
# versions, so the pattern is spelled out as an explicit character class. It also
# strips "," so a fare with a thousands separator cannot break the float conversion.
for fare_col in ("fare", "fare_lg", "fare_low"):
    airfares[fare_col] = (
        airfares[fare_col]
        .astype(str)
        .str.replace(r"[$,]", "", regex=True)
        .astype(float)
        .round(2)
    )

In [104]:
# Render the frame as the cell output to inspect the result of the type conversions.
airfares

Unnamed: 0,Year,quarter,citymarketid_1,citymarketid_2,city1,city2,nsmiles,passengers,fare,carrier_lg,large_ms,fare_lg,carrier_low,lf_ms,fare_low
0,2025,2,32467,31703,"Miami, FL (Metropolitan Area)","New York City, NY (Metropolitan Area)",1118,17955,208.52,B6,0.2551,191.48,B6,0.2551,191.48
1,2025,2,32575,32457,"Los Angeles, CA (Metropolitan Area)","San Francisco, CA (Metropolitan Area)",372,17310,157.68,WN,0.5006,169.03,AS,0.1193,140.59
2,2025,2,32575,31703,"Los Angeles, CA (Metropolitan Area)","New York City, NY (Metropolitan Area)",2510,13648,430.38,DL,0.2535,526.21,B6,0.2272,365.63
3,2025,2,31703,31454,"New York City, NY (Metropolitan Area)","Orlando, FL",989,12627,186.50,B6,0.3735,186.10,B6,0.3735,186.10
4,2025,2,30977,31703,"Chicago, IL","New York City, NY (Metropolitan Area)",773,11284,221.33,UA,0.4328,238.62,AA,0.2426,217.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118030,1996,1,33495,31454,"New Orleans, LA","Orlando, FL",550,111,138.88,DL,0.2600,174.49,NW,0.1400,112.40
118031,1996,1,30647,31995,"Cleveland, OH (Metropolitan Area)","Greensboro/High Point, NC",381,111,199.92,CO,0.8400,193.76,CO,0.8400,193.76
118032,1996,1,30158,31454,"Atlantic City, NJ","Orlando, FL",852,111,95.23,NK,0.9300,91.49,NK,0.9300,91.49
118033,1996,1,33244,33495,"Memphis, TN","New Orleans, LA",349,110,201.52,NW,0.8800,204.78,J7,0.0400,121.56
