In [11]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
import pandas as pd
import os

# Locate the two dataset files: prefer a complete local copy under ./data/,
# otherwise download the dataset from Kaggle. Afterwards print each file's
# size and its first few raw lines.
local_data_path = "./data/"
airline_file = "airline.csv.shuffle"
carriers_file = "carriers.csv"

# Encoding used when peeking at the raw files. latin-1 maps every byte to a
# character, so the preview cannot raise UnicodeDecodeError whatever the
# platform's default text encoding is.
preview_encoding = "latin-1"


def _preview_file(file_path, size_label, display_name, n_lines):
    """Print the size and the first ``n_lines`` raw lines of ``file_path``.

    Args:
        file_path: Path of the file to inspect.
        size_label: Prefix of the "<label> file size" message.
        display_name: File name shown in the "First N lines of ..." message.
        n_lines: Number of lines to read from the start of the file.

    Returns:
        ``(size_in_bytes, first_lines)``, or ``(None, None)`` when the file
        does not exist (nothing is printed in that case).
    """
    if not os.path.exists(file_path):
        return None, None

    size_bytes = os.path.getsize(file_path)
    print(f"{size_label} file size: {size_bytes / (1024*1024):.2f} MB")

    # readline() returns '' past end-of-file, so short files simply yield
    # empty trailing entries instead of raising.
    with open(file_path, 'r', encoding=preview_encoding) as f:
        first_lines = [f.readline().strip() for _ in range(n_lines)]
    print(f"First {n_lines} lines of {display_name}:")
    for i, line in enumerate(first_lines, 1):
        print(f"Line {i}: {line}")
    return size_bytes, first_lines


# Decide whether the local folder can be used. isdir() (rather than exists())
# keeps os.listdir from failing when the path is a regular file.
if os.path.isdir(local_data_path):
    local_files = os.listdir(local_data_path)
    print(f"Files found in local data folder: {local_files}")

    airline_exists = airline_file in local_files
    carriers_exists = carriers_file in local_files
    missing_files = [
        name
        for name, present in ((airline_file, airline_exists), (carriers_file, carriers_exists))
        if not present
    ]

    use_local_data = not missing_files
    if use_local_data:
        print("Both files found locally, using local data...")
    else:
        print(f"Missing local files: {missing_files}, downloading from Kaggle...")
else:
    use_local_data = False
    print("Local data folder doesn't exist, downloading from Kaggle...")

# Single download site shared by every "not available locally" case.
if use_local_data:
    path = local_data_path
else:
    path = kagglehub.dataset_download("bulter22/airline-data")
    print(f"Dataset downloaded to: {path}")

airline_file_path = os.path.join(path, airline_file)
carriers_file_path = os.path.join(path, carriers_file)

print(f"Dataset path: {path}")
print(f"Airline file: {airline_file_path}")
print(f"Carriers file: {carriers_file_path}")

# Check file sizes and show the first raw lines of each file
airline_size, airline_first_lines = _preview_file(airline_file_path, "Airline", airline_file, 3)
carriers_size, carriers_first_lines = _preview_file(carriers_file_path, "Carriers", carriers_file, 5)

Files found in local data folder: ['airline.csv.shuffle', 'carriers.csv']
Both files found locally, using local data...
Dataset path: ./data/
Airline file: ./data/airline.csv.shuffle
Carriers file: ./data/carriers.csv
Airline file size: 11471.95 MB
First 3 lines of airline.csv.shuffle:
Line 1: ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,DayOfWeek,DayofMonth,DepDelay,DepTime,Dest,Distance,Diverted,FlightNum,LateAircraftDelay,Month,NASDelay,Origin,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,Year
Line 2: 53,32,-8,1642,1650,1545,65,NA,0,NA,4,10,4,1549,PIT,205,0,209,NA,10,NA,DCA,NA,N443US,7,14,US,NA,2002
Line 3: 164,155,-11,1754,1805,1610,175,NA,0,NA,4,2,0,1610,MCI,1072,0,109,NA,12,NA,MCO,NA,N755,2,7,WN,NA,1999
Carriers file size: 0.04 MB
First 5 lines of carriers.csv:
Line 1: Code,Description
Line 2: "02Q","Titan Airways"
Line 3: "04Q","Tradewind Aviation"
Line 4: "05Q","Comlux Aviation, AG"
Lin

In [13]:
# Load both datasets with smart sampling for large files
import psutil
import random
import numpy as np

print("=== LOADING DATASETS WITH SMART SAMPLING ===")

def get_memory_info():
    """Return a snapshot of system RAM usage.

    The result is a dict with ``total_gb``, ``available_gb`` and ``used_gb``
    (values from ``psutil.virtual_memory()`` divided by 1024**3) plus
    ``percent`` (psutil's usage percentage, passed through unchanged).
    """
    bytes_per_gb = 1024**3
    vm = psutil.virtual_memory()

    snapshot = {}
    for key, raw_bytes in (
        ('total_gb', vm.total),
        ('available_gb', vm.available),
        ('used_gb', vm.used),
    ):
        snapshot[key] = raw_bytes / bytes_per_gb
    snapshot['percent'] = vm.percent
    return snapshot

def smart_load_csv(file_path, dataset_name, sample_size='auto'):
    """
    Load a CSV into a DataFrame, reading only the leading rows of large files.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    dataset_name : str
        Label used in the progress messages printed by this function.
    sample_size : str or int, default 'auto'
        - 'auto': choose from the file size (> 10 GB -> 'analysis',
          > 1 GB -> 'dev', otherwise 'full')
        - 'quick': 10,000 rows for quick testing
        - 'dev': 100,000 rows for development
        - 'analysis': 500,000 rows for analysis
        - integer: specific number of rows
        - 'full': load entire file (use with caution)
        Any other value falls back to 100,000 rows.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or None when the file cannot be read at all
        (including when it does not exist). Callers are expected to check
        for None.

    Notes
    -----
    The "sample" is the first ``n`` rows of the file (``nrows``), not a random
    sample.
    """

    # Check file size. A missing/unreadable file is reported the same way as
    # every other load failure (message + None) instead of raising.
    try:
        file_size_bytes = os.path.getsize(file_path)
    except OSError as e:
        print(f"All methods failed for {dataset_name}: {e}")
        return None
    file_size_gb = file_size_bytes / (1024**3)

    print(f"\n{dataset_name} file size: {file_size_gb:.2f} GB")

    # Get memory info (informational only; it does not influence the sampling)
    mem_info = get_memory_info()
    print(f"Available memory: {mem_info['available_gb']:.2f} GB")

    # Determine sample size
    if sample_size == 'auto':
        if file_size_gb > 10.0:
            sample_size = 'analysis'  # 500K rows for very large files
        elif file_size_gb > 1.0:
            sample_size = 'dev'  # 100K rows for moderately large files
        else:
            sample_size = 'full'

    # Convert sample size to number
    size_map = {
        'quick': 10000,
        'dev': 100000,
        'analysis': 500000
    }

    if isinstance(sample_size, str) and sample_size in size_map:
        n_rows = size_map[sample_size]
        print(f"Using {sample_size} sampling: {n_rows:,} rows")
    elif isinstance(sample_size, int):
        n_rows = sample_size
        print(f"Using custom sampling: {n_rows:,} rows")
    elif sample_size == 'full':
        n_rows = None
        print("Loading full dataset...")
    else:
        n_rows = 100000  # Default fallback
        print(f"Using default sampling: {n_rows:,} rows")

    # Encodings to try, in order. Only latin-1 is attempted here; other
    # candidates would be 'utf-8', 'iso-8859-1', 'cp1252'.
    encodings = ['latin-1']

    for encoding in encodings:
        try:
            if n_rows is None:
                print(f"Trying {encoding} encoding...")
            else:
                print(f"Trying {encoding} encoding with {n_rows:,} rows...")
            # nrows=None is read_csv's default and means "read the whole
            # file", so one call covers both the full and the sampled load.
            df = pd.read_csv(file_path, encoding=encoding, nrows=n_rows)

            print(f"✓ {dataset_name} loaded successfully with {encoding}: {df.shape}")
            print(f"Columns: {list(df.columns)}")

            # Memory usage of loaded dataframe
            df_memory_mb = df.memory_usage(deep=True).sum() / (1024**2)
            print(f"DataFrame memory usage: {df_memory_mb:.2f} MB")

            return df

        except UnicodeDecodeError:
            print(f"  {encoding} failed due to encoding issues")
            continue
        except Exception as e:
            # Deliberate best-effort: report and move on to the next strategy.
            print(f"  {encoding} failed: {str(e)[:100]}...")
            continue

    # Final fallback with error replacement
    try:
        print(f"Trying utf-8 with error handling...")
        df = pd.read_csv(file_path, encoding='utf-8', encoding_errors='replace', nrows=n_rows)
        print(f"✓ {dataset_name} loaded with error replacement: {df.shape}")
        return df
    except Exception as e:
        print(f"All methods failed for {dataset_name}: {e}")
        return None

# Report RAM before loading so the cost of the load can be shown afterwards
print("=== INITIAL MEMORY STATE ===")
initial_memory = get_memory_info()
print(
    f"Total RAM: {initial_memory['total_gb']:.2f} GB",
    f"Available: {initial_memory['available_gb']:.2f} GB",
    f"Used: {initial_memory['used_gb']:.2f} GB ({initial_memory['percent']:.1f}%)",
    sep="\n",
)

# Airline file: let the loader pick a sample size from the file size
df_airline = smart_load_csv(airline_file_path, "Airline data", sample_size='auto')

# Carriers file: small lookup table, so read all of it
df_carriers = smart_load_csv(carriers_file_path, "Carriers data", sample_size='full')

# Report RAM again and the difference in "used" relative to the first snapshot
print("\n=== MEMORY STATE AFTER LOADING ===")
final_memory = get_memory_info()
print(
    f"Available: {final_memory['available_gb']:.2f} GB",
    f"Used: {final_memory['used_gb']:.2f} GB ({final_memory['percent']:.1f}%)",
    sep="\n",
)
memory_used = final_memory['used_gb'] - initial_memory['used_gb']
print(f"Additional memory used: {memory_used:.2f} GB")

# Show the head and dtypes of each frame that loaded (the loader returns None on failure)
if df_airline is not None:
    print(
        f"\n=== AIRLINE DATA SAMPLE (Shape: {df_airline.shape}) ===",
        df_airline.head(),
        "\nData types:",
        df_airline.dtypes,
        sep="\n",
    )

if df_carriers is not None:
    print(
        f"\n=== CARRIERS DATA SAMPLE (Shape: {df_carriers.shape}) ===",
        df_carriers.head(),
        "\nData types:",
        df_carriers.dtypes,
        sep="\n",
    )

=== LOADING DATASETS WITH SMART SAMPLING ===
=== INITIAL MEMORY STATE ===
Total RAM: 14.77 GB
Available: 7.79 GB
Used: 6.98 GB (47.2%)

Airline data file size: 11.20 GB
Available memory: 7.79 GB
Using analysis sampling: 500,000 rows
Trying latin-1 encoding with 500,000 rows...
✓ Airline data loaded successfully with latin-1: (500000, 29)
Columns: ['ActualElapsedTime', 'AirTime', 'ArrDelay', 'ArrTime', 'CRSArrTime', 'CRSDepTime', 'CRSElapsedTime', 'CancellationCode', 'Cancelled', 'CarrierDelay', 'DayOfWeek', 'DayofMonth', 'DepDelay', 'DepTime', 'Dest', 'Distance', 'Diverted', 'FlightNum', 'LateAircraftDelay', 'Month', 'NASDelay', 'Origin', 'SecurityDelay', 'TailNum', 'TaxiIn', 'TaxiOut', 'UniqueCarrier', 'WeatherDelay', 'Year']
DataFrame memory usage: 203.77 MB

Carriers data file size: 0.00 GB
Available memory: 7.79 GB
Loading full dataset...
Trying latin-1 encoding...
✓ Carriers data loaded successfully with latin-1: (1491, 2)
Columns: ['Code', 'Description']
DataFrame memory usage: 0

In [None]:
# Optional cell: list the available sample-size presets, then load the smallest one
print(
    "=== SAMPLE SIZE OPTIONS ===",
    "You can reload the airline data with different sample sizes:",
    "- smart_load_csv(airline_file_path, 'Airline data', 'quick')     # 10K rows",
    "- smart_load_csv(airline_file_path, 'Airline data', 'dev')       # 100K rows",
    "- smart_load_csv(airline_file_path, 'Airline data', 'analysis')  # 500K rows",
    "- smart_load_csv(airline_file_path, 'Airline data', 50000)       # Custom: 50K rows",
    sep="\n",
)

# Example: the 'quick' preset (10,000 rows) for rapid testing
print("\n=== LOADING QUICK SAMPLE FOR TESTING ===")
df_airline_quick = smart_load_csv(airline_file_path, "Airline data (Quick)", 'quick')

if df_airline_quick is not None:
    # Shape, columns and in-memory size of the sample
    print(
        "\nQuick sample summary:",
        f"Shape: {df_airline_quick.shape}",
        f"Columns: {list(df_airline_quick.columns)}",
        f"Memory usage: {df_airline_quick.memory_usage(deep=True).sum() / (1024**2):.2f} MB",
        sep="\n",
    )

    # Total count of null cells and of fully duplicated rows
    print(
        "\nData quality check:",
        f"Missing values: {df_airline_quick.isnull().sum().sum()}",
        f"Duplicate rows: {df_airline_quick.duplicated().sum()}",
        sep="\n",
    )

    # First three rows
    print("\nFirst few rows:", df_airline_quick.head(3), sep="\n")

In [16]:
# Data Analysis for both datasets
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_dataset(df, dataset_name):
    """Print an exploratory report for ``df``.

    The report covers shape and memory footprint, ``df.info()``, dtype counts,
    per-column missing-value counts/percentages, ``df.describe()`` and, for
    object-dtype columns, the number of unique values (listing them when there
    are at most 20).

    Args:
        df: The DataFrame to summarise. It is not modified.
        dataset_name: Label used (upper-cased) in the section headers.

    Returns:
        None. All results are printed.
    """
    print(f"=== {dataset_name.upper()} ANALYSIS ===")
    print(f"Dataset shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    print(f"\n=== {dataset_name.upper()} COLUMN INFORMATION ===")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would add a stray "None" line.
    df.info()

    print(f"\n=== {dataset_name.upper()} DATA TYPES ===")
    print(df.dtypes.value_counts())

    print(f"\n=== {dataset_name.upper()} MISSING VALUES ===")
    missing_values = df.isnull().sum()
    missing_percent = (missing_values / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing Count': missing_values,
        'Missing Percentage': missing_percent
    }).sort_values('Missing Percentage', ascending=False)
    # Only list columns that actually have gaps
    missing_summary = missing_df[missing_df['Missing Count'] > 0]
    if len(missing_summary) > 0:
        print(missing_summary)
    else:
        print("No missing values found!")

    print(f"\n=== {dataset_name.upper()} DESCRIPTIVE STATISTICS ===")
    print(df.describe())

    # Show unique values for categorical columns (if not too many)
    categorical_cols = df.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        print(f"\n=== {dataset_name.upper()} CATEGORICAL COLUMNS ===")
        for col in categorical_cols:
            unique_count = df[col].nunique()
            print(f"'{col}': {unique_count} unique values")
            if unique_count <= 20:  # Show unique values if not too many
                print(f"  Values: {list(df[col].unique())}")

    print("\n" + "="*50 + "\n")

# Run the report on each frame that loaded (a failed load leaves the name bound to None)
if df_airline is not None:
    analyze_dataset(df_airline, "AIRLINE")

if df_carriers is not None:
    analyze_dataset(df_carriers, "CARRIERS")

# With both frames present, look for a way to relate them
both_loaded = df_airline is not None and df_carriers is not None
if both_loaded:
    print("=== DATASET RELATIONSHIP ANALYSIS ===")

    # Column names shared by the two frames
    airline_cols = set(df_airline.columns)
    carriers_cols = set(df_carriers.columns)
    common_cols = airline_cols & carriers_cols

    print(f"Common columns: {list(common_cols)}")

    # A 'Code' (or 'code') column in carriers suggests it is a lookup table;
    # 'Code' takes precedence when both spellings exist.
    code_candidates = [name for name in ('Code', 'code') if name in df_carriers.columns]
    if code_candidates:
        carrier_code_col = code_candidates[0]
        print(f"Carriers has '{carrier_code_col}' column - can be used as lookup table")

        # Airline columns whose name mentions "carrier" or "code" (case-insensitive)
        potential_carrier_cols = [
            col
            for col in df_airline.columns
            if any(token in col.lower() for token in ('carrier', 'code'))
        ]
        if potential_carrier_cols:
            print(f"Potential carrier code columns in airline data: {potential_carrier_cols}")

    print(f"Airline dataset: {df_airline.shape[0]:,} rows")
    print(f"Carriers dataset: {df_carriers.shape[0]:,} rows")

=== AIRLINE ANALYSIS ===
Dataset shape: (500000, 29)
Memory usage: 203.77 MB

=== AIRLINE COLUMN INFORMATION ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 29 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ActualElapsedTime  489361 non-null  float64
 1   AirTime            340591 non-null  float64
 2   ArrDelay           489361 non-null  float64
 3   ArrTime            489370 non-null  float64
 4   CRSArrTime         500000 non-null  int64  
 5   CRSDepTime         500000 non-null  int64  
 6   CRSElapsedTime     499894 non-null  float64
 7   CancellationCode   2948 non-null    object 
 8   Cancelled          500000 non-null  int64  
 9   CarrierDelay       138091 non-null  float64
 10  DayOfWeek          500000 non-null  int64  
 11  DayofMonth         500000 non-null  int64  
 12  DepDelay           490539 non-null  float64
 13  DepTime            490539 non-null  

In [17]:
# Data Cleaning for both datasets
print("=== DATA CLEANING ===")

def clean_dataset(df, dataset_name):
    print(f"\n=== CLEANING {dataset_name.upper()} DATASET ===")
    
    # Create a copy for cleaning
    df_clean = df.copy()
    original_shape = df_clean.shape
    
    print(f"Starting with {original_shape[0]} rows and {original_shape[1]} columns")
    
    # 1. Remove completely empty rows
    empty_rows_before = df_clean.isnull().all(axis=1).sum()
    df_clean = df_clean.dropna(how='all')
    print(f"Removed {empty_rows_before} completely empty rows")
    
    # 2. Remove duplicate rows
    duplicates_before = df_clean.duplicated().sum()
    df_clean = df_clean.drop_duplicates()
    print(f"Removed {duplicates_before} duplicate rows")
    
    # 3. Handle missing values for each column type
    print(f"\n=== HANDLING MISSING VALUES FOR {dataset_name.upper()} ===")
    
    # Identify column types
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    categorical_cols = df_clean.select_dtypes(include=['object']).columns
    datetime_cols = df_clean.select_dtypes(include=['datetime64']).columns
    
    print(f"Numeric columns: {len(numeric_cols)}")
    print(f"Categorical columns: {len(categorical_cols)}")
    print(f"Datetime columns: {len(datetime_cols)}")
    
    # Handle missing values in numeric columns
    for col in numeric_cols:
        missing_count = df_clean[col].isnull().sum()
        if missing_count > 0:
            # Fill with median for numeric columns
            median_val = df_clean[col].median()
            df_clean[col].fillna(median_val, inplace=True)
            print(f"Filled {missing_count} missing values in '{col}' with median: {median_val}")
    
    # Handle missing values in categorical columns
    for col in categorical_cols:
        missing_count = df_clean[col].isnull().sum()
        if missing_count > 0:
            # Fill with mode (most frequent value) for categorical columns
            mode_val = df_clean[col].mode()
            if len(mode_val) > 0:
                df_clean[col].fillna(mode_val[0], inplace=True)
                print(f"Filled {missing_count} missing values in '{col}' with mode: '{mode_val[0]}'")
            else:
                df_clean[col].fillna('Unknown', inplace=True)
                print(f"Filled {missing_count} missing values in '{col}' with 'Unknown'")
    
    # 4. Handle outliers (for numeric columns)
    print(f"\n=== OUTLIER DETECTION FOR {dataset_name.upper()} ===")
    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        outliers = ((df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)).sum()
        if outliers > 0:
            print(f"Column '{col}': {outliers} outliers detected (bounds: {lower_bound:.2f} to {upper_bound:.2f})")
    
    # 5. Final summary
    final_shape = df_clean.shape
    print(f"\n=== {dataset_name.upper()} CLEANING SUMMARY ===")
    print(f"Original shape: {original_shape}")
    print(f"Final shape: {final_shape}")
    print(f"Rows removed: {original_shape[0] - final_shape[0]}")
    print(f"Remaining missing values: {df_clean.isnull().sum().sum()}")
    
    return df_clean

# Clean whichever datasets loaded; a failed load leaves the name bound to None
airline_loaded = df_airline is not None
carriers_loaded = df_carriers is not None

if airline_loaded:
    df_airline_clean = clean_dataset(df_airline, "AIRLINE")
    print("Clean airline dataset available as 'df_airline_clean'")
else:
    print("Airline dataset not available for cleaning")

if carriers_loaded:
    df_carriers_clean = clean_dataset(df_carriers, "CARRIERS")
    print("Clean carriers dataset available as 'df_carriers_clean'")
else:
    print("Carriers dataset not available for cleaning")

# Closing summary: report the shape of every cleaned frame that exists
print("\n=== OVERALL CLEANING COMPLETED ===")
if not (airline_loaded or carriers_loaded):
    print("No datasets were available for cleaning")
elif airline_loaded and carriers_loaded:
    print("Both datasets cleaned successfully!")
    print(f"- df_airline_clean: {df_airline_clean.shape}")
    print(f"- df_carriers_clean: {df_carriers_clean.shape}")
elif airline_loaded:
    print(f"Airline dataset cleaned: {df_airline_clean.shape}")
else:
    print(f"Carriers dataset cleaned: {df_carriers_clean.shape}")

=== DATA CLEANING ===

=== CLEANING AIRLINE DATASET ===
Starting with 500000 rows and 29 columns
Removed 0 completely empty rows
Removed 0 duplicate rows

=== HANDLING MISSING VALUES FOR AIRLINE ===
Numeric columns: 24
Categorical columns: 5
Datetime columns: 0
Filled 10639 missing values in 'ActualElapsedTime' with median: 101.0
Filled 159409 missing values in 'AirTime' with median: 84.0
Filled 10639 missing values in 'ArrDelay' with median: 0.0
Filled 10630 missing values in 'ArrTime' with median: 1522.0
Filled 106 missing values in 'CRSElapsedTime' with median: 102.0
Filled 361909 missing values in 'CarrierDelay' with median: 0.0
Filled 9461 missing values in 'DepDelay' with median: 0.0
Filled 9461 missing values in 'DepTime' with median: 1333.0
Filled 848 missing values in 'Distance' with median: 544.0
Filled 361909 missing values in 'LateAircraftDelay' with median: 0.0
Filled 361909 missing values in 'NASDelay' with median: 0.0
Filled 361909 missing values in 'SecurityDelay' with 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

Filled 151751 missing values in 'TaxiIn' with median: 5.0
Filled 151697 missing values in 'TaxiOut' with median: 13.0
Filled 361909 missing values in 'WeatherDelay' with median: 0.0
Filled 497052 missing values in 'CancellationCode' with mode: 'A'
Filled 151691 missing values in 'TailNum' with mode: 'UNKNOW'

=== OUTLIER DETECTION FOR AIRLINE ===
Column 'ActualElapsedTime': 21496 outliers detected (bounds: -50.00 to 270.00)
Column 'AirTime': 49395 outliers detected (bounds: 4.50 to 168.50)
Column 'ArrDelay': 40704 outliers detected (bounds: -34.00 to 38.00)
Column 'CRSElapsedTime': 20878 outliers detected (bounds: -50.50 to 273.50)
Column 'Cancelled': 9465 outliers detected (bounds: 0.00 to 0.00)
Column 'CarrierDelay': 15060 outliers detected (bounds: 0.00 to 0.00)
Column 'DepDelay': 67224 outliers detected (bounds: -14.00 to 18.00)
Column 'Distance': 23814 outliers detected (bounds: -636.50 to 1879.50)
Column 'Diverted': 1174 outliers detected (bounds: 0.00 to 0.00)
Column 'FlightNum'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[col].fillna(mode_val[0], inplace=True)


In [20]:
# Display the first rows of the cleaned airline frame (the cell's last expression is rendered as its output).
df_airline_clean.head()

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,Month,NASDelay,Origin,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,Year
0,53.0,32.0,-8.0,1642.0,1650,1545,65.0,A,0,0.0,...,10,0.0,DCA,0.0,N443US,7.0,14.0,US,0.0,2002
1,164.0,155.0,-11.0,1754.0,1805,1610,175.0,A,0,0.0,...,12,0.0,MCO,0.0,N755,2.0,7.0,WN,0.0,1999
2,60.0,84.0,15.0,2005.0,1950,1850,60.0,A,0,0.0,...,12,0.0,ATL,0.0,UNKNOW,5.0,13.0,DL,0.0,1993
3,51.0,84.0,-5.0,1818.0,1823,1728,55.0,A,0,0.0,...,9,0.0,MEM,0.0,UNKNOW,5.0,13.0,AA,0.0,1989
4,45.0,29.0,2.0,1120.0,1118,1030,48.0,A,0,0.0,...,6,0.0,CVG,0.0,N785CA,3.0,13.0,OH,0.0,2006


In [21]:
# Display the first rows of the cleaned carriers frame (the cell's last expression is rendered as its output).
df_carriers_clean.head()

Unnamed: 0,Code,Description
0,02Q,Titan Airways
1,04Q,Tradewind Aviation
2,05Q,"Comlux Aviation, AG"
3,06Q,Master Top Linhas Aereas Ltd.
4,07Q,Flair Airlines Ltd.
