# Airline Data Cleaning
This notebook cleans and standardizes the airline data, including:
- Consolidating Callsign information into one column
- Standardizing fleet size information
- Cleaning hub and base information
- Consolidating IATA codes

In [3]:
# Import required libraries
import pandas as pd
import numpy as np
from datetime import datetime
import logging
# Read the CSV file
df = pd.read_csv('filepath')
print("Original data shape:", df.shape)

Original data shape: (10, 28)


In [10]:
class AirlineDataCleaner:
   def __init__(self, df):
       self.df = df.copy()
       logging.basicConfig(level=logging.INFO)
   
   def standardize_date(self, date_str):
       """
       Standardizes dates to DD/MM/YYYY format.
       Handles various input formats including:
       - DD/MM/YYYY
       - DD MMM YYYY
       - MMM-YY
       """
       if pd.isna(date_str):
           return pd.NA
       
       date_str = str(date_str).strip()
       try:
           # Handle "Dec-57" format
           if '-' in date_str:
               month_year = date_str.split('-')
               if len(month_year) == 2:
                   month = month_year[0].strip()
                   year = month_year[1].strip()
                   # Convert two-digit year to four digits
                   if len(year) == 2:
                       year = '19' + year if int(year) >= 30 else '20' + year
                   # Convert month name to number using datetime
                   try:
                       datetime_obj = datetime.strptime(month, '%b')
                       month_num = datetime_obj.month
                       return f"01/{month_num:02d}/{year}"
                   except ValueError:
                       logging.warning(f"Could not parse month: {month}")
                       return date_str
           
           # Try different date formats
           for fmt in ['%d/%m/%Y', '%d %b %Y', '%d %B %Y']:
               try:
                   return datetime.strptime(date_str, fmt).strftime('%d/%m/%Y')
               except ValueError:
                   continue
           
           raise ValueError(f"Could not parse date: {date_str}")
           
       except Exception as e:
           logging.warning(f"Could not standardize date: {date_str}. Error: {e}")
           return date_str

   def extract_callsign(self):
       """
       Extracts and consolidates callsign information into a single column
       """
       self.df['Callsign'] = None
       
       # Find columns with 'Callsign' in them
       for column in self.df.columns:
           mask = self.df[column].astype(str).str.contains('Callsign', na=False)
           if mask.any():
               self.df.loc[mask, 'Callsign'] = (
                   self.df.loc[mask, column]
                   .str.extract(r'Callsign(.*)', expand=False)
                   .str.strip()
               )
               # Clear the original column after extraction
               self.df.loc[mask, column] = None
               
       # Remove empty columns that previously contained callsigns
       for col in self.df.columns:
           if self.df[col].isna().all():
               self.df.drop(col, axis=1, inplace=True)
       
       return self
   
   def clean_fleet_size(self):
       """
       Standardizes fleet size format and extracts ordered aircraft
       """
       def extract_fleet_numbers(value):
           if pd.isna(value):
               return pd.NA, pd.NA
           
           fleet = pd.to_numeric(str(value).split('Aircraft')[0].strip(), errors='coerce')
           ordered = pd.NA
           if '(' in str(value):
               ordered = pd.to_numeric(
                   str(value).split('(')[1].split(')')[0]
                   .replace('+ ', '').replace(' On Order/Planned', '')
                   .strip(), 
                   errors='coerce'
               )
           return fleet, ordered
           
       if 'Fleet Size' in self.df.columns:
           fleet_data = self.df['Fleet Size'].apply(extract_fleet_numbers)
           self.df['Current Fleet Size'] = fleet_data.apply(lambda x: x[0])
           self.df['Aircraft On Order'] = fleet_data.apply(lambda x: x[1])
           self.df.drop('Fleet Size', axis=1, inplace=True)
       return self
   
   def clean_hubs_bases(self):
       """
       Consolidates hub information
       """
       hub_columns = ['Main Hub', 'Base / Main Hub', 'Bases']
       self.df['Hubs'] = None
       
       for col in hub_columns:
           if col in self.df.columns:
               mask = self.df['Hubs'].isna() & self.df[col].notna()
               self.df.loc[mask, 'Hubs'] = self.df.loc[mask, col]
               self.df.drop(col, axis=1, inplace=True)
       return self

   def clean_dates(self):
       """
       Clean and standardize date fields
       """
       date_columns = ['Founded', 'Started Operations']
       for col in date_columns:
           if col in self.df.columns:
               self.df[col] = self.df[col].apply(self.standardize_date)
               # Print the dates for verification
               print(f"\nStandardized {col} dates:")
               print(self.df[['Airline Name', col]].to_string())
       return self
   
   def clean_all(self):
       """
       Applies all cleaning steps
       """
       return (self
               .extract_callsign()
               .clean_fleet_size()
               .clean_hubs_bases()
               .clean_dates()
               .df)

# Example usage
if __name__ == "__main__":
   # Read the CSV file
   df = pd.read_csv('filepath')
   
   # Clean the data
   cleaner = AirlineDataCleaner(df)
   cleaned_df = cleaner.clean_all()
   
   # Display results
   print("\nCleaned Data Sample:")
   print(cleaned_df[['Airline Name', 'Callsign', 'Founded', 'Started Operations', 'Current Fleet Size', 'Aircraft On Order', 'Hubs']].to_string())
   
   # Save cleaned data
   cleaned_df.to_csv('cleaned_airline_data.csv', index=False)
   print("\nCleaned data saved to 'cleaned_airline_data.csv'")




Standardized Founded dates:
                    Airline Name     Founded
0                  Qatar Airways  22/11/1993
1             Singapore Airlines  01/05/1947
2              Emirates Airlines  25/03/1985
3             All Nippon Airways  27/12/1952
4         Cathay Pacific Airways  24/09/1946
5                 Japan Airlines  01/08/1951
6               Turkish Airlines  20/05/1933
7                        EVA Air  08/03/1989
8                     Air France  07/10/1933
9  Swiss International Air Lines  31/03/2002

Standardized Started Operations dates:
                    Airline Name Started Operations
0                  Qatar Airways         20/01/1994
1             Singapore Airlines               <NA>
2              Emirates Airlines         25/10/1985
3             All Nippon Airways           Dec 1957
4         Cathay Pacific Airways               <NA>
5                 Japan Airlines         25/10/1951
6               Turkish Airlines               <NA>
7                   