In [1]:
import pandas as pd
pd.set_option('display.max_columns', 200)

In [2]:
df = pd.read_csv("../data/samples/sample.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652531 entries, 0 to 652530
Data columns (total 61 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   FlightDate                               652531 non-null  object 
 1   Airline                                  652531 non-null  object 
 2   Origin                                   652531 non-null  object 
 3   Dest                                     652531 non-null  object 
 4   Cancelled                                652531 non-null  bool   
 5   Diverted                                 652531 non-null  bool   
 6   CRSDepTime                               652531 non-null  int64  
 7   DepTime                                  633490 non-null  float64
 8   DepDelayMinutes                          633476 non-null  float64
 9   DepDelay                                 633476 non-null  float64
 10  ArrTime                         

In [3]:
# I could not figure out why I can't import src modules, so I just copy pasted data_transformations.py here (ref a3b9a6a9eed2)
"""Dataframe type for typings"""

from datetime import datetime
import hashlib
import pandas as pd
from pandas import DataFrame

required = [
    "FlightDate",
    "Cancelled",
    "OriginAirportID",
    "DepTime",
    "DepDelay",
    "DestAirportID",
    "ArrTime",
    "ArrDelay",
    "AirTime",
    "Distance",
    "ActualElapsedTime",
    "Operating_Airline",
    "Tail_Number",
]


def pull_features(df: DataFrame):
    """
    Extract only the required features from the dataframe
    """
    # Check that the required columns are there
    for c in required:
        if c not in df.columns:
            raise ValueError(
                f"Dataframe lacks one or more of the required columns: {c}"
            )
    pulled_df = df.copy()
    remaining_cols = set(df.columns) - set(required)

    pulled_df.drop(list(remaining_cols), axis=1, inplace=True)

    # Fix types
    df["Tail_Number"] = df["Tail_Number"].astype("str")

    # for c in [
    #     "DepTime",
    #     "DepDelay",
    #     "ArrTime",
    #     "ArrDelay",
    # ]:
    #     df[c] = df[c].astype("int64")
    return df


def str2date(df: DataFrame) -> DataFrame:
    """Transform FlightDate column from str to date.
    Transformations occur in-place

    Args:
        df (DataFrame): Source dataframe

    Returns:
        DataFrame: Source dataframe with FlightDate mapped to datetime datatype
    """
    # Check that the column exists
    if "FlightDate" not in df.columns:
        raise ValueError(
            "FlightDate column is expected in the dataframe, but not found"
        )

    # Check datatype
    if df["FlightDate"].dtype is str:
        raise ValueError("FlightDate column's datatype is not str")

    df["FlightDate"] = df["FlightDate"].map(lambda d: datetime.strptime(d, "%Y-%m-%d"))
    return df


def encode_op_airline(df: DataFrame) -> DataFrame:
    """Encode `Operating_Airline` with onehot encoding

    Args:
        df (DataFrame): Source dataframe

    Returns:
        Source dataframe with `Operating_Airline` onehotencoded
    """
    # Check that the column exists
    if "Operating_Airline" not in df.columns:
        raise ValueError(
            "Operating_Airline column is expected in the dataframe, but not found"
        )

    # Check datatype
    if df["Operating_Airline"].dtype is str:
        raise ValueError("Operating_Airline column's datatype is not str")

    df = pd.get_dummies(df, columns=["Operating_Airline"])
    return df


def hash_tail_number(df: DataFrame) -> DataFrame:
    """Hash tail numbers

    Args:
        df (DataFrame): Source dataframe

    Returns:
        Source dataframe with `Tail_Number` hashed
    """

    # Check that the column exists
    if "Tail_Number" not in df.columns:
        raise ValueError(
            "Tail_Number column is expected in the dataframe, but not found"
        )

    # Check datatype
    if df["Tail_Number"].dtype is str:
        raise ValueError("Tail_Number column's datatype is not str")

    # Hashing with buckets
    def hash_feature(text, num_buckets=1000):
        return int(hashlib.md5(text.encode()).hexdigest(), 16) % num_buckets

    df["Tail_Number"] = df["Tail_Number"].map(hash_feature)
    return df


def sync_times(df: DataFrame) -> DataFrame:
    """
    Transform `DepTime` & `AirTime` columns to minutes

    Args:
        df (DataFrame): Source dataframe

    Returns:
        Source dataframe with `DepTime` & `AirTime` columns' time transformed to minutes
    """
    # Check that the column exists
    for c in ["DepTime", "AirTime"]:
        if c not in df.columns:
            raise ValueError(
                "[DepTime, AirTime] columns are expected in the dataframe, but not found"
            )

    # TODO idk how to check for dtypes and is it necessary
    # # Check datatype
    # for c in ["DepTime", "AirTime"]:
    #     if df[c].dtype != "int64":
    #         raise ValueError(f"`{c}` datatype is not `int64`")

    def hhmm2minutes(raw):
        hhmm = int(raw)
        strhhmm = str(hhmm).zfill(4)
        hour = int(strhhmm[:2])
        minutes = int(strhhmm[2:])

        return hour * 60 + minutes

    def avoidNaN(data):
        if not (data != data):
            return hhmm2minutes(data)
        return data

    for c in ["DepTime", "AirTime"]:
        df[c] = df[c].map(avoidNaN)

    return df


In [4]:
df = pull_features(df)
df = encode_op_airline(df)
df = hash_tail_number(df)
df = str2date(df)
df = sync_times(df) 

In [5]:
df.head(5)

Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,ArrTime,ArrDelayMinutes,AirTime,CRSElapsedTime,ActualElapsedTime,Distance,Year,Quarter,Month,DayofMonth,DayOfWeek,Marketing_Airline_Network,Operated_or_Branded_Code_Share_Partners,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,Flight_Number_Operating_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,DestCityName,DestState,DestStateFips,DestStateName,DestWac,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings,Operating_Airline_9E,Operating_Airline_AA,Operating_Airline_AS,Operating_Airline_B6,Operating_Airline_C5,Operating_Airline_DL,Operating_Airline_F9,Operating_Airline_G4,Operating_Airline_G7,Operating_Airline_HA,Operating_Airline_MQ,Operating_Airline_NK,Operating_Airline_OH,Operating_Airline_OO,Operating_Airline_PT,Operating_Airline_QX,Operating_Airline_UA,Operating_Airline_WN,Operating_Airline_YV,Operating_Airline_YX,Operating_Airline_ZW
0,2022-07-17,Alaska Airlines Inc.,JFK,PDX,False,False,1145,699.0,0.0,-6.0,1442.0,0.0,204.0,365.0,363.0,2454.0,2022,3,7,17,7,AS,AS,19930,AS,857,19930,AS,374,857,12478,1247805,31703,"New York, NY",NY,36,New York,22,14057,1405702,34057,"Portland, OR",OR,41,Oregon,92,0.0,-1.0,1100-1159,31.0,1210.0,1434.0,8.0,1450,-8.0,0.0,-1.0,1400-1459,10,0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2022-05-24,Southwest Airlines Co.,AUS,FLL,False,False,720,458.0,18.0,18.0,1107.0,7.0,95.0,160.0,149.0,1105.0,2022,2,5,24,2,WN,WN,19393,WN,1771,19393,WN,119,1771,10423,1042302,30423,"Austin, TX",TX,48,Texas,74,11697,1169706,32467,"Fort Lauderdale, FL",FL,12,Florida,33,1.0,1.0,0700-0759,11.0,749.0,1104.0,3.0,1100,7.0,0.0,0.0,1100-1159,5,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
2,2022-06-09,SkyWest Airlines Inc.,LAX,RNO,False,False,949,587.0,0.0,-2.0,1109.0,0.0,65.0,91.0,82.0,391.0,2022,2,6,9,4,DL,DL_CODESHARE,19790,DL,3501,20304,OO,368,3501,12892,1289208,32575,"Los Angeles, CA",CA,6,California,91,14570,1457002,34570,"Reno, NV",NV,32,Nevada,85,0.0,-1.0,0900-0959,10.0,957.0,1102.0,7.0,1120,-11.0,0.0,-1.0,1100-1159,2,0,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
3,2022-05-08,American Airlines Inc.,DCA,CLT,False,False,650,400.0,0.0,-10.0,821.0,0.0,64.0,93.0,101.0,331.0,2022,2,5,8,7,AA,AA,19805,AA,400,19805,AA,986,400,11278,1127805,30852,"Washington, DC",VA,51,Virginia,38,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,0.0,-1.0,0600-0659,30.0,710.0,814.0,7.0,823,-2.0,0.0,-1.0,0800-0859,2,0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,2022-06-09,JetBlue Airways,DCA,RSW,False,False,1304,801.0,17.0,17.0,1604.0,17.0,101.0,163.0,163.0,892.0,2022,2,6,9,4,B6,B6,20409,B6,481,20409,B6,10,481,11278,1127805,30852,"Washington, DC",VA,51,Virginia,38,14635,1463502,31714,"Fort Myers, FL",FL,12,Florida,33,1.0,1.0,1300-1359,16.0,1337.0,1558.0,6.0,1547,17.0,1.0,1.0,1500-1559,4,0,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652531 entries, 0 to 652530
Data columns (total 81 columns):
 #   Column                                   Non-Null Count   Dtype         
---  ------                                   --------------   -----         
 0   FlightDate                               652531 non-null  datetime64[ns]
 1   Airline                                  652531 non-null  object        
 2   Origin                                   652531 non-null  object        
 3   Dest                                     652531 non-null  object        
 4   Cancelled                                652531 non-null  bool          
 5   Diverted                                 652531 non-null  bool          
 6   CRSDepTime                               652531 non-null  int64         
 7   DepTime                                  633490 non-null  float64       
 8   DepDelayMinutes                          633476 non-null  float64       
 9   DepDelay                  