In [1]:
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from dateutil import relativedelta
import autopep8
import os
import re 
import cProfile
from typing import IO

In [2]:
'''
The goal of this script is to select a to be cleaned file . This is done by selecting a file index ranging from 0 . 
In this way , a user is able to choose a file of their choice for further cleaning and Data manipulation . 
'''

basePath = r"I:\Flight Enrichment Assignment"

files = ["CGN0322S", "CGN1025S"]

file_index = 0
if file_index < len(files):

    file = files[file_index]

    filepath = os.path.join(basePath, file)

    print(filepath)
else:
    print("This specific file is not available")

I:\Flight Enrichment Assignment\CGN1025S


In [3]:
def import_fs(filepath) -> IO[str]:
    '''
    Checks for a specific file path and fetches it into a data frame for further 
    data cleaning manipulation . 
    '''
    FS = pd.read_excel(filepath,
                       dtype={
                           "Days Of operations": str,
                           "Time": str,
                           "Aircraft Type": str
                       })
    return FS

In [4]:
def convert_timestr(fs: IO[str]) -> IO[str]:
    '''
    Corrects Time to a normal HH:MM time format and converts it a normal time representation.FS stands for flight schedule
    '''
    # Correct the Time format to HH:MM:SS

    fs["Time"] = fs["Time"].str[:2] + ":" + fs["Time"].str[2:] + ":" + "00"
    return fs

In [5]:
def calculate_timeaftermidnight(fs: IO[str]) -> IO[str]:
    '''
    Converts 'Time' in the format HH:MM:SS" to minutes after midnight.
    '''

    # Convert 'Time' column to datetime format, now compatible with both formats
    fs['Time'] = pd.to_datetime(fs['Time'])

    # Calculate minutes after midnight
    fs["time_aftermidnight"] = fs['Time'].dt.hour * 60 + fs['Time'].dt.minute
    return fs

In [6]:
def filter_stype_CJ(fs: IO[str]) -> IO[str]:
    '''
    Selects only Flight schedules whose service type is J and C. 
    '''
    # Create a boolean mask where True indicates rows where "Service type" is "J" or "C"
    mask = fs["Service type"].apply(lambda x: x == "J" or x == "C")

    # Use the mask to filter rows in FS
    return fs[mask]

In [7]:
def count_operatingweeklytime(fs: IO[str]) -> IO[str]:
    """
    Counts the number of times a flight operates on a weekly basis.
    """

    fs["Operating Times Per Week"] = fs['Days Of operations'].apply(
        lambda x: sum(1 for char in x if char != '0'))

    return fs

In [8]:
def convert_dates(fs: IO[str]) -> IO[str]:
    '''
    Converts a data range to days , this parameter comes in handy during data range weight application. 
    '''
    fs["Start Date"] = pd.to_datetime(fs["Start Date"], format="%Y%m%d")

    fs["End Date"] = pd.to_datetime(fs["End Date"], format="%Y%m%d")

    fs["Date Range"] = fs.apply(
        lambda row: row["End Date"] - row["Start Date"], axis=1).dt.days
    return fs

In [9]:
def reduce_numberofcolumns(fs: IO[str]) -> IO[str]:
    # List of columns to be selected
    columns = [
        "Airline_Code", "Leg Type", "Via", "Origin-Destination","Start Date","End Date",
        'Days Of operations', "Operating Times Per Week", "time_aftermidnight",
        "Date Range"
    ]

    fs = fs[columns]

    return fs

In [10]:
def removespaces(fs: IO[str]) -> IO[str]:
    columns_fs = [
        "Airline_Code", "Local Airport", "Leg Type", "Via",
        "Origin-Destination"
    ]
    for column in columns_fs:
        fs[column] = fs[column].apply(lambda x: x.strip())
    return fs

In [11]:
def import_flightschedule(file):
    '''
    In order to understand where script issues are rising from , I created an output of None indicating 
    weather a File Schedule is empty or not . 
    '''
    srcpath = os.path.join(basePath, "data_raw",
                           f"{file}.xlsx")  # stands for source path
    print("The source path is ", srcpath)
    dstpath = os.path.join(basePath, "data",
                           f"{file}.pkl")  # stands for destinaion path
    print("The destination path is", dstpath)

    fs = import_fs(srcpath)
    print("After ImportFS:", fs is None)

    fs = convert_timestr(fs)
    print("After Timestr:", fs is None)

    fs = calculate_timeaftermidnight(fs)
    print("After TimeafterMidnight:", fs is None)

    fs = filter_stype_CJ(fs)
    print("After Servicetype:", fs is None)

    fs = count_operatingweeklytime(fs)
    print("After aircrafttype:", fs is None)

    fs = convert_dates(fs)
    print("After daterange:", fs is None)

    fs = removespaces(fs)
    print("After all spaces have been removed:", fs is None)

    fs = reduce_numberofcolumns(fs)
    print("After newfs:", fs is None)
    '''
    #The file is saved in a pickle format on the provided file path . 
    '''

    fs.to_pickle(dstpath)

In [12]:
if __name__ == "__main__":
    import_flightschedule(file)

The source path is  I:\Flight Enrichment Assignment\data_raw\CGN1025S.xlsx
The destination path is I:\Flight Enrichment Assignment\data\CGN1025S.pkl
After ImportFS: False
After Timestr: False
After TimeafterMidnight: False
After Servicetype: False
After aircrafttype: False
After daterange: False
After all spaces have been removed: False
After newfs: False


In [13]:
cProfile.run('import_flightschedule(file)')

The source path is  I:\Flight Enrichment Assignment\data_raw\CGN1025S.xlsx
The destination path is I:\Flight Enrichment Assignment\data\CGN1025S.pkl
After ImportFS: False
After Timestr: False
After TimeafterMidnight: False
After Servicetype: False
After aircrafttype: False
After daterange: False
After all spaces have been removed: False
After newfs: False
         13367576 function calls (13366895 primitive calls) in 15.679 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.009    0.009 1240402559.py:1(reduce_numberofcolumns)
        1    0.001    0.001    0.052    0.052 1351075272.py:1(convert_timestr)
        1    0.000    0.000   13.285   13.285 2179123048.py:1(import_fs)
        1    0.000    0.000    0.021    0.021 2845491082.py:1(filter_stype_CJ)
    23997    0.004    0.000    0.004    0.000 2845491082.py:6(<lambda>)
        1    0.000    0.000    0.069    0.069 2876237673.py:1(count_oper