In [None]:
import pandas as pd
import os
import datetime
from datetime import timedelta, time

In [None]:
# Input directory holding the pre-processed CSV files. The trailing slash is
# required because input paths are built by plain string concatenation.
dir_path = 'Data_preprocessings/'

# Collect the entries to process. os.listdir() also returns sub-directories
# (for example Jupyter's ".ipynb_checkpoints"), which cannot be read as CSV,
# so keep regular files only. Sorting gives a stable processing order.
file_names = sorted(
    name for name in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, name))
)

# Output directory for the cleaned files; created if it does not exist yet.
folderPath = 'Data_clears'
os.makedirs(folderPath, exist_ok=True)
    
# Column names assigned to each input CSV, in the order the columns appear
# in the file (passed to read_csv as `names`).
col_name = [
    "airline_code",
    "Date (MM/DD/YYYY)",
    "flight_num",
    "tail_num",
    "dest_airport",
    "depa_airport",
    "sche_arriv_time",
    "actual_arriv_time",
    "sche_duration",
    "actu_duration",
    "delay_depa_time",
    "Wheels-off time",
    "Taxi-Out time (Minutes)",
    "delay_carrier_time",
    "delay_weather_time",
    "delay_NAS_time",
    "delay_security_time",
    "delay_late_arrival_time",
]

# Column order of the cleaned output, assembled from logical groups.
new_columns = (
    # Flight identification and route
    ["airline_code", "flight_num", "tail_num", "depa_airport", "dest_airport"]
    # Departure date
    + ["depa_year", "depa_month", "depa_date"]
    # Scheduled and actual departure clock time
    + ["sche_depa_hr", "sche_depa_min", "actu_depa_hr", "actu_depa_min"]
    # Arrival date and clock time
    + ["dest_year", "dest_month", "dest_date", "dest_hr", "dest_min"]
    # Durations
    + ["sche_duration", "actu_duration"]
    # Delay totals
    + ["delay_total_time", "delay_depa_time", "delay_dest_time"]
    # Delay breakdown by cause
    + [
        "delay_carrier_time",
        "delay_weather_time",
        "delay_NAS_time",
        "delay_security_time",
        "delay_late_arrival_time",
    ]
)

# Clean every input file and write the result, under the same file name, into
# folderPath. header=0 makes read_csv discard the file's own header row and
# use col_name as the column names instead.
for file_name in file_names:  # distinct name so the list itself is not shadowed

    df = pd.read_csv(f'{dir_path}{file_name}', names=col_name, header=0,
                     dtype={"flight_num": int})

    # Flight numbers are identifiers, not quantities: store them as strings.
    df["flight_num"] = df["flight_num"].astype(str)

    # Wheels-off and taxi-out times are not part of the cleaned output.
    df = df.drop(["Wheels-off time", "Taxi-Out time (Minutes)"], axis=1)

    # Remove the leading "N" from tail numbers (lstrip removes every leading
    # "N" character, not just the first one).
    df["tail_num"] = df["tail_num"].str.lstrip("N")

    # Rewrite an actual time of "24:00" as "00:00" so it is a valid clock time.
    # NOTE(review): the date is not advanced for these rows -- confirm that
    # "24:00" is meant to be midnight at the start of the recorded day.
    df["actual_arriv_time"] = df["actual_arriv_time"].str.replace(
        "24:00", "00:00", regex=False)

    # Split the date into month / day / year and the two clock times into
    # hour / minute. Despite their names, sche_arriv_time and
    # actual_arriv_time are used below as the scheduled and actual
    # *departure* times.
    df[["depa_month", "depa_date", "depa_year"]] = (
        df["Date (MM/DD/YYYY)"].str.split(pat="/", expand=True))
    df[["sche_depa_hr", "sche_depa_min"]] = (
        df["sche_arriv_time"].str.split(pat=":", expand=True))
    df[["actu_depa_hr", "actu_depa_min"]] = (
        df["actual_arriv_time"].str.split(pat=":", expand=True))

    # Actual arrival = departure date + actual departure time + actual
    # duration in minutes. Computed on whole columns instead of assigning
    # row by row through df[col][i] = ..., which is chained assignment and
    # does not update df when pandas Copy-on-Write is active.
    departure_parts = pd.DataFrame({
        "year": df["depa_year"],
        "month": df["depa_month"],
        "day": df["depa_date"],
        "hour": df["actu_depa_hr"],
        "minute": df["actu_depa_min"],
    }).apply(pd.to_numeric)
    arrival = (pd.to_datetime(departure_parts)
               + pd.to_timedelta(df["actu_duration"], unit="min"))

    # Nullable Int64 keeps the values integers even if a row has no computable
    # arrival (missing time or duration), which is then written as empty.
    arrival_fields = {
        "dest_year": arrival.dt.year,
        "dest_month": arrival.dt.month,
        "dest_date": arrival.dt.day,
        "dest_hr": arrival.dt.hour,
        "dest_min": arrival.dt.minute,
    }
    for column, values in arrival_fields.items():
        df[column] = values.astype("Int64")

    # Arrival delay = actual duration - scheduled duration.
    df["delay_dest_time"] = df["actu_duration"] - df["sche_duration"]
    # Total delay = departure delay + arrival delay.
    df["delay_total_time"] = df["delay_depa_time"] + df["delay_dest_time"]

    # The raw date and clock-time columns have been split; drop them.
    df = df.drop(["Date (MM/DD/YYYY)",
                  "sche_arriv_time",
                  "actual_arriv_time"], axis=1)

    # Put the columns into the output order.
    df = df.reindex(columns=new_columns)

    # Write the cleaned file without the index column.
    # NOTE(review): the original comment said the header should be omitted as
    # well, but the header row is written -- confirm which is intended.
    df.to_csv(f'{folderPath}/{file_name}', index=False)
