# M03A

M03A資料可作為統計通過門架ID對應的**通過量**  
可以分析路段的道路服務水準

## Setup

In [None]:
import os
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import subprocess
import shutil
import tarfile

In [6]:
def create_folder(folder_name):
    """建立資料夾"""
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    return os.path.abspath(folder_name)

def delete_folders(deletelist):
    """
    刪除資料夾
    deletelist(list):需要為皆為路徑的list
    """
    for folder_name in deletelist: 
        if os.path.exists(folder_name): # 檢查資料夾是否存在
            shutil.rmtree(folder_name) # 刪除資料夾及其內容
        else:
            print(f"資料夾 '{folder_name}' 不存在。")

def getdatelist(time1, time2):
    '''
    建立日期清單
    time1、time2(str):為%Y-%M-%D格式的日期字串
    '''
    if time1 > time2:
        starttime = time2
        endtime = time1
    else:
        starttime = time1
        endtime = time2

    date_range = pd.date_range(start=starttime, end=endtime)
    datelist = [d.strftime("%Y%m%d") for d in date_range]
    return datelist

def delete_folders_permanently(deletelist):
    """
    永久刪除資料夾及其內容，不放入資源回收筒
    deletelist (list): 需要刪除的資料夾路徑列表
    """
    for item in deletelist:
        if os.path.isdir(item):  # 檢查是否為資料夾
            try:
                shutil.rmtree(item)  # 永久刪除資料夾
                print(f"已永久刪除資料夾： {item}")
            except OSError as e:
                print(f"刪除資料夾 {item} 時發生錯誤： {e}")
        elif os.path.isfile(item):  # 檢查是否為檔案
            try:
                os.remove(item)  # 永久刪除檔案
                print(f"已永久刪除檔案： {item}")
            except OSError as e:
                print(f"刪除檔案 {item} 時發生錯誤： {e}")
        else:
            print(f"{item} 不是檔案或資料夾。")

def extract_tar_gz(tar_gz_file, extract_path):
    try:
        with tarfile.open(tar_gz_file, 'r:gz') as tar:
            tar.extractall(path=extract_path)
    except Exception as e:
        print(f"解壓縮 {tar_gz_file} 失敗：{e}")

def download_and_extract(url, datatype, date, downloadfolder, keep = False):
    '''針對高公局交通資料庫的格式進行下載'''
    downloadurl = f"{url}/{datatype}_{date}.tar.gz"
    destfile = os.path.join(downloadfolder, f"{datatype}_{date}.tar.gz")

    response = requests.get(downloadurl)
    with open(destfile, 'wb') as file:
        file.write(response.content)

    extractpath = create_folder(os.path.join(downloadfolder, date))
    extract_tar_gz(destfile, extractpath)
    if keep == False:
        os.remove(destfile)

    return extractpath

def findfiles(filefolderpath, filetype='.csv'):
    """
    尋找指定路徑下指定類型的檔案，並返回檔案路徑列表。

    Args:
        filefolderpath (str): 指定的檔案路徑。
        filetype (str, optional): 要尋找的檔案類型，預設為 '.csv'。

    Returns:
        list: 包含所有符合條件的檔案路徑的列表。
    """

    filelist = []  # 建立一個空列表來儲存檔案路徑

    # 使用 os.walk 遍歷資料夾及其子資料夾
    for root, _, files in os.walk(filefolderpath):
        for file in files:
            if file.endswith(filetype):  # 檢查檔案是否以指定類型結尾
                file_path = os.path.join(root, file)  # 建立完整的檔案路徑
                filelist.append(file_path)  # 將檔案路徑添加到列表中

    return filelist

def combinefile(filelist, datatype='M03A'):
    """
    更有效率地合併多個CSV檔案。

    Args:
        filelist (list): 包含CSV檔案路徑的列表。
        datatype (str, optional): 資料類型，決定欄位名稱。預設為 'M03A'。

    Returns:
        pandas.DataFrame: 合併後的DataFrame。
    """

    # 使用字典來映射資料類型和欄位名稱，避免重複的 if/elif 判斷
    column_mapping = {
        'M03A': ['TimeStamp', 'GantryID', 'Direction', 'VehicleType', 'Volume'],
        'M04A': ['TimeStamp', 'GantryFrom', 'GantryTo', 'VehicleType', 'TravelTime', 'Volume'],
        'M05A': ['TimeStamp', 'GantryFrom', 'GantryTo', 'VehicleType', 'Speed', 'Volume'],
        'M06A': ['VehicleType', 'DetectionTimeO', 'GantryO', 'DetectionTimeD', 'GantryD', 'TripLength', 'TripEnd', 'TripInformation'],
        'M07A': ['TimeStamp', 'GantryO', 'VehicleType', 'AverageTripLength', 'Volume'],
        'M08A': ['TimeStamp', 'GantryO', 'GantryD', 'VehicleType', 'Trips']
    }

    columns = column_mapping.get(datatype)  # 使用 get() 方法，如果找不到鍵，會返回 None
    if columns is None:
        raise ValueError(f"未知的資料類型：{datatype}")

    combineddf = pd.concat(
        (pd.read_csv(i, header=None, names=columns) for i in filelist),  # 使用生成器表達式
        ignore_index=True  # 避免重複的索引
    )

    return combineddf

In [None]:
'''
待改進
def combinefile(filelist, datatype = 'M03A'):
    if datatype == 'M03A':
        columns = ['TimeStamp', 'GantryID', 'Direction', 'VehicleType', 'Volume']
    elif datatype == 'M04A':
        columns = ['TimeStamp', 'GantryFrom', 'GantryTo', 'VehicleType', 'TravelTime', 'Volume']
    elif datatype == 'M05A':
        columns = ['TimeStamp', 'GantryFrom', 'GantryTo', 'VehicleType', 'Speed', 'Volume']
    elif datatype == 'M06A':
        columns = ['VehicleType', 'DetectionTimeO', 'GantryO',  'DetectionTimeD', 'GantryD', 'TripLength', 'TripEnd', 'TripInformation']
    elif datatype == 'M07A':
        columns = ['TimeStamp', 'GantryO', 'VehicleType', 'AverageTripLength', 'Volume']
    elif datatype == 'M08A':
        columns = ['TimeStamp',	'GantryO', 'GantryD', 'VehicleType', 'Trips']
    
    combineddf = []
    for i in filelist:
        df = pd.read_csv(i, header=None)
        df.columns = columns
        combineddf.append(df)
    combineddf = pd.concat(combineddf)

    return combineddf
'''

## 需要調整的參數

In [3]:
# ===== Step 0: 手動需要調整的參數 =====

# 需要調整的項目有2個
# 1. 調整需要確認下載的資料型態是什麼
datatype = "M03A"  # Data type (e.g., M03A, M06A, M05A) 

# 2. 調整下載的資料區間
starttime = "2024-07-16"
endtime = "2024-07-19"
datelist = getdatelist(endtime,starttime) # 下載的時間區間清單

# 建立後續要處理儲存資料的資料夾位置
savelocation = create_folder(os.path.join(os.getcwd(), datatype))
rawdatafolder = create_folder(os.path.join(savelocation, '0_rawdata'))
mergefolder = create_folder(os.path.join(savelocation, '1_merge'))
excelfolder = create_folder(os.path.join(savelocation, '2_excel'))
basicurl = "https://tisvcloud.freeway.gov.tw/history/TDCS/"
url = basicurl + datatype

In [12]:
# delete_folders([savelocation])

# delete_folders_permanently([savelocation])

## 程式執行

In [28]:
# 1. 下載並解壓縮
date = datelist[0]
dowloadfilefolder = download_and_extract(url = url, datatype = datatype, date = date, downloadfolder = rawdatafolder)

# 2. 合併
filelist = findfiles(filefolderpath=dowloadfilefolder, filetype='.csv')
df = combinefile(filelist=filelist, datatype=datatype)
mergeoutputfolder = create_folder(os.path.join(mergefolder, date)) # 建立相同日期的資料夾進行處理
df.to_csv(os.path.join(mergeoutputfolder, f'{date}.csv') , index = False) # 輸出整併過的csv
delete_folders([dowloadfilefolder]) #回頭刪除解壓縮過的資料

# 3. 處理



In [24]:
df

Unnamed: 0,TimeStamp,GantryID,Direction,VehicleType,Volume
0,2024-07-16 03:55,01F0005N,N,31,10
1,2024-07-16 03:55,01F0005N,N,32,13
2,2024-07-16 03:55,01F0005N,N,41,0
3,2024-07-16 03:55,01F0005N,N,42,0
4,2024-07-16 03:55,01F0005N,N,5,1
...,...,...,...,...,...
1690,2024-07-16 22:50,05FR143N,N,31,4
1691,2024-07-16 22:50,05FR143N,N,32,0
1692,2024-07-16 22:50,05FR143N,N,41,0
1693,2024-07-16 22:50,05FR143N,N,42,0


In [18]:
df = pd.read_csv(filelist[0], header=['TimeStamp', 'GantryID', 'Direction', 'VehicleType', 'Volume'])
df.head()

ValueError: header must be integer or list of integers

In [None]:
destfile

In [None]:
del destfile

In [None]:
os.getcwd()

In [None]:
downloadurl

In [None]:
import os
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import subprocess
import shutil

######################## Step 0: Basic Settings ############################
location = os.getcwd()
data_type = "M06A"  # Data type (e.g., M03A, M06A, M05A)
savelocation = os.path.join(location, data_type)
os.makedirs(savelocation, exist_ok=True)

# Subfolder for raw data
savelocation_origin = os.path.join(location, data_type, "0_rawdata")
os.makedirs(savelocation_origin, exist_ok=True)

# Subfolder for merged data
savelocation_merge = os.path.join(location, data_type, "1_merge")
os.makedirs(savelocation_merge, exist_ok=True)

# Subfolder for Excel files
savelocation_excel = os.path.join(location, data_type, "2_excel")
os.makedirs(savelocation_excel, exist_ok=True)

url = "https://tisvcloud.freeway.gov.tw/history/TDCS/"
urllocation = url + data_type

print(f"Main folder: {savelocation}")
print(f"Raw data folder: {savelocation_origin}")
print(f"Merged data folder: {savelocation_merge}")
print(f"Excel files folder: {savelocation_excel}")
print(f"Download URL: {urllocation}")

######################## Step 1: Time Range ########################
Start_Time = "2024-08-01"
End_Time = "2024-08-01"
date_range = pd.date_range(start=Start_Time, end=End_Time)
date = [d.strftime("%Y%m%d") for d in date_range]

######################## Step 2: Download M06A and Organized ########################

# Process to excel
def M06A_tohour(df):
    df.columns = ['VehicleType', 'DetectionTimeO', 'GantryO', 'DetectionTimeD', 'GantryD', 'TripLength', 'TripEnd']
    df = df[df['TripEnd'] == 'Y']
    df['DetectionTimeO'] = pd.to_datetime(df['DetectionTimeO'])
    df['DataHour'] = df['DetectionTimeO'].dt.hour
    df['DataDate'] = df['DetectionTimeO'].dt.date
    df_output = df.groupby(['DataDate', 'DataHour','GantryO', 'GantryD', 'VehicleType' ]).size().reset_index(name='VehicleCount')
    return df_output


for i in range(len(date)):
    os.chdir(savelocation_origin)

    # Data download
    downloadurl = f"{urllocation}/{data_type}_{date[i]}.tar.gz"
    destfile = f"{data_type}_{date[i]}.tar.gz"
    print(f"Downloading {destfile}...")
    response = requests.get(downloadurl)
    with open(destfile, 'wb') as file:
        file.write(response.content)

    # Unzip and delete compressed file
    print("Extracting files...")
    subprocess.run([r"C:\Program Files\7-Zip\7zG.exe", "x", destfile, f"-o{savelocation_origin}"])
    subprocess.run([r"C:\Program Files\7-Zip\7zG.exe", "x", destfile.replace(".gz", ""), f"-o{savelocation_origin}"])
    os.remove(destfile)
    os.remove(destfile.replace(".gz", ""))

    # Merge hourly data
    print("Merging hourly data...")
    df = pd.DataFrame(columns=['VehicleType', 'DetectionTimeO', 'GantryO', 'DetectionTimeD', 'GantryD', 'TripLength', 'TripEnd'])
    path = os.path.join(savelocation_origin, data_type, date[i])
    hour = os.listdir(path)

    for j in range(len(hour)):
        path2 = os.path.join(path, hour[j])
        files = os.listdir(path2)
        for k in range(len(files)):
            read_path = os.path.join(path2, files[k])
            # M06A = pd.read_csv(read_path, header=None, names=['VehicleType', 'DetectionTimeO', 'GantryO', 'DetectionTimeD', 'GantryD', 'TripLength', 'TripEnd','TripInformation'])
            M06A = pd.read_csv(
                    read_path,
                    header=None,
                    names=['VehicleType', 'DetectionTimeO', 'GantryO', 'DetectionTimeD', 'GantryD', 'TripLength', 'TripEnd'],
                    usecols=[0, 1, 2, 3, 4, 5, 6] 
                )
            df = pd.concat([df, M06A], ignore_index=True)
        print(f"Processing hour {j+1}/{len(hour)}", end='\r')

    export_path = os.path.join(savelocation_merge, f"{date[i]}.csv")
    df.to_csv(export_path, index=False)
    print(f"\nSaved merged data to {export_path}")


    # Organize Hourly counts
    print("Organizing vehicle types...")
    df_hour = M06A_tohour(df)

    # Save to Excel
    export_path = os.path.join(savelocation_excel, f"{date[i]}.xlsx")
    df_hour.to_excel(export_path, index=False)
    print(f"Saved Excel file to {export_path}")

    print(f"Completed processing for date: {date[i]}\n")

print("All processing completed.")
 