In [2]:
import pandas as pd
import csv

In [43]:
# Path of the CSV file that the cells below parse.
csv_path = "Full Text Info.csv"
# Alternative input file, kept commented out for quick switching:
# csv_path = 'y38_rebar.csv'

# Custom CSV parser
def custom_csv_parser(file_path, encoding='ISO-8859-1', max_columns=8):
    """Read a CSV file whose last column may contain unquoted commas.

    Every record is cut down to at most ``max_columns`` fields: any surplus
    fields are merged back into the last kept field, joined with ``', '``.
    Trailing commas and spaces are then stripped from the last field of each
    record. Blank lines in the file are skipped.

    Args:
        file_path: Path of the CSV file to read.
        encoding: Text encoding used to open the file. ISO-8859-1 maps every
            byte to a character, so decoding never fails, but multi-byte text
            comes out garbled.
            NOTE(review): the Text values printed further down this notebook
            contain mojibake, which suggests the file is not really
            ISO-8859-1 (possibly Big5/cp950) -- confirm and pass the real
            encoding if the non-ASCII text must be readable.
        max_columns: Maximum number of fields kept per record (default 8).

    Returns:
        A list of records (lists of strings), header row included when the
        file has one. Records shorter than ``max_columns`` are returned as-is.

    Raises:
        ValueError: If ``max_columns`` is smaller than 1.
    """
    if max_columns < 1:
        raise ValueError(f"max_columns must be >= 1, got {max_columns}")

    last = max_columns - 1
    rows = []
    # newline='' is what the csv module expects, so that it can do its own
    # line-ending handling (matters for quoted fields with embedded newlines).
    with open(file_path, encoding=encoding, newline='') as file:
        for row in csv.reader(file):
            if not row:
                # csv.reader yields [] for a blank line; indexing row[-1]
                # below would raise IndexError, so skip it.
                continue
            if len(row) > max_columns:
                # Surplus fields come from commas inside the last column:
                # glue them back together.
                row[last] = ', '.join(row[last:])
                row = row[:max_columns]
            row[-1] = row[-1].rstrip(', ')
            rows.append(row)
    return rows

# Column names applied to the parsed records, in field order.
columns = [
    "FileName",
    "EntityName",
    "ObjectType",
    "RotationAngle",
    "CentreCoor",
    "Height",
    "Width",
    "Text",
]

# Read the file with the custom parser.
rows = custom_csv_parser(csv_path)

# Build the DataFrame from every record after the first one
# (the first record is presumably the file's own header line).
df = pd.DataFrame(data=rows[1:], columns=columns)

In [6]:
# Print every value of the 'Text' column, one per line.
# The column is iterated directly instead of unpacking df.shape into
# `rows` / `columns`: those names hold the parsed record list and the
# column-name list used by the loading cell, and overwriting them with
# integers made that cell fail when re-run.
for text_value in df['Text']:
    print(text_value)

ARE ALREADY INCLUDE IN COST.
BY THE  ENGINEER . THE  CONSTRUCTION LISTED ABOVE
CONSTRUCTION CAN  PROCEED ONLY AFTER APPROVAL
DETAILS  REGARDING  HIS  CONSTRUCTION  METHODS .
COULD  RE-DESIGN  REGARDING  THE  ADEQUACY  OF  THE
H
¾ã¤§³]­p´£°e¤uµ{¥q®Ö¥i¡A¬I¤u¶O¥Î¤w¥]§t¦b¤º¡C
¼t°Ó¥i¨Ì¹ê»Ú½c²[§ï«Ø¤§»Ý¨D¦Û¦æ­«·s³]­p¡A¥B±N½Õ
TRANSITION OF DRAINAGE CHANNEL . THE  CONTRACTOR
TYPE S1-1A ,  S1-2A  REQUIRE EMBEDDING H-SECTION
STEEL AS PIN PILE ,  TO MAINTAIN STABILITY DURING THE
DUE  TO  DRAINAGE CHANNEL ,  DIAPHRAGM  WALL
TYPE S1-1A ,  S1-2A
«¬¿û§@¬°¤¤¶¡¬W¡A¥Hºû«ù±Æ¤ô½c²[¾E²¾®É¤§Ã­©w©Ê¡A
°t¦X±Æ¤ô½c²[¾E²¾¡A                                ³sÄò¾À»Ý¹w®I
12.
D25@300
Sv  @300
Sh  @300
D13
Sv  @300
Sh  @300
D16
¤ô¥­µ¬
HORIZONTAL
°Å¤Oµ¬
SHEAR
¶}«õ°¼
EXCAVATED SIDE
¾×¤g°¼
RETAINED  SIDE
««ª½µ¬
VERTICAL
°tµ¬
REINFORCEMENT
DEPTH  (m)
BELOW
47
30
30
24.9
D36@150
D36@150 + D25@150
D36@150
24.9
50
30
30
»¡©ú
µ²§ô²`«×
¶}©l²`«×
NOTES
DEPTH  (m)
TOP
50
24.9
71.1
DIAPHRAGM  WALL  TYPE  S1-1A
50
46.25
64.2
50
73.9


In [50]:
# Bucket the rows by their 'FileName' value; the loop below handles one bucket at a time.
grouped = df.groupby(by='FileName')

import re
import numpy as np
# Helper that pulls type codes out of a text string
def extract_types(text: str) -> list:
    """Return every type-code-like token found in ``text``, in order of appearance.

    A token is one uppercase letter followed by one or more digits, then an
    optional hyphen, an optional single digit and an optional trailing ``A``
    or ``B`` (for example ``S1-1A`` or ``T1``). An empty list is returned
    when nothing matches.
    """
    type_code = re.compile(r'[A-Z]\d+-?\d?[AB]?')
    return [hit.group(0) for hit in type_code.finditer(text)]

def parse_coordinate(coord_str: str) -> list:
    """Parse a coordinate string of the form ``"(x y z)"`` into ``[x, y, z]``.

    The three components are separated by whitespace and enclosed in
    parentheses at the start of the string. Each component may carry a sign
    and may use decimal or scientific notation (``-12.5``, ``+3``, ``5.``,
    ``.5``, ``1.5E-3``), so negative coordinates and tiny exponent-formatted
    values are accepted.

    Args:
        coord_str: The coordinate text, e.g. ``"(312483.61 2770000.5 0.0)"``.

    Returns:
        A list of three floats ``[x, y, z]``.

    Raises:
        ValueError: If ``coord_str`` does not start with a well-formed
            ``(x y z)`` triple.
    """
    # One signed decimal number with an optional exponent.
    number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
    match = re.match(rf'\(({number})\s+({number})\s+({number})\)', coord_str)
    if match is None:
        raise ValueError(f"無法解析坐標: {coord_str}")
    return [float(component) for component in match.groups()]
    
def find_nearest(candidate_columns: str, x: float, y: float, df: pd.DataFrame) -> int:
    """Return the position (usable with ``df.iloc``) of the row nearest to ``(x, y)``.

    Only rows whose ``candidate_columns`` value is not NA are candidates.
    Distance is the Euclidean distance between ``(x, y)`` and the first two
    components parsed from each candidate's ``CentreCoor`` string.

    The returned position is relative to ``df`` itself, not to the filtered
    candidate subset, so ``df.iloc[result]`` is the nearest row even when
    ``df`` contains rows with NA in ``candidate_columns``.

    Args:
        candidate_columns: Name of the column whose non-NA rows are candidates.
        x: Target X coordinate.
        y: Target Y coordinate.
        df: Frame holding ``candidate_columns`` and ``CentreCoor``.

    Returns:
        Zero-based row position within ``df`` of the nearest candidate.

    Raises:
        ValueError: If no row has a non-NA ``candidate_columns`` value, or if
            a candidate's ``CentreCoor`` cannot be parsed.
    """
    # Positions, relative to df, of the rows eligible as candidates.
    candidate_positions = np.flatnonzero(df[candidate_columns].notna().to_numpy())
    if candidate_positions.size == 0:
        raise ValueError(f"no rows with a non-null '{candidate_columns}' value to search")

    # Parse the (x, y) part of each candidate's coordinate string.
    candidate_xy = np.array([
        parse_coordinate(coord)[:2]
        for coord in df['CentreCoor'].iloc[candidate_positions]
    ])

    # Planar Euclidean distance from every candidate to the target.
    offsets = candidate_xy - np.array([x, y])
    distances = np.sqrt(np.sum(offsets**2, axis=1))

    # Map the winner back to its position in df.
    return int(candidate_positions[np.argmin(distances)])

# Walk through every FileName group and extract one record of wall data per group.
# NOTE(review): the loop adds helper columns to `group` in place; if pandas emits
# SettingWithCopyWarning here, an explicit `group = group.copy()` may be needed -- confirm.
# NOTE(review): several steps take `.iloc[0]` of a filtered selection and raise
# IndexError when nothing matched, which aborts the whole loop -- confirm whether
# groups without the expected labels should be skipped instead.
for name, group in grouped:
    print(f"組名: {name}")

    # List collecting the diaphragm-wall type codes found in this group
    type_name: list = []
    # Result record for this group; every field starts from a zero/empty default.
    # NOTE(review): the 'Extracted_Side' key under 'H_rebar' is filled from the rebar
    # nearest the "(EXCAVATED SIDE)" label in step 5 -- 'Excavated_Side' may have been
    # intended; confirm before renaming since it is a runtime key.
    type_data: dict = {'Depth': 0, 
                       'Thickness': 0, 
                       'Protection': 0, 
                       'H_rebar':{'Retained_Side':{'Diameter':0, 'Spacing':0}, 'Extracted_Side':{'Diameter':0, 'Spacing':0}}, 
                       'V_rebar':{},
                       'S_rebar':{},
                       'Empty_depth': 0.0,
                       'Real_depth': 0.0}

    ''' 0. 一些預抓取資料'''
    # Step 0: pre-extract helper columns used by the later steps.
    # Flag rows whose text contains "(EXCAVATED SIDE)" (parentheses required)
    pattern: str = r'\(EXCAVATED\s+SIDE\)'
    group['Extracted_Side'] = group['Text'].str.contains(pattern, regex=True)
    # Flag rows whose text contains "(RETAINED SIDE)" (parentheses required)
    pattern: str = r'\(RETAINED\s+SIDE\)'
    group['Retained_Side'] = group['Text'].str.contains(pattern, regex=True)
    # Flag rows whose text starts with "ELEVATION"
    pattern: str = r'^ELEVATION'
    group['Extracted_Elevation'] = group['Text'].str.contains(pattern, regex=True)
    # GL: decimal number following "GL" and followed by "%%P"
    # (presumably the CAD control code for the plus/minus sign -- confirm).
    # The first matching row is used; IndexError if no row matches.
    GL_pattern:str = r'GL\s+(\d+\.\d+)\s*%%P'
    group['Extracted_GL'] = group['Text'].str.extract(GL_pattern)
    group['Extracted_GL'] = group['Extracted_GL'].astype(float)
    GL: float = group['Extracted_GL'].dropna().iloc[0]
    # EL: decimal number following a leading "EL"; the smallest matched value is used
    # (NaN when no row matches).
    # NOTE(review): the pattern has no sign, so negative elevations are not matched -- confirm.
    EL_pattern = r'^EL\s+(\d+\.\d+)'
    group['Extracted_EL'] = group['Text'].str.extract(EL_pattern)
    group['Extracted_EL'] = group['Extracted_EL'].astype(float)
    EL: float = group['Extracted_EL'].dropna().min()
    # Rows whose whole text is an unsigned integer; value stored as float, NaN elsewhere
    int_pattern = r'^(\d+)$'
    group['Extracted_Int'] = group['Text'].str.extract(int_pattern)
    group['Extracted_Int'] = group['Extracted_Int'].astype(float)
    # Empty-bore ("空打") marker: any text containing "1000" followed by "%%P"
    knockout_pattern: str = r'1000\s*%%P'
    contains_knockout: pd.Series = group['Text'].str.contains(knockout_pattern, regex=True)
    knockout_exists: bool = contains_knockout.any()
    # print(f"contains empty-bore marker: {knockout_exists}")
    # Rebar: text that is exactly "D<diameter>@<spacing>".
    # zip(*...values) transposes the two-column extract result into one tuple per column;
    # both columns are then coerced to numbers and cast to nullable Int64.
    rebar_pattern:str = r'^D(\d+)@(\d+)$'
    group['Extracted_Rebar_diameter'], group['Extracted_Rebar_spacing'] = zip(*group['Text'].str.extract(rebar_pattern).values)
    group['Extracted_Rebar_diameter'] = pd.to_numeric(group['Extracted_Rebar_diameter'], errors='coerce')
    group['Extracted_Rebar_spacing'] = pd.to_numeric(group['Extracted_Rebar_spacing'], errors='coerce')
    group['Extracted_Rebar_diameter'] = group['Extracted_Rebar_diameter'].astype('Int64')
    group['Extracted_Rebar_spacing'] = group['Extracted_Rebar_spacing'].astype('Int64')
    # print(group[group['Extracted_Rebar_diameter'].notna()]['Text'])

    
    ''' 1. 萃取型號 '''
    # Step 1: extract the wall type codes.
    # Rows whose text starts with "DIAPHRAGM WALL TYPE " carry the type codes
    type_pattern: str = r'^DIAPHRAGM\s+WALL\s+TYPE\s+'
    group['Extracted_Types'] = group['Text'].str.contains(type_pattern, regex=True)
    type_rows: pd.DataFrame = group[group['Extracted_Types']==True].copy()
    type_rows['Types_list'] = type_rows['Text'].apply(extract_types)
    # Collect the codes of every matching row; de-duplicated through a set,
    # so the resulting order is not guaranteed
    for i in range(len(type_rows)):
        type_name.extend(type_rows.iloc[i]['Types_list'])
        type_name: list = list(set(type_name))
    print(f"連續壁型號: {type_name}")

    ''' 2. 萃取厚度 '''
    # Step 2: extract the wall thickness.
    # Take the "(EXCAVATED SIDE)" row with the smallest X (as a Series)
    excavated_rows: pd.DataFrame = group[group['Extracted_Side']==True].copy()
    excavated_rows['X'] = excavated_rows['CentreCoor'].apply(parse_coordinate).apply(lambda x: x[0])
    excavated_rows['Y'] = excavated_rows['CentreCoor'].apply(parse_coordinate).apply(lambda y: y[1])
    excavated_rows = excavated_rows.sort_values('X')
    excavated_row: pd.Series = excavated_rows.iloc[0]
    most_left_excavated_x: float = excavated_row['X']
    most_left_excavated_y: float = excavated_row['Y']
    # print(f"leftmost 'EXCAVATED SIDE' row X: {most_left_excavated_x}, Y: {most_left_excavated_y}")
    # Take the "(RETAINED SIDE)" row with the smallest X (as a Series)
    retained_rows: pd.DataFrame = group[group['Retained_Side']==True].copy()
    retained_rows['X'] = retained_rows['CentreCoor'].apply(parse_coordinate).apply(lambda x: x[0])
    retained_rows['Y'] = retained_rows['CentreCoor'].apply(parse_coordinate).apply(lambda y: y[1])
    retained_rows = retained_rows.sort_values('X')
    retained_row = retained_rows.iloc[0]
    most_left_retained_x = retained_row['X']
    most_left_retained_y = retained_row['Y']
    # print(f"leftmost 'RETAINED SIDE' row X: {most_left_retained_x}, Y: {most_left_retained_y}")
    # Among the pure-integer rows, keep those whose X lies strictly between the two labels
    # and whose Y lies within [retained Y - 2, excavated Y + 2]; the one with the smallest X
    # supplies the thickness.
    int_rows: pd.DataFrame = group.dropna(subset=['Extracted_Int']).copy()
    int_rows['X'] = int_rows['CentreCoor'].apply(parse_coordinate).apply(lambda x: x[0])
    int_rows['Y'] = int_rows['CentreCoor'].apply(parse_coordinate).apply(lambda y: y[1])
    int_rows = int_rows[(int_rows['X'] > most_left_excavated_x) & (int_rows['X'] < most_left_retained_x) & (int_rows['Y'] >= most_left_retained_y - 2) & (int_rows['Y'] <= most_left_excavated_y + 2)]
    int_row: pd.Series = int_rows.sort_values('X').iloc[0]
    type_data['Thickness'] = int_row['Extracted_Int']

    ''' 3. 萃取深度 '''
    # Step 3: depth = GL - EL, rounded to 2 decimals.
    # NOTE(review): this truthiness test skips a legitimate 0.0 value and lets NaN
    # through (NaN is truthy), in which case Depth becomes NaN -- confirm intent.
    if GL and EL:
        type_data['Depth'] = round((GL - EL), 2)

    ''' 4. 萃取保護層 '''
    # Step 4: protection (cover) value = integer at the start of a text followed by
    # whitespace and "CL"; the first matching row is used (IndexError if none).
    protection_pattern: str = r'^(\d+)\s+CL'
    group['Extracted_Protection'] = group['Text'].str.extract(protection_pattern)
    group['Extracted_Protection'] = group['Extracted_Protection'].astype(float)
    type_data['Protection'] = group['Extracted_Protection'].dropna().iloc[0]

    ''' 5. 萃取水平鋼筋 '''
    # Step 5: extract the horizontal rebar.
    # Take the "ELEVATION" row with the smallest X (as a Series)
    elevation_rows: pd.DataFrame = group[group['Extracted_Elevation']==True].copy()
    elevation_rows['X'] = elevation_rows['CentreCoor'].apply(parse_coordinate).apply(lambda x: x[0])
    elevation_rows['Y'] = elevation_rows['CentreCoor'].apply(parse_coordinate).apply(lambda y: y[1])
    elevation_rows = elevation_rows.sort_values('X')
    elevation_row: pd.Series = elevation_rows.iloc[0]
    # print(f"leftmost 'ELEVATION' row X: {elevation_row['X']}, Y: {elevation_row['Y']}")
    elevation_x: float = elevation_row['X']
    elevation_y: float = elevation_row['Y']
    # Find the "(EXCAVATED SIDE)" row closest to that ELEVATION row
    nearest_excavated_index: int = find_nearest('Extracted_Side', elevation_x, elevation_y, excavated_rows)
    nearest_excavated_row: pd.Series = excavated_rows.iloc[nearest_excavated_index]
    nearest_excavated_x: float = nearest_excavated_row['X']
    nearest_excavated_y: float = nearest_excavated_row['Y']
    # print(f"'EXCAVATED SIDE' row nearest to 'ELEVATION' X: {nearest_excavated_row['X']}, Y: {nearest_excavated_row['Y']}")
    # Find the "(RETAINED SIDE)" row closest to that ELEVATION row
    nearest_retained_index: int = find_nearest('Retained_Side', elevation_x, elevation_y, retained_rows)
    nearest_retained_row: pd.Series = retained_rows.iloc[nearest_retained_index]
    nearest_retained_x: float = nearest_retained_row['X']
    nearest_retained_y: float = nearest_retained_row['Y']
    # print(f"'RETAINED SIDE' row nearest to 'ELEVATION' X: {nearest_retained_row['X']}, Y: {nearest_retained_row['Y']}")
    # Find the rebar row (non-null diameter) closest to the chosen excavated-side label
    rebar_rows: pd.DataFrame = group.dropna(subset=['Extracted_Rebar_diameter']).copy()
    nearest_excavated_rebar_index: int = find_nearest('Extracted_Rebar_diameter', nearest_excavated_x, nearest_excavated_y, rebar_rows)
    nearest_excavated_rebar_row: pd.Series = rebar_rows.iloc[nearest_excavated_rebar_index]
    # print(f"rebar row nearest to 'excavated': {nearest_excavated_rebar_row['CentreCoor']}, {nearest_excavated_rebar_row['Extracted_Rebar_diameter']}, {nearest_excavated_rebar_row['Extracted_Rebar_spacing']}")
    # Find the rebar row (non-null diameter) closest to the chosen retained-side label
    nearest_retained_rebar_index: int = find_nearest('Extracted_Rebar_diameter', nearest_retained_x, nearest_retained_y, rebar_rows)
    nearest_retained_rebar_row: pd.Series = rebar_rows.iloc[nearest_retained_rebar_index]
    # print(f"rebar row nearest to 'retained': {nearest_retained_rebar_row['CentreCoor']}, {nearest_retained_rebar_row['Extracted_Rebar_diameter']}, {nearest_retained_rebar_row['Extracted_Rebar_spacing']}")
    type_data['H_rebar']['Retained_Side']['Diameter'] = nearest_retained_rebar_row['Extracted_Rebar_diameter']
    type_data['H_rebar']['Retained_Side']['Spacing'] = nearest_retained_rebar_row['Extracted_Rebar_spacing']
    type_data['H_rebar']['Extracted_Side']['Diameter'] = nearest_excavated_rebar_row['Extracted_Rebar_diameter']
    type_data['H_rebar']['Extracted_Side']['Spacing'] = nearest_excavated_rebar_row['Extracted_Rebar_spacing']

    ''' 6. 萃取垂直鋼筋、空打深度、實打深度 '''
    # Step 6: extract the vertical rebar, the empty-bore depth and the real depth.
    Depth: float = type_data['Depth']
    if Depth == 0:
        print(f'深度為0, 請檢查')
        # TODO (from the original author): raise an error here; for now the group is skipped
        continue

    V_rebar: dict = {}
    # Pure-integer rows whose 'RotationAngle' is not the string '0.0'
    # (the column holds strings as read from the CSV, hence the string comparison)
    rotated_int_rows:pd.DataFrame = group.dropna(subset=['Extracted_Int']).copy()
    rotated_int_rows = rotated_int_rows[rotated_int_rows['RotationAngle'] != '0.0']
    rotated_int_rows['X'] = rotated_int_rows['CentreCoor'].apply(parse_coordinate).apply(lambda x: x[0])
    rotated_int_rows['Y'] = rotated_int_rows['CentreCoor'].apply(parse_coordinate).apply(lambda y: y[1])
    rotated_int_rows = rotated_int_rows.sort_values('X')
    min_rotated_int_row: pd.Series = rotated_int_rows.iloc[0]
    # Keep only the rows whose X is within 0.1 of the leftmost rotated integer row
    # (the original comment said "1"; the code uses 0.1)
    rotated_int_rows = rotated_int_rows[(rotated_int_rows['X'] >= min_rotated_int_row['X'] - 0.1) & (rotated_int_rows['X'] <= min_rotated_int_row['X'] + 0.1)]
    rotated_int_rows = rotated_int_rows.sort_values('Y')
    # Collect the integer values as a list, ordered by ascending Y
    rotated_int_list: list = rotated_int_rows['Extracted_Int'].tolist() # per the original author: ordered from deep to shallow

    # Only when an empty-bore marker was found in this group
    if knockout_exists:
        # Empty-bore depth: Depth minus (sum of the values / 1000, plus 1).
        # NOTE(review): the /1000 presumably converts mm to m and the +1 is an
        # unexplained constant -- confirm both.
        type_data['Empty_depth'] = Depth - (sum(rotated_int_list)/1000 + 1)
        # Real depth: what remains of Depth after the empty-bore part
        type_data['Real_depth'] = Depth - type_data['Empty_depth']
    print(f"空打深度: {type_data['Empty_depth']}, 實打深度: {type_data['Real_depth']}")

    # Rebar rows (non-null diameter) whose 'RotationAngle' is not the string '0.0'
    rotated_rebar_rows: pd.DataFrame = group.dropna(subset=['Extracted_Rebar_diameter']).copy()
    rotated_rebar_rows = rotated_rebar_rows[rotated_rebar_rows['RotationAngle'] != '0.0']
    rotated_rebar_rows['X'] = rotated_rebar_rows['CentreCoor'].apply(parse_coordinate).apply(lambda x: x[0])
    rotated_rebar_rows['Y'] = rotated_rebar_rows['CentreCoor'].apply(parse_coordinate).apply(lambda y: y[1])
    # Keep the rows whose X is within 0.5 of int_row (the thickness row chosen in step 2)
    rotated_rebar_rows = rotated_rebar_rows[(rotated_rebar_rows['X'] >= int_row['X'] - 0.5) & (rotated_rebar_rows['X'] <= int_row['X'] + 0.5)]
    rotated_rebar_rows = rotated_rebar_rows.sort_values('Y')
    print(rotated_int_list)
    print(rotated_rebar_rows[['Text','X']])
    # Record the vertical rebar entries in ascending-Y order, keyed 0..n-1
    for i in range(len(rotated_rebar_rows)):
        V_rebar[i] = {'Diameter': rotated_rebar_rows.iloc[i]['Extracted_Rebar_diameter'], 'Spacing': rotated_rebar_rows.iloc[i]['Extracted_Rebar_spacing']}
    # (unfinished note in the original: "get num_")
    type_data['V_rebar'] = V_rebar

    ''' 7. 萃取剪力鋼筋 '''
    # Step 7: shear rebar extraction -- not implemented; 'S_rebar' stays empty.

    print(type_data)
    



組名: Q881LG10SE1470.dwg
連續壁型號: ['T1']
空打深度: 0.0, 實打深度: 0.0
[7500.0, 17000.0, 7000.0]
        Text          X
104  D13@300  312483.61
105  D13@300  312483.59
106  D13@300  312483.60
{'Depth': 31.5, 'Thickness': 1000.0, 'Protection': 75.0, 'H_rebar': {'Retained_Side': {'Diameter': 19, 'Spacing': 300}, 'Extracted_Side': {'Diameter': 19, 'Spacing': 300}}, 'V_rebar': {0: {'Diameter': 13, 'Spacing': 300}, 1: {'Diameter': 13, 'Spacing': 300}, 2: {'Diameter': 13, 'Spacing': 300}}, 'S_rebar': {}, 'Empty_depth': 0.0, 'Real_depth': 0.0}
組名: Q881LG10SE1480.dwg
連續壁型號: ['T1A']
空打深度: 17.8, 實打深度: 13.7
[7500.0, 5200.0]
        Text          X
392  D13@300  312487.14
393  D13@300  312487.11
{'Depth': 31.5, 'Thickness': 1000.0, 'Protection': 75.0, 'H_rebar': {'Retained_Side': {'Diameter': 19, 'Spacing': 300}, 'Extracted_Side': {'Diameter': 19, 'Spacing': 300}}, 'V_rebar': {0: {'Diameter': 13, 'Spacing': 300}, 1: {'Diameter': 13, 'Spacing': 300}}, 'S_rebar': {}, 'Empty_depth': 17.8, 'Real_depth': 13.7}
組名:

In [5]:
# Scratch check: run the EL pattern over the whole table and list the values it finds.
EL_pattern = r'^EL\s+(\d+\.\d+)'
df_test = df.copy()
# Extract the captured number and convert it to float in one step.
df_test['Extracted_EL'] = df_test['Text'].str.extract(EL_pattern).astype(float)
print(df_test['Extracted_EL'].dropna())

113      90.38
114      84.88
173      95.88
174      77.55
179     101.78
         ...  
3598     69.10
3644     54.10
3648     54.10
3653     89.10
3654     61.10
Name: Extracted_EL, Length: 148, dtype: float64
