In [14]:
import csv
import time
import re
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Path to the local ChromeDriver binary.
driver_path = "./chromedriver-win64/chromedriver.exe"
# Start the Chrome WebDriver. The driver location is passed through a Service
# object because the `executable_path` keyword of webdriver.Chrome was
# deprecated in Selenium 4 and removed in Selenium 4.10.
driver = webdriver.Chrome(service=Service(executable_path=driver_path))

print("작업 완료")

작업 완료


In [21]:
def fetch_insurance_history(driver, vehicle_ids, output_file, max_items=100, wait_seconds=1):
    """Scrape the insurance-history summary of each vehicle and save it as CSV.

    For every ID the accident-report page is opened and the summary fields
    are read through fixed XPaths. IDs whose page cannot be read are reported
    on stdout and skipped. All collected rows are written to *output_file*
    once the loop has finished.

    Columns written:
        id : Vehicle ID (listing ID)
        n  : Car Number
        nc : Number Change History
        oc : Owner Change History
        tl : Total Loss History
        fd : Flood Damage History
        tf : Theft History
        cm : Insurance Claim of My Car (damage to own car)
        co : Insurance Claim of Other Car (damage caused to other cars)

    Args:
        driver: Selenium WebDriver used to load the pages.
        vehicle_ids: Iterable of vehicle IDs to look up.
        output_file: Path of the CSV file to (over)write.
        max_items: Stop after this many rows have been collected;
            ``None`` disables the limit. Defaults to 100.
        wait_seconds: Seconds to wait for the car-number element to appear
            after loading a page. Defaults to 1.
    """
    # Imported here so the function does not rely on another notebook cell
    # having pulled these names in.
    from selenium.common.exceptions import NoSuchElementException, TimeoutException

    fieldnames = ["id", "n", "nc", "oc", "tl", "fd", "tf", "cm", "co"]

    # All fields live under the same container of the report page.
    base = "//*[@id='wrap']/div/div[2]/div[2]"
    car_number_xpath = f"{base}/dl/dd[1]"
    field_xpaths = {
        "nc": f"{base}/div[1]/ul/li[3]/span/span[1]",
        "oc": f"{base}/div[1]/ul/li[3]/span/span[2]",
        "tl": f"{base}/div[1]/ul/li[4]/span/span[1]",
        "fd": f"{base}/div[1]/ul/li[4]/span/span[2]",
        "tf": f"{base}/div[1]/ul/li[4]/span/span[3]",
        "cm": f"{base}/div[1]/ul/li[5]/span",
        "co": f"{base}/div[1]/ul/li[6]/span",
    }

    data = []
    for vid in vehicle_ids:
        try:
            driver.get(f"https://fem.encar.com/cars/report/accident/{vid}")
            # Wait only for the first field; the others are read immediately
            # afterwards from the same page.
            car_number = WebDriverWait(driver, wait_seconds).until(
                EC.presence_of_element_located((By.XPATH, car_number_xpath))
            ).text
            details = {"id": vid, "n": car_number}
            for key, xpath in field_xpaths.items():
                details[key] = driver.find_element(By.XPATH, xpath).text
        except (TimeoutException, NoSuchElementException):
            # The expected elements are missing from the page.
            print(f"ID '{vid}': 판매자가 보험이력을 공개하지 않았습니다.")
            continue
        except Exception as exc:
            # Stay best-effort, but do not mislabel unrelated failures
            # (network errors, browser crashes, ...) as "not disclosed".
            print(f"ID '{vid}': 보험이력 조회 중 오류가 발생했습니다 ({type(exc).__name__}: {exc})")
            continue

        data.append(details)
        if max_items is not None and len(data) >= max_items:
            # Report the number of rows actually collected (the previous
            # counter started at 1 and therefore reported one too many).
            print(f"{len(data)}개 보험 데이터 추출 완료")
            break

    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

def merge_csv_files(files, output_file):
    """Concatenate the given CSV files row-wise into a single CSV file.

    Args:
        files: Paths of the CSV files to read, in the order they should
            appear in the result.
        output_file: Path of the merged CSV file to write (UTF-8, no index
            column).
    """
    frames = []
    for path in files:
        frames.append(pd.read_csv(path))

    # A fresh 0..n-1 index replaces the per-file indexes.
    merged = pd.concat(frames, ignore_index=True)
    merged.to_csv(output_file, index=False, encoding="utf-8")
    print(f"'{output_file}'파일에 병합되어 저장되었습니다.")


# Patterns for the claim summaries scraped into the 'cm' / 'co' columns:
# "<count>회" alone, or "<count>회" followed by an amount.
_CLAIM_COUNT_RE = re.compile(r'(\d+)회')
_CLAIM_COUNT_AMOUNT_RE = re.compile(r'(\d+)회[^\d]*(\d+)')
# Marker used for a count or amount that could not be determined.
_UNKNOWN = "None"


def _extract_number(value):
    """Return the first run of digits in *value* as an int, or 0 if none."""
    match = re.search(r'\d+', str(value))
    return int(match.group()) if match else 0


def _parse_claim(value):
    """Split one claim summary into ``(count, amount)`` strings.

    "없음" (none) yields ``("0", "0")``. A summary containing "미확정"
    (unconfirmed) yields the count with an unknown amount. Anything that does
    not match, including non-string cells such as NaN, is unknown for both.
    """
    if not isinstance(value, str):
        # Empty CSV cells are read back as float NaN; substring tests on them
        # would raise TypeError.
        return _UNKNOWN, _UNKNOWN
    if value == "없음":
        return "0", "0"
    if "미확정" in value:
        match = _CLAIM_COUNT_RE.search(value)
        return (match.group(1), _UNKNOWN) if match else (_UNKNOWN, _UNKNOWN)
    # Drop thousands separators so the amount is one run of digits.
    match = _CLAIM_COUNT_AMOUNT_RE.search(value.replace(",", ""))
    return (match.group(1), match.group(2)) if match else (_UNKNOWN, _UNKNOWN)


def _split_claim_column(df, column):
    """Replace *column* of *df* with ``<column>_num`` and ``<column>_price``.

    Unknown prices are filled with the rounded mean of the known prices
    (0 when no price is known at all) and the price column is stored as int.
    The count column keeps the parsed strings, with "None" for unknown counts.
    The frame is modified in place.

    Returns:
        The integer value used to fill unknown prices.
    """
    parsed = [_parse_claim(value) for value in df[column]]
    prices = pd.Series(
        [np.nan if amount == _UNKNOWN else float(amount) for _, amount in parsed],
        index=df.index,
        dtype=float,
    )

    mean_price = prices.mean()
    # The mean is NaN when every price is unknown or the frame is empty;
    # round() would raise on it.
    fill_value = 0 if pd.isna(mean_price) else int(round(mean_price))

    df[f"{column}_num"] = [count for count, _ in parsed]
    df[f"{column}_price"] = prices.fillna(fill_value).round().astype(int)
    df.drop(columns=[column], inplace=True)
    return fill_value


def _clean_insurance_data(df):
    """Convert the scraped text columns of *df* to numeric columns in place."""
    cm_fill = _split_claim_column(df, "cm")
    print(f"대체 값 : {cm_fill}")

    print("==================== co 값 정제 =====================")

    co_fill = _split_claim_column(df, "co")
    print(f"대체값 : {co_fill}")

    # Columns that only need their first number extracted.
    for col in ("nc", "oc", "tl", "tf", "fd"):
        df[col] = df[col].apply(_extract_number)


def main():
    """Scrape insurance histories for the published vehicle IDs and clean them.

    Steps:
        1. Read the vehicle ID list from the project CSV on GitHub.
        2. Scrape the insurance history into one ``insurance_<key>.csv`` per
           entry of ``urls`` using a headless Chrome session.
        3. Merge those files into ``insurance_all.csv``.
        4. Convert the text columns to numbers and overwrite
           ``insurance_all.csv`` with the cleaned data.
    """
    # Path to the local ChromeDriver binary.
    driver_path = "./chromedriver-win64/chromedriver.exe"
    final_output_file = "insurance_all.csv"

    # Listing filter URLs per domestic brand.
    # NOTE(review): only the keys are used (to name the per-key output file);
    # the URL values are never requested and every key scrapes the same ID
    # list - confirm whether per-brand ID lists were intended.
    urls = {
        # "HYUNDAI": "https://car.encar.com/list/car?page=1&search=%7B%22type%22%3A%22car%22%2C%22action%22%3A%22(And.Hidden.N._.(C.CarType.Y._.Manufacturer.%ED%98%84%EB%8C%80.))%22%2C%22title%22%3A%22%ED%98%84%EB%8C%80%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22MobileModifiedDate%22%7D",
        # "GENESIS" : "https://car.encar.com/list/car?page=1&search=%7B%22type%22%3A%22car%22%2C%22action%22%3A%22(And.Hidden.N._.(C.CarType.Y._.Manufacturer.%EC%A0%9C%EB%84%A4%EC%8B%9C%EC%8A%A4.))%22%2C%22title%22%3A%22%EC%A0%9C%EB%84%A4%EC%8B%9C%EC%8A%A4%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22MobileModifiedDate%22%7D",
        # "KIA": "https://car.encar.com/list/car?page=1&search=%7B%22type%22%3A%22car%22%2C%22action%22%3A%22(And.Hidden.N._.(C.CarType.Y._.Manufacturer.%EA%B8%B0%EC%95%84.))%22%2C%22title%22%3A%22%EA%B8%B0%EC%95%84%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22MobileModifiedDate%22%7D",
        # "CHEVROLET": "https://car.encar.com/list/car?page=1&search=%7B%22type%22%3A%22car%22%2C%22action%22%3A%22(And.Hidden.N._.(C.CarType.Y._.Manufacturer.%EC%89%90%EB%B3%B4%EB%A0%88(GM%EB%8C%80%EC%9A%B0_).))%22%2C%22title%22%3A%22%EC%89%90%EB%B3%B4%EB%A0%88(GM%EB%8C%80%EC%9A%B0)%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22MobileModifiedDate%22%7D",
        # "RENAULT": "https://car.encar.com/list/car?page=1&search=%7B%22type%22%3A%22car%22%2C%22action%22%3A%22(And.Hidden.N._.(C.CarType.Y._.Manufacturer.%EB%A5%B4%EB%85%B8%EC%BD%94%EB%A6%AC%EC%95%84(%EC%82%BC%EC%84%B1_).))%22%2C%22title%22%3A%22%EB%A5%B4%EB%85%B8%EC%BD%94%EB%A6%AC%EC%95%84(%EC%82%BC%EC%84%B1)%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22MobileModifiedDate%22%7D",
        # "KGMOBILITY": "https://car.encar.com/list/car?page=1&search=%7B%22type%22%3A%22car%22%2C%22action%22%3A%22(And.Hidden.N._.(C.CarType.Y._.Manufacturer.KG%EB%AA%A8%EB%B9%8C%EB%A6%AC%ED%8B%B0(%EC%8C%8D%EC%9A%A9_).))%22%2C%22title%22%3A%22KG%EB%AA%A8%EB%B9%8C%EB%A6%AC%ED%8B%B0(%EC%8C%8D%EC%9A%A9)%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22MobileModifiedDate%22%7D"
        "ALL":"https://car.encar.com/list/car?page={page}&search=%7B%22action%22%3A%22(And.Hidden.N._.CarType.Y.)%22%2C%22sort%22%3A%22MobileModifiedDate%22%7D"
    }

    # Load the IDs first so a download failure does not leave a browser open.
    id_url = "https://raw.githubusercontent.com/MJU-BDP-2024/TermProject/develop/export/vehicles_id_price.csv"
    id_list = pd.read_csv(id_url)['id'].tolist()

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # do not open a browser window

    # The driver location goes through a Service object: the `executable_path`
    # keyword of webdriver.Chrome was removed in Selenium 4.10.
    driver = webdriver.Chrome(service=Service(executable_path=driver_path), options=options)

    csv_files = []
    try:
        for brand in urls:
            output_file = f"insurance_{brand}.csv"
            fetch_insurance_history(driver, id_list, output_file)
            csv_files.append(output_file)
            print("===========================================================")
    finally:
        # The browser is only needed while scraping.
        driver.quit()

    merge_csv_files(csv_files, final_output_file)

    # Re-read the merged file, convert it to numeric form and save it back.
    df = pd.read_csv(final_output_file)
    _clean_insurance_data(df)
    df.to_csv(final_output_file, index=False, encoding="utf-8")
    print(f"숫자 형식으로 변환된 결과가 '{final_output_file}'에 저장되었습니다.")

# Run the scraping and clean-up pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()


ID '37436440': 판매자가 보험이력을 공개하지 않았습니다.
ID '38296232': 판매자가 보험이력을 공개하지 않았습니다.
101개 보험 데이터 추출 완료
'insurance_all.csv'파일에 병합되어 저장되었습니다.
대체 값 : 1458864
대체값 : 768580
숫자 형식으로 변환된 결과가 'insurance_all.csv'에 저장되었습니다.
