### 資料整理

In [5]:
import json
import re
from collections import OrderedDict

# 載入 JSON 檔案（逐行讀取 JSON 物件）
def load_json(file_path):
    """Read a JSON Lines file and return the parsed objects as a list.

    The file is read as UTF-8, one JSON document per line. Lines that are
    empty after stripping whitespace are skipped. A line that is not valid
    JSON is reported on stdout and left out of the result; it does not
    abort the load.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            line = raw.strip()
            if line:
                try:
                    parsed = json.loads(line)
                except json.JSONDecodeError as e:
                    # Report the bad line and keep going with the next one.
                    print(f"JSONDecodeError in line: {line}. Error: {e}")
                else:
                    records.append(parsed)
    return records

# 數據清理的主函數
def _as_text(value):
    """Return *value* as a string; None (e.g. a JSON null) becomes ''."""
    if value is None:
        return ''
    return value if isinstance(value, str) else str(value)


def _as_price(value):
    """Return *value* as a float rounded to two decimals, or 0.0 if unparsable."""
    try:
        return round(float(value), 2)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / list / dict; ValueError: non-numeric string;
        # OverflowError: an int too large to represent as a float.
        return 0.0


def clean_data(data):
    """Normalise a list of raw product records.

    Each input record is read with ``.get`` so missing keys are tolerated.
    The result is a new list of ``OrderedDict`` objects, one per input
    record, with keys in this order:

    * ``WEB``           - URL with surrounding whitespace removed.
    * ``B2C_LOW_PRICE`` - float rounded to two decimals (0.0 if unparsable).
    * ``PROD_DESC``     - description passed through ``clean_text``.
    * ``PROD_NAME``     - stripped and title-cased name.
    * ``PROVIDER``      - provider with surrounding whitespace removed.
    * ``SHIP_PRICE``    - float rounded to two decimals (0.0 if unparsable).

    Text fields that are missing or None become ''; non-string values are
    converted with ``str`` instead of raising.
    """
    cleaned_data = []
    for item in data:
        cleaned_item = OrderedDict()
        cleaned_item['WEB'] = _as_text(item.get('WEB')).strip()
        cleaned_item['B2C_LOW_PRICE'] = _as_price(item.get('B2C_LOW_PRICE', 0))
        # Coerce to str before calling clean_text so a null description
        # cannot reach the regex.
        cleaned_item['PROD_DESC'] = clean_text(_as_text(item.get('PROD_DESC')))
        # str.title() capitalises the first letter of every word and
        # lowercases the remaining letters.
        cleaned_item['PROD_NAME'] = _as_text(item.get('PROD_NAME')).strip().title()
        cleaned_item['PROVIDER'] = _as_text(item.get('PROVIDER')).strip()
        cleaned_item['SHIP_PRICE'] = _as_price(item.get('SHIP_PRICE', 0))
        cleaned_data.append(cleaned_item)
    return cleaned_data

# 文字清理函數：去除特殊符號、多餘空格、重複的文字片段等
def clean_text(text):
    """Collapse repeated words and redundant whitespace in *text*.

    A word that is immediately repeated one or more times, separated only
    by whitespace (e.g. ``"the the cat"``), is reduced to a single
    occurrence. The comparison is case-sensitive. All runs of whitespace
    are then collapsed to a single space and leading/trailing whitespace
    is removed.

    None is treated as an empty string and other non-string values are
    converted with ``str`` so that null or numeric fields do not raise.
    """
    if text is None:
        return ''
    if not isinstance(text, str):
        text = str(text)
    # \1 refers back to the first captured word; the trailing \b stops
    # "abc abcd" from being treated as a repetition of "abc".
    text = re.sub(r'(\b\w+\b)(\s+\1\b)+', r'\1', text)
    # split() with no arguments drops leading/trailing whitespace too, so
    # no extra strip() is needed after the join.
    return ' '.join(text.split())

# 儲存清理後的 JSON
def save_json(data, output_path):
    """Serialise *data* to *output_path* as UTF-8 JSON.

    The output is indented by four spaces and non-ASCII characters are
    written as-is rather than being escaped. An existing file at
    *output_path* is overwritten.
    """
    with open(output_path, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=4)

# 主程式執行流程
def main():
    """Run the cleaning pipeline: load, clean, save, and report record counts.

    If ``load_json`` returns an empty list, nothing is written and a
    message asking the user to check the JSON file is printed instead.
    """
    input_file = './etkt-20240919(1).json'  # replace with the path to your JSON file
    output_file = './cleaned_data.json'

    data = load_json(input_file)
    if not data:
        # Nothing was loaded, so there is nothing to clean or save.
        print('無法載入資料，請檢查 JSON 檔案格式。')
        return

    cleaned_data = clean_data(data)
    save_json(cleaned_data, output_file)
    print(f'數據清理完成，已儲存至 {output_file}')
    print(f'未清理資料筆數: {len(data)}，清理後資料筆數: {len(cleaned_data)}')

# Run the cleaning pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

JSONDecodeError in line: {"WEB": "https://travel.liontravel.com/detail?NormGroupID=cad20e77-1ca1-42db-a22f-15cf5be9823f&mtl=feebee&utm_source=feebee&utm_medium=referral&utm_campaign=tour", "B2C_LOW_PRICE": 19800.0, "PROD_DESC": "※報價不含：機票、門票、浮"https://travel.liontravel.com/detail?NormGroupID=751ef512-6914-45fd-b060-a8acef3c96c7&mtl=feebee&utm_source=feebee&utm_medium=referral&utm_campaign=tour", "B2C_LOW_PRICE": 25499.0, "PROD_DESC": ":snowman::snowman::snowman: 慢遊首爾滑雪半自助好好玩 :snowman::snowman::snowman::eight-spoked_asterisk::eight-spoked_asterisk: 一日自由活動｜不受拘束．無時間壓力．想逛哪裡就逛哪裡！ :eight-spoked_asterisk::eight-spoked_asterisk::eight-spoked_asterisk::eight-spoked_asterisk: 安排入住近明洞飯店兩晚｜充份的自由逛街時間．想買什麼就買什麼！ :eight-spoked_asterisk::eight-spoked_asterisk::eight-spoked_asterisk::eight-spoked_asterisk: 冬季限定滑雪體驗｜銀白雪世界．浪漫追雪趣．滑雪會上癮！ :eight-spoked_asterisk::eight-spoked_asterisk::eight-spoked_asterisk::eight-spoked_asterisk: 結合當地文化與美食｜讓旅程更加豐富多彩！ :eight-spoked_asterisk::eight-spoked_asterisk:冬季限定滑雪包含雪具：雪鞋

### 資料清洗

#### 去除重複項目

In [6]:
import json

# Load the cleaned records written by the previous step.
with open('./cleaned_data.json', 'r', encoding='utf-8') as src:
    data = json.load(src)

# Report how many records there are before de-duplication.
original_count = len(data)
print(f"原始資料筆數: {original_count}")

# Dicts are unhashable, so each record is keyed by a tuple of its sorted
# (key, value) pairs. setdefault keeps only the first record seen for each
# key, and dict insertion order preserves the original record order.
first_by_key = {}
for record in data:
    first_by_key.setdefault(tuple(sorted(record.items())), record)
cleaned_data = list(first_by_key.values())

# Report how many records remain and how many duplicates were dropped.
cleaned_count = len(cleaned_data)
print(f"清理後資料筆數: {cleaned_count}")
print(f"刪除了 {original_count - cleaned_count} 筆重複資料")

# Write the de-duplicated records to a new JSON file.
with open('./cleaned_data 2.json', 'w', encoding='utf-8') as dst:
    json.dump(cleaned_data, dst, ensure_ascii=False, indent=4)

print("重複項目已去除，清理後的資料已存檔。")

原始資料筆數: 4189
清理後資料筆數: 4100
刪除了 89 筆重複資料
重複項目已去除，清理後的資料已存檔。


#### 刪除缺失值

In [7]:
import json

# Load the de-duplicated records.
with open('./cleaned_data 2.json', 'r', encoding='utf-8') as src:
    data = json.load(src)

# A record whose PROVIDER is the empty string and whose SHIP_PRICE is 0.0 is
# treated as having no real value for either field, so both keys are removed.
# The records themselves are kept, so the record count does not change.
for item in data:
    provider_blank = item.get("PROVIDER") == ""
    shipping_zero = item.get("SHIP_PRICE") == 0.0
    if provider_blank and shipping_zero:
        for key in ("PROVIDER", "SHIP_PRICE"):
            del item[key]

# Report the number of records after processing.
print(f"處理後資料筆數: {len(data)}")

# Write the processed records to a new JSON file.
with open('./cleaned_data 3.json', 'w', encoding='utf-8') as dst:
    json.dump(data, dst, ensure_ascii=False, indent=4)

print("已根據條件清理資料並儲存至新檔案。")


處理後資料筆數: 4100
已根據條件清理資料並儲存至新檔案。
