In [14]:
import json
import re
from collections import OrderedDict

# 載入 JSON 檔案（逐行讀取 JSON 物件）
def load_json(file_path):
    """Load a line-delimited JSON file (one JSON value per line).

    Blank lines are skipped. Lines that fail to parse are reported on stdout
    and skipped, so a single corrupt record does not abort the load.

    Args:
        file_path: Path of the file to read.

    Returns:
        list: The parsed value of every valid line, in file order.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    data = []
    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also drops a
    # leading byte-order mark; with plain 'utf-8' the BOM stays glued to the
    # first line and json.loads rejects that record.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # skip blank lines
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError in line: {line}. Error: {e}")
    return data

# 數據清理的主函數
def clean_data(data):
    """Normalize raw product records into a fixed, ordered set of fields.

    Args:
        data: Iterable of dict-like records (anything exposing ``.get``).

    Returns:
        list[OrderedDict]: One entry per input record with the keys, in
        order, ``WEB``, ``B2C_LOW_PRICE``, ``PROD_DESC``, ``PROD_NAME``,
        ``PROVIDER`` and ``SHIP_PRICE``. Text fields are whitespace-stripped
        strings; price fields are floats rounded to two decimals that fall
        back to ``0.0`` when the raw value is missing, null or not numeric.
    """
    cleaned_data = []
    for item in data:
        cleaned_item = OrderedDict()

        # Page URL: trim surrounding whitespace only.
        cleaned_item['WEB'] = _coerce_text(item.get('WEB')).strip()

        # Lowest B2C price as a float with two decimals.
        cleaned_item['B2C_LOW_PRICE'] = _coerce_price(item.get('B2C_LOW_PRICE'))

        # Description: collapse repeated words and redundant whitespace.
        cleaned_item['PROD_DESC'] = clean_text(_coerce_text(item.get('PROD_DESC')))

        # Product name: trimmed and title-cased. Note that str.title()
        # capitalizes the letter after any non-letter (e.g. apostrophes).
        cleaned_item['PROD_NAME'] = _coerce_text(item.get('PROD_NAME')).strip().title()

        cleaned_item['PROVIDER'] = _coerce_text(item.get('PROVIDER')).strip()
        cleaned_item['SHIP_PRICE'] = _coerce_price(item.get('SHIP_PRICE'))

        cleaned_data.append(cleaned_item)
    return cleaned_data


def _coerce_text(value):
    """Return ``value`` as a string; ``None`` (missing key or JSON null) becomes ''."""
    if value is None:
        return ''
    return value if isinstance(value, str) else str(value)


def _coerce_price(value):
    """Return ``value`` as a float rounded to two decimals, or 0.0 if not numeric."""
    try:
        return round(float(value), 2)
    except (TypeError, ValueError, OverflowError):
        # TypeError covers None/list/dict, ValueError covers non-numeric
        # strings, OverflowError covers integers too large for a float.
        return 0.0

# Text cleanup helper: collapses immediately repeated words and redundant whitespace
def clean_text(text):
    """Collapse immediately repeated words and redundant whitespace.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` with any run of the same word separated by whitespace
        (e.g. ``"the the the"``) reduced to a single occurrence, every
        whitespace run replaced by one space, and no leading/trailing
        whitespace.
    """
    # In a raw string each regex escape takes a single backslash; doubled
    # backslashes would match literal '\' characters and never dedupe.
    text = re.sub(r'(\b\w+\b)(\s+\1\b)+', r'\1', text)
    # split() with no argument splits on any whitespace run and drops empties.
    text = ' '.join(text.split())
    return text.strip()

# 儲存清理後的 JSON
def save_json(data, output_path):
    """Serialize ``data`` as pretty-printed JSON into ``output_path``.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are emitted as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: Any JSON-serializable object.
        output_path: Destination path; an existing file is overwritten.
    """
    dump_options = {'indent': 4, 'ensure_ascii': False}
    with open(output_path, 'w', encoding='utf-8') as out_handle:
        json.dump(data, out_handle, **dump_options)

# 主程式執行流程
def main(input_file='./etkt-20240919(1).json', output_file='./cleaned_data.json'):
    """Run the load -> clean -> save pipeline and report the outcome on stdout.

    Args:
        input_file: Path of the line-delimited JSON file to read. Defaults to
            the path that used to be hard-coded in the body.
        output_file: Path the cleaned records are written to.
    """
    data = load_json(input_file)
    if not data:
        # Nothing parsed: the file was empty or every line failed to decode.
        print('無法載入資料，請檢查 JSON 檔案格式。')
        return

    cleaned_data = clean_data(data)
    save_json(cleaned_data, output_file)
    print(f'數據清理完成，已儲存至 {output_file}')

# Entry point: run the pipeline when this code executes as the top-level
# module (a script run, or a notebook cell, where __name__ is "__main__").
if __name__ == "__main__":
    main()


數據清理完成，已儲存至 ./cleaned_data.json


In [15]:
import json
import re
from collections import OrderedDict

# 載入 JSON 檔案（逐行讀取 JSON 物件）
def load_json(file_path):
    """Load a line-delimited JSON file (one JSON value per line).

    Blank lines are skipped. Lines that fail to parse are reported on stdout
    and skipped, so a single corrupt record does not abort the load.

    Args:
        file_path: Path of the file to read.

    Returns:
        list: The parsed value of every valid line, in file order.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    data = []
    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also drops a
    # leading byte-order mark; with plain 'utf-8' the BOM stays glued to the
    # first line and json.loads rejects that record.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # skip blank lines
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError in line: {line}. Error: {e}")
    return data

# 數據清理的主函數
def clean_data(data):
    """Normalize raw product records into a fixed, ordered set of fields.

    Args:
        data: Iterable of dict-like records (anything exposing ``.get``).

    Returns:
        list[OrderedDict]: One entry per input record with the keys, in
        order, ``WEB``, ``B2C_LOW_PRICE``, ``PROD_DESC``, ``PROD_NAME``,
        ``PROVIDER`` and ``SHIP_PRICE``. Text fields are whitespace-stripped
        strings; price fields are floats rounded to two decimals that fall
        back to ``0.0`` when the raw value is missing, null or not numeric.
    """
    cleaned_data = []
    for item in data:
        cleaned_item = OrderedDict()

        # Page URL: trim surrounding whitespace only.
        cleaned_item['WEB'] = _coerce_text(item.get('WEB')).strip()

        # Lowest B2C price as a float with two decimals.
        cleaned_item['B2C_LOW_PRICE'] = _coerce_price(item.get('B2C_LOW_PRICE'))

        # Description: collapse repeated words and redundant whitespace.
        cleaned_item['PROD_DESC'] = clean_text(_coerce_text(item.get('PROD_DESC')))

        # Product name: trimmed and title-cased. Note that str.title()
        # capitalizes the letter after any non-letter (e.g. apostrophes).
        cleaned_item['PROD_NAME'] = _coerce_text(item.get('PROD_NAME')).strip().title()

        cleaned_item['PROVIDER'] = _coerce_text(item.get('PROVIDER')).strip()
        cleaned_item['SHIP_PRICE'] = _coerce_price(item.get('SHIP_PRICE'))

        cleaned_data.append(cleaned_item)
    return cleaned_data


def _coerce_text(value):
    """Return ``value`` as a string; ``None`` (missing key or JSON null) becomes ''."""
    if value is None:
        return ''
    return value if isinstance(value, str) else str(value)


def _coerce_price(value):
    """Return ``value`` as a float rounded to two decimals, or 0.0 if not numeric."""
    try:
        return round(float(value), 2)
    except (TypeError, ValueError, OverflowError):
        # TypeError covers None/list/dict, ValueError covers non-numeric
        # strings, OverflowError covers integers too large for a float.
        return 0.0

# Text cleanup helper: collapses immediately repeated words and redundant whitespace
def clean_text(text):
    """Collapse immediately repeated words and redundant whitespace.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` with any run of the same word separated by whitespace
        reduced to one occurrence and every whitespace run replaced by a
        single space, without leading or trailing whitespace.
    """
    # A word followed one or more times by whitespace plus the same word.
    repeated_word = re.compile(r'(\b\w+\b)(\s+\1\b)+')
    deduplicated = repeated_word.sub(r'\1', text)
    # Splitting on any whitespace and re-joining normalizes all spacing.
    tokens = deduplicated.split()
    return ' '.join(tokens).strip()

# 儲存清理後的 JSON
def save_json(data, output_path):
    """Serialize ``data`` as pretty-printed JSON into ``output_path``.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are emitted as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: Any JSON-serializable object.
        output_path: Destination path; an existing file is overwritten.
    """
    dump_options = {'indent': 4, 'ensure_ascii': False}
    with open(output_path, 'w', encoding='utf-8') as out_handle:
        json.dump(data, out_handle, **dump_options)

# 主程式執行流程
def main(input_file='./etkt-20240919(1).json', output_file='./cleaned_data.json'):
    """Run the load -> clean -> save pipeline and report the outcome on stdout.

    Args:
        input_file: Path of the line-delimited JSON file to read. Defaults to
            the path that used to be hard-coded in the body.
        output_file: Path the cleaned records are written to.
    """
    data = load_json(input_file)
    if not data:
        # Nothing parsed: the file was empty or every line failed to decode.
        print('無法載入資料，請檢查 JSON 檔案格式。')
        return

    cleaned_data = clean_data(data)
    save_json(cleaned_data, output_file)
    print(f'數據清理完成，已儲存至 {output_file}')
    # Record counts before and after cleaning.
    print(f'未清理資料筆數: {len(data)}，清理後資料筆數: {len(cleaned_data)}')

# Entry point: run the pipeline when this code executes as the top-level
# module (a script run, or a notebook cell, where __name__ is "__main__").
if __name__ == "__main__":
    main()

數據清理完成，已儲存至 ./cleaned_data.json
未清理資料筆數: 1039，清理後資料筆數: 1039
