In [5]:
import os
import pandas as pd
import chardet
import re
import numpy as np

# 1️⃣ 获取当前 Notebook 所在目录
current_dir = os.getcwd()

# 2️⃣ 获取 `Projekt` 根目录（即 `raw_data` 的上一级目录）
projekt_root = os.path.abspath(os.path.join(current_dir, "..", ".."))

# 3️⃣ 定义 `cleaned_data` 目录的正确路径
cleaned_data_dir = os.path.join(projekt_root, "cleaned_data")

# 4️⃣ 确保 `cleaned_data` 目录存在
os.makedirs(cleaned_data_dir, exist_ok=True)

# 5️⃣ 获取当前目录下所有 CSV 文件
csv_files = [f for f in os.listdir() if f.endswith(".csv")]

# 6️⃣ 定义薪资清理函数
def clean_salary(salary):
    if pd.isna(salary) or salary == "" or salary == "Not Provided":
        return "unknown", "unknown", "unknown", "unknown"

    # 移除 $ 和 , 只保留数字、"." 和 "-"（适用于范围）
    salary = re.sub(r"[\$,]", "", str(salary))

    # 提取数值范围（如 "80,000 - 110,000" → [80000, 110000]）
    salary_range = re.findall(r"\d+\.\d+|\d+", salary)

    # 解析薪资范围
    if len(salary_range) == 2:
        low_salary = float(salary_range[0])
        high_salary = float(salary_range[1])
    elif len(salary_range) == 1:
        low_salary = high_salary = float(salary_range[0])
    else:
        return "unknown", "unknown", "unknown", "unknown"

    # 识别薪资单位并转换
    if "por año" in salary.lower():  # 年薪
        return low_salary, high_salary, (low_salary + high_salary) / 2, "year"
    elif "por hora" in salary.lower():  # 时薪转换为年薪（假设 2080 工作小时/年）
        return low_salary * 2080, high_salary * 2080, ((low_salary + high_salary) / 2) * 2080, "hour"
    elif "por mes" in salary.lower():  # 月薪转换为年薪
        return low_salary * 12, high_salary * 12, ((low_salary + high_salary) / 2) * 12, "month"
    else:
        return "unknown", "unknown", "unknown", "unknown"

# 7️⃣ 处理每个 CSV 文件
for file in csv_files:
    print(f"📂 正在处理文件: {file}")

    # 读取文件编码
    with open(file, "rb") as f:
        raw_data = f.read(10000)  # 读取前 10000 字节检测编码
        detected_encoding = chardet.detect(raw_data)["encoding"]

    print(f"📌 检测到编码: {detected_encoding}")

    # 8️⃣ 使用检测到的编码格式进行读取
    try:
        df = pd.read_csv(file, encoding=detected_encoding)
        print(f"✅ 成功使用 {detected_encoding} 读取 {file}")
    except Exception as e:
        print(f"❌ 读取 {file} 失败，使用 {detected_encoding}，尝试其他编码...")

        # 依次尝试 utf-8、ISO-8859-1、Windows-1252
        encoding_attempts = ["utf-8", "ISO-8859-1", "Windows-1252"]
        success = False

        for enc in encoding_attempts:
            try:
                df = pd.read_csv(file, encoding=enc)
                print(f"✅ 成功使用 {enc} 读取 {file}")
                success = True
                break  # 成功读取，跳出循环
            except Exception as e:
                print(f"❌ 读取 {file} 失败，尝试 {enc} 编码: {e}")

        if not success:
            print(f"⛔ 文件 {file} 无法读取，跳过处理。\n")
            continue  # 跳过无法读取的文件

    # 9️⃣ 识别数据集中的薪资列（自动匹配列名大小写）
    def find_salary_columns(df):
        salary_keywords = ["salary", "low_salary", "high_salary", "mean_salary"]
        found_cols = {col.lower(): col for col in df.columns if any(keyword in col.lower() for keyword in salary_keywords)}
        return found_cols

    salary_col_map = find_salary_columns(df)

    # 🔟 处理 `Salary` 数据
    if "salary" in salary_col_map:
        salary_col = salary_col_map["salary"]
        print(f"📌 发现薪资列: {salary_col}")

        # 计算薪资数据并创建新列
        df[["Low_Salary", "High_Salary", "Mean_Salary", "Salary_Frequency"]] = df[salary_col].apply(lambda x: pd.Series(clean_salary(x)))

    else:
        print(f"⚠️ 未找到薪资列，跳过薪资处理。")

    # 1️⃣1️⃣ 处理缺失值（填充 "unknown"）
    missing_cols = ["Low_Salary", "High_Salary", "Mean_Salary", "Salary_Frequency"]
    for col in missing_cols:
        if col not in df.columns:
            df[col] = "unknown"

    df.fillna("unknown", inplace=True)

    # 1️⃣2️⃣ 确保 `NaN` 转换为 `"unknown"`
    for col in missing_cols:
        df[col] = df[col].apply(lambda x: "unknown" if pd.isna(x) or x == "" else x)

    # 1️⃣3️⃣ 生成清理后文件名
    cleaned_filename = f"cleaned_{file}"
    cleaned_filepath = os.path.join(cleaned_data_dir, cleaned_filename)

    # 1️⃣4️⃣ 保存清理后的 CSV 文件
    df.to_csv(cleaned_filepath, index=False, encoding="utf-8")

    print(f"✅ 清理完成，已保存至: {cleaned_filepath}\n")

# 1️⃣5️⃣ 处理完成提示
print(f"🎉 所有 CSV 文件已处理完毕，清理后的文件存放在 `{cleaned_data_dir}` 文件夹中！")


📂 正在处理文件: global_jobs_salaries_2024.csv
📌 检测到编码: ascii
❌ 读取 global_jobs_salaries_2024.csv 失败，使用 ascii，尝试其他编码...
✅ 成功使用 utf-8 读取 global_jobs_salaries_2024.csv
📌 发现薪资列: Salary
✅ 清理完成，已保存至: G:\Nextcloud\FSU_Cloud\Big Data\Projekt\cleaned_data\cleaned_global_jobs_salaries_2024.csv

🎉 所有 CSV 文件已处理完毕，清理后的文件存放在 `G:\Nextcloud\FSU_Cloud\Big Data\Projekt\cleaned_data` 文件夹中！
