In [None]:
import os
import pandas as pd
import chardet
import re
import numpy as np


# Resolve the project layout relative to the current working directory:
# the project root is taken to be two directory levels above it.
current_dir = os.getcwd()
projekt_root = os.path.abspath(os.path.join(current_dir, "..", ".."))

# Cleaned CSV files are written to <project root>/cleaned_data; create the
# directory up front (no error if it already exists).
cleaned_data_dir = os.path.join(projekt_root, "cleaned_data")
os.makedirs(cleaned_data_dir, exist_ok=True)

# Collect every entry of the working directory whose name ends with ".csv".
csv_files = list(filter(lambda entry: entry.endswith(".csv"), os.listdir()))


def clean_salary(salary):
    """Normalise a single salary cell.

    Rules, applied in order:

    * A plain ``int``/``float`` that is not NaN is returned unchanged.
    * Missing values, the empty string and the placeholders
      "not provided", "unknown" and "n/a" (case-insensitive, surrounding
      whitespace ignored) become the string ``"unknown"``.
    * Otherwise the value is converted to text, ``$`` and ``,`` are removed
      and the numbers in it are extracted: exactly two numbers yield their
      mean, exactly one number yields that number (both as ``float``), and
      any other count yields ``"unknown"``.
    """
    is_plain_number = isinstance(salary, (int, float))
    if is_plain_number and not np.isnan(salary):
        return salary

    placeholders = ("not provided", "unknown", "n/a")
    if pd.isna(salary) or salary == "" or str(salary).strip().lower() in placeholders:
        return "unknown"

    # Drop currency symbols and thousands separators before extracting numbers.
    text = re.sub(r"[\$,]", "", str(salary))
    numbers = [float(token) for token in re.findall(r"\d+\.\d+|\d+", text)]

    if len(numbers) not in (1, 2):
        return "unknown"
    if len(numbers) == 1:
        return numbers[0]

    # Two numbers are treated as a low/high range and collapsed to the midpoint.
    lower, upper = numbers
    return (lower + upper) / 2


def find_salary_column(df):
    """Return the label of the first salary-like column in ``df``, or ``None``.

    A column matches when its lower-cased label contains one of the salary
    keywords. Labels are passed through ``str`` first because column labels
    are not necessarily strings (e.g. integer headers), and calling
    ``.lower()`` on such a label directly would raise ``AttributeError``.
    """
    salary_keywords = ("salary", "salaries", "base salary")
    for col in df.columns:
        label = str(col).lower()
        if any(keyword in label for keyword in salary_keywords):
            return col
    return None


# Clean every discovered CSV file and write the result to ``cleaned_data_dir``.
for file in csv_files:
    print(f" 正在处理文件: {file}")

    # Only the first 10000 bytes are sampled for encoding detection, so the
    # guess can be wrong when unusual bytes appear later in the file; the
    # fallback encodings below cover that case.
    with open(file, "rb") as f:
        raw_data = f.read(10000)
    detected_encoding = chardet.detect(raw_data)["encoding"]

    print(f" 检测到编码: {detected_encoding}")

    try:
        df = pd.read_csv(file, encoding=detected_encoding)
    except Exception:
        # Deliberately broad: any read failure triggers the fallback attempts
        # instead of aborting the whole batch.
        print(f"读取 {file} 失败，使用 {detected_encoding}，尝试其他编码...")

        encoding_attempts = ["utf-8", "ISO-8859-1", "Windows-1252"]
        success = False

        for enc in encoding_attempts:
            try:
                df = pd.read_csv(file, encoding=enc)
            except Exception as e:
                print(f" 读取 {file} 失败，尝试 {enc} 编码: {e}")
            else:
                print(f" 成功使用 {enc} 读取 {file}")
                success = True
                break  # stop at the first encoding that works

        if not success:
            print(f" 文件 {file} 无法读取，跳过处理。\n")
            continue  # skip files that cannot be read with any encoding
    else:
        print(f"成功使用 {detected_encoding} 读取 {file}")

    salary_col = find_salary_column(df)

    if salary_col is not None:
        print(f" 发现薪资列: {salary_col}")
        df[salary_col] = df[salary_col].apply(clean_salary)
    else:
        print(" 未找到薪资列，跳过薪资处理。")

    # The cleaned copy keeps the original file name with a "cleaned_" prefix
    # and is always written as UTF-8.
    cleaned_filename = f"cleaned_{file}"
    cleaned_filepath = os.path.join(cleaned_data_dir, cleaned_filename)

    df.to_csv(cleaned_filepath, index=False, encoding="utf-8")

    print(f" 清理完成，已保存至: {cleaned_filepath}\n")


print(f" 所有 CSV 文件已处理完毕，清理后的文件存放在 `{cleaned_data_dir}` 文件夹中！")


📂 正在处理文件: global-salaries-in-ai-ml-data-science.csv
📌 检测到编码: ascii
✅ 成功使用 ascii 读取 global-salaries-in-ai-ml-data-science.csv
📌 发现薪资列: salary
✅ 清理完成，已保存至: G:\Nextcloud\FSU_Cloud\Big Data\Projekt\cleaned_data\cleaned_global-salaries-in-ai-ml-data-science.csv

🎉 所有 CSV 文件已处理完毕，清理后的文件存放在 `G:\Nextcloud\FSU_Cloud\Big Data\Projekt\cleaned_data` 文件夹中！
