In [1]:
import pandas as pd
import numpy as np
import re

# ✅ 1️⃣ 读取数据
# Input locations, relative to the notebook's working directory.
job_profiles_file = "./Types of Different Job Profiles.txt"
salary_dataset_file = "./Salary_Dataset_with_Extra_Features.csv"
software_salaries_file = "./Software_Professional_Salaries.csv"

# Reference job titles: one per line, trimmed and lower-cased; blank lines are
# skipped. A set is used so duplicates in the file collapse to one entry.
with open(job_profiles_file, mode="r", encoding="utf-8") as f:
    job_titles = {
        stripped.lower()
        for stripped in (raw_line.strip() for raw_line in f)
        if stripped
    }

# Load both salary tables in one pass over the two CSV paths.
df_salary, df_software = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (salary_dataset_file, software_salaries_file)
)

# Show the raw column headers of each table before they are normalised.
print("Columns in df_salary:", list(df_salary.columns))
print("Columns in df_software:", list(df_software.columns))

Columns in df_salary: ['Rating', 'Company Name', 'Job Title', 'Salary', 'Salaries Reported', 'Location', 'Employment Status', 'Job Roles']
Columns in df_software: ['Rating', 'Company Name', 'Job Title', 'Salary', 'Salaries Reported', 'Location']


In [2]:
# Normalise the headers of both tables to snake_case:
# lower-case, trim surrounding whitespace, then turn spaces into underscores.
df_salary.columns = (
    df_salary.columns
    .str.lower()
    .str.strip()
    .str.replace(" ", "_")
)
df_software.columns = (
    df_software.columns
    .str.lower()
    .str.strip()
    .str.replace(" ", "_")
)

In [3]:


# ✅ 3️⃣ 标准化职位名称（匹配 `Types of Different Job Profiles.txt`）
def standardize_job_title(title, known_titles=None):
    """Map a raw job title onto a canonical title from the reference list.

    The raw title is lower-cased and trimmed, then every reference title that
    occurs as a substring of it is collected. The longest such reference title
    wins (ties are broken alphabetically), so the result does not depend on the
    iteration order of the reference collection.

    Parameters
    ----------
    title : str
        Raw job title. Non-string values (e.g. NaN / None for missing data)
        are returned unchanged instead of raising.
    known_titles : iterable of str, optional
        Lower-cased reference titles to match against. Defaults to the
        module-level ``job_titles`` set.

    Returns
    -------
    str
        The best-matching reference title in Title Case, or the cleaned input
        in Title Case when no reference title matches.
    """
    if not isinstance(title, str):
        # Missing values have no .lower(); pass them through untouched.
        return title

    if known_titles is None:
        known_titles = job_titles

    cleaned = title.lower().strip()

    # Empty reference entries would match every title, so ignore them.
    candidates = [job for job in known_titles if job and job in cleaned]
    if candidates:
        # Prefer the most specific (longest) match; the alphabetical tie-break
        # keeps the choice stable across runs regardless of set ordering.
        best = min(candidates, key=lambda job: (-len(job), job))
        return best.title()

    return cleaned.title()

# Replace the raw job titles in both tables with their standardised form.
df_salary["job_title"] = df_salary["job_title"].apply(standardize_job_title)
df_software["job_title"] = df_software["job_title"].apply(standardize_job_title)

# Tag every row with the currency its salary is recorded in.
# The raw amounts are treated as Indian rupees by the conversion step further
# down (INR -> USD rate), so the label must be INR here; labelling them USD
# would leave the frames in a contradictory intermediate state.
df_salary["salary_currency"] = "INR"
df_software["salary_currency"] = "INR"


In [4]:
# ✅ 确保 `salary_currency` 反映真实情况（所有工资默认是 INR）
# Both tables carry the same currency label: every salary is recorded in INR.
df_salary["salary_currency"] = df_software["salary_currency"] = "INR"
# ✅ 统一工资单位（INR → USD）
def convert_salary_to_usd(row):
    """Return the salary of one row expressed in US dollars.

    ``row`` must provide ``"salary"`` and ``"salary_currency"`` by key lookup.
    The salary is multiplied by the rate registered for its currency; a
    currency without a registered rate uses a multiplier of 1, i.e. the amount
    is returned as-is.
    """
    usd_per_unit = {"INR": 0.012}  # 1 INR is approximately 0.012 USD
    amount = row["salary"]
    currency = row["salary_currency"]
    if currency in usd_per_unit:
        factor = usd_per_unit[currency]
    else:
        factor = 1
    return amount * factor

# Add a USD salary column to both tables (row-wise, via the currency label).
df_salary["salary_usd"] = df_salary.apply(convert_salary_to_usd, axis=1)
df_software["salary_usd"] = df_software.apply(convert_salary_to_usd, axis=1)


def _trim_to_quantile_range(frame, column, lower=0.05, upper=0.95):
    """Return a copy of ``frame`` restricted to the central quantile band.

    Rows are kept when ``frame[column]`` lies between its ``lower`` and
    ``upper`` quantiles, both bounds inclusive. The result is an independent
    copy, so assigning columns on it later does not trigger pandas'
    SettingWithCopyWarning.
    """
    low, high = frame[column].quantile([lower, upper])
    return frame[frame[column].between(low, high)].copy()


# Drop extreme salaries: keep only the 5th-95th percentile band of each table.
# The USD column is already correct for the surviving rows, so it is not
# recomputed after the filter.
# NOTE: this rebinds df_salary / df_software, so re-running the cell trims the
# already-trimmed data again; re-run from the load cell for a clean result.
df_salary = _trim_to_quantile_range(df_salary, "salary_usd")
df_software = _trim_to_quantile_range(df_software, "salary_usd")


In [5]:


# ✅ 1️⃣1️⃣ 合并两个数据集
# Stack the two cleaned tables into one frame with a fresh 0..n-1 index.
# Columns present in only one of the tables are kept and left empty (NaN)
# for the rows that come from the other table.
df_combined = pd.concat(objs=[df_salary, df_software], axis=0, ignore_index=True)

# The CSV export and the preview of df_combined are done in the next cell.


In [6]:
import os

# 获取当前 Notebook (`clean_data.ipynb`) 所在目录
current_dir = os.getcwd()

# The project root is taken to be two directory levels above the working
# directory; the cleaned output lives in its `cleaned_data` sub-folder.
projekt_root = os.path.abspath(os.path.join(current_dir, "..", ".."))
cleaned_data_dir = os.path.join(projekt_root, "cleaned_data")

# Create the output folder on first use; no error if it already exists.
os.makedirs(cleaned_data_dir, exist_ok=True)

cleaned_file_path = os.path.join(cleaned_data_dir, "cleaned_combined_salaries.csv")

# Write the combined table without the positional index column.
df_combined.to_csv(cleaned_file_path, index=False, encoding="utf-8")
print(f"✅ 数据清理完成，文件已保存至 {cleaned_file_path}")

# Preview the cleaned data.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that emitted a stray "None" line after the report).
df_combined.info()
print(df_combined.head())

✅ 数据清理完成，文件已保存至 G:\Nextcloud\FSU_Cloud\Big Data\Projekt\cleaned_data\cleaned_combined_salaries.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41498 entries, 0 to 41497
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rating             41498 non-null  float64
 1   company_name       41496 non-null  object 
 2   job_title          41498 non-null  object 
 3   salary             41498 non-null  int64  
 4   salaries_reported  41498 non-null  int64  
 5   location           41498 non-null  object 
 6   employment_status  20747 non-null  object 
 7   job_roles          20747 non-null  object 
 8   salary_currency    41498 non-null  object 
 9   salary_usd         41498 non-null  float64
dtypes: float64(2), int64(2), object(6)
memory usage: 3.2+ MB
None
   rating                      company_name job_title   salary  \
0     3.8                            Sasken   Android   400000   
1     4.5  Advanc