In [1]:
import pandas as pd
import re
# Load the raw salary data from a CSV file.
file_path = "./data_science_salaries.csv"  # replace with the path to your CSV file

df = pd.read_csv(file_path, encoding="utf-8", low_memory=False)

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6599 entries, 0 to 6598
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   job_title           6599 non-null   object
 1   experience_level    6599 non-null   object
 2   employment_type     6599 non-null   object
 3   work_models         6599 non-null   object
 4   work_year           6599 non-null   int64 
 5   employee_residence  6599 non-null   object
 6   salary              6599 non-null   int64 
 7   salary_currency     6599 non-null   object
 8   salary_in_usd       6599 non-null   int64 
 9   company_location    6599 non-null   object
 10  company_size        6599 non-null   object
dtypes: int64(3), object(8)
memory usage: 567.2+ KB
None
        job_title experience_level employment_type work_models  work_year  \
0   Data Engineer        Mid-level       Full-time      Remote       2024   
1   Data Engineer        Mid-level       Full-time      Re

In [2]:
# 1️⃣ Handle missing values (if any are present).
# Any row (axis=0) containing at least one missing value (how="any") is removed;
# dropping is the simplest option when little data is affected.
df = df.dropna(axis=0, how="any")

In [3]:
# 2️⃣–4️⃣ Normalise experience level, employment type and work model:
# lower-case every value, then trim surrounding whitespace.
for text_col in ("experience_level", "employment_type", "work_models"):
    df[text_col] = df[text_col].str.lower().str.strip()

In [4]:
# 5️⃣–6️⃣ Make sure `work_year`, `salary` and `salary_in_usd` are numeric.
numeric_columns = ["work_year", "salary", "salary_in_usd"]
for numeric_col in numeric_columns:
    # errors="coerce" turns unparseable entries into NaN instead of raising.
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Coercion can introduce NaN after the notebook's only dropna() has already
# run, and an integer cast cannot represent NaN. Drop the rows whose numeric
# fields failed to parse so the later cast to int64 cannot fail on them.
df = df.dropna(subset=numeric_columns)


In [5]:
# 7️⃣ Normalise `company_size`: lower-case first, then trim surrounding whitespace.
company_size_lower = df["company_size"].str.lower()
df["company_size"] = company_size_lower.str.strip()

In [6]:
# 8️⃣ Normalise `company_location` and `employee_residence`:
# title-case every value (first letter of each word upper-cased), then trim
# surrounding whitespace.
for location_col in ("company_location", "employee_residence"):
    df[location_col] = df[location_col].str.title().str.strip()


In [7]:
# 9️⃣ Enforce the final dtypes: the normalised text columns become
# categoricals, the year and salary columns become 64-bit integers.
category_columns = ["experience_level", "employment_type", "work_models", "company_size"]
integer_columns = ["work_year", "salary", "salary_in_usd"]

target_dtypes = {name: "category" for name in category_columns}
target_dtypes.update({name: "int64" for name in integer_columns})

df = df.astype(target_dtypes)

In [8]:
import os

# Kernel working directory; assumed to be the folder that holds this
# notebook (`clean_data.ipynb`).
current_dir = os.getcwd()

# Walk two levels up from the working directory to reach the `Projekt` root
# (i.e. the parent directory of `raw_data`).
projekt_root = os.path.abspath(os.path.join(current_dir, "..", ".."))

# Output location: <Projekt root>/cleaned_data/cleaned_data_science_salaries.csv
cleaned_data_dir = os.path.join(projekt_root, "cleaned_data")
cleaned_file_path = os.path.join(cleaned_data_dir, "cleaned_data_science_salaries.csv")

# Create the output folder if needed; an existing folder is not an error.
os.makedirs(cleaned_data_dir, exist_ok=True)

# Write the cleaned frame without the index column and report where it went.
df.to_csv(path_or_buf=cleaned_file_path, index=False, encoding="utf-8")
print(f"✅ 数据清理完成，文件已保存至 {cleaned_file_path}")

✅ 数据清理完成，文件已保存至 G:\Nextcloud\FSU_Cloud\Big Data\Projekt\cleaned_data\cleaned_data_science_salaries.csv


In [9]:
# # 🔟 Save the cleaned data
# cleaned_file_path = "./cleaned_data_science_salaries.csv"
# df.to_csv(cleaned_file_path, index=False, encoding="utf-8")

# print(f"✅ 数据清理完成，文件已保存至 {cleaned_file_path}")

# # 🔍 Preview the cleaned data
# print(df.info())
# print(df.head())