In [1]:
import pandas as pd
import re
# Load the raw job-postings CSV into a DataFrame.
file_path = "./jobs_in_data_2024.csv"  # replace with the path to your CSV file

df = pd.read_csv(file_path, encoding="utf-8", low_memory=False)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the cell output.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14199 entries, 0 to 14198
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           14199 non-null  int64 
 1   experience_level    14199 non-null  object
 2   employment_type     14199 non-null  object
 3   job_title           14199 non-null  object
 4   salary              14199 non-null  int64 
 5   salary_currency     14199 non-null  object
 6   salary_in_usd       14199 non-null  int64 
 7   employee_residence  14199 non-null  object
 8   work_setting        14199 non-null  object
 9   company_location    14199 non-null  object
 10  company_size        14199 non-null  object
 11  job_category        14199 non-null  object
dtypes: int64(3), object(9)
memory usage: 1.3+ MB
None
   work_year experience_level employment_type               job_title  salary  \
0       2024      Entry-level       Freelance  Applied Data Scientist   30000   
1 

In [2]:
import pandas as pd

# Re-read the raw CSV so this cell always starts from the data on disk,
# regardless of what earlier cells did to `df`.
file_path = "./jobs_in_data_2024.csv"
df = pd.read_csv(file_path, encoding="utf-8", low_memory=False)

# 1. Normalise text formatting
df["experience_level"] = df["experience_level"].str.lower().str.strip()
df["employment_type"] = df["employment_type"].str.lower().str.strip()
df["work_setting"] = df["work_setting"].str.lower().str.strip()
df["job_title"] = df["job_title"].str.title().str.strip()
# Expand single-letter size codes; values that are not S/M/L are left as they are.
df["company_size"] = df["company_size"].replace({"S": "Small", "M": "Medium", "L": "Large"})

# 2. Make sure the numeric columns have the right dtype
# Values that cannot be parsed become <NA>; the nullable Int64 dtype keeps the
# columns integral even when such gaps exist.
df["work_year"] = pd.to_numeric(df["work_year"], errors="coerce").astype("Int64")
df["salary"] = pd.to_numeric(df["salary"], errors="coerce").astype("Int64")
df["salary_in_usd"] = pd.to_numeric(df["salary_in_usd"], errors="coerce").astype("Int64")

# 3. Handle missing values (written with `assign` to fix a `FutureWarning`)
# Median USD salary of each row's job category, aligned row by row with `df`.
# The median of integer salaries can be a half-integer (e.g. 100000.5), which
# cannot be stored losslessly in the Int64 salary column, so it is rounded and
# cast back to Int64 before being used as the fill value.
category_median_usd = (
    df.groupby("job_category")["salary_in_usd"]
    .transform("median")
    .round()
    .astype("Int64")
)
df = df.assign(
    experience_level=df["experience_level"].fillna("unknown"),
    employment_type=df["employment_type"].fillna("unknown"),
    job_category=df["job_category"].fillna("unknown"),
    salary_in_usd=df["salary_in_usd"].fillna(category_median_usd),
)

# 4. Remove salary outliers: keep rows between the 5th and 95th percentiles (inclusive)
# NOTE: despite their names, q1/q3 hold the 5th/95th percentiles, not quartiles;
# the names are kept unchanged in case later cells refer to them.
q1 = df["salary_in_usd"].quantile(0.05)
q3 = df["salary_in_usd"].quantile(0.95)
df = df[(df["salary_in_usd"] >= q1) & (df["salary_in_usd"] <= q3)]

# 5. Keep only rows with a plausible work year (2000-2024 inclusive)
df = df[(df["work_year"] >= 2000) & (df["work_year"] <= 2024)]

# 6. Rename the salary columns
# Reassign instead of using inplace=True so every step rebinds `df` the same way.
df = df.rename(columns={"salary": "SalaryLocal", "salary_in_usd": "SalaryUSD"})

# 7. Normalise country names
df["employee_residence"] = df["employee_residence"].str.title().str.strip()
df["company_location"] = df["company_location"].str.title().str.strip()

# The cleaned frame is written to disk in the next cell.




In [3]:
import os

# Working directory of the kernel; assumed to be the folder that contains this
# notebook (`clean_data.ipynb`) -- confirm if the kernel is started elsewhere.
current_dir = os.getcwd()

# `Projekt` root directory: two levels above the working directory
# (i.e. the parent directory of `raw_data`).
projekt_root = os.path.abspath(os.path.join(current_dir, "..", ".."))

# Target directory for cleaned data: `Projekt/cleaned_data`
cleaned_data_dir = os.path.join(projekt_root, "cleaned_data")

# Create the directory if it does not exist yet
os.makedirs(cleaned_data_dir, exist_ok=True)

# Full path of the output file
cleaned_file_path = os.path.join(cleaned_data_dir, "cleaned_jobs_in_data_2024.csv")

# Write the cleaned frame to `Projekt/cleaned_data` (without the index column)
df.to_csv(cleaned_file_path, index=False, encoding="utf-8")
print(f"✅ 数据清理完成，文件已保存至 {cleaned_file_path}")

# 8. Preview the cleaned data
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the cell output.
df.info()
print(df.head())


✅ 数据清理完成，文件已保存至 G:\Nextcloud\FSU_Cloud\Big Data\Projekt\cleaned_data\cleaned_jobs_in_data_2024.csv
<class 'pandas.core.frame.DataFrame'>
Index: 12780 entries, 1 to 14198
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           12780 non-null  Int64 
 1   experience_level    12780 non-null  object
 2   employment_type     12780 non-null  object
 3   job_title           12780 non-null  object
 4   SalaryLocal         12780 non-null  Int64 
 5   salary_currency     12780 non-null  object
 6   SalaryUSD           12780 non-null  Int64 
 7   employee_residence  12780 non-null  object
 8   work_setting        12780 non-null  object
 9   company_location    12780 non-null  object
 10  company_size        12780 non-null  object
 11  job_category        12780 non-null  object
dtypes: Int64(3), object(9)
memory usage: 1.3+ MB
None
   work_year experience_level employment_type              job_title  