In [1]:
import pandas as pd
import re

# Load the raw job postings from the CSV export.
file_path = "./ai_ml_jobs_linkedin.csv"  # replace with the path to your CSV file

df = pd.read_csv(
    filepath_or_buffer=file_path,
    encoding="utf-8",
    low_memory=False,
)


In [2]:
# 1. Handle missing values (rebinding `df` instead of using inplace=True).
#    companyName / sector: missing entries become the literal "Unknown".
#    publishedAt: parsed to datetime with errors="coerce".
df = df.assign(
    **{column: df[column].fillna("Unknown") for column in ("companyName", "sector")},
    publishedAt=pd.to_datetime(df["publishedAt"], errors="coerce"),
)

In [3]:
# 2. Clean applicationsCount (extract the numeric value)
def clean_applications_count(value):
    """Extract the applicant count from a raw ``applicationsCount`` cell.

    Args:
        value: The raw cell value. Any object is accepted; non-float values
            are converted with ``str()`` and every non-digit character is
            dropped (so ``"Over 200 applicants"`` yields ``200``).

    Returns:
        The count as an ``int``, or ``None`` when the value is NaN/infinite
        or contains no digits at all.
    """
    if isinstance(value, float):
        # A float must not go through str(): "25.0" would lose its decimal
        # point in the digit filter below and come out as 250.
        if value != value or value in (float("inf"), float("-inf")):
            return None  # NaN / infinity carry no count
        return int(value)

    digits = re.sub(r'\D', '', str(value))  # keep digits only
    return int(digits) if digits else None

# Store the counts with pandas' nullable integer dtype: a column mixing ints
# and missing values would otherwise be kept as float64 (e.g. 25.0), which
# leaks ".0" into the exported CSV and gets mangled if this cell is re-run.
df["applicationsCount"] = (
    df["applicationsCount"].apply(clean_applications_count).astype("Int64")
)

In [4]:
# 3. Normalize contractType, experienceLevel and workType:
#    lower-case each value, then trim surrounding whitespace.
for column in ("contractType", "experienceLevel", "workType"):
    df[column] = df[column].str.lower().str.strip()

In [5]:
# 4. Clean description: collapse every run of whitespace (spaces, tabs,
#    newlines) into a single space, then trim both ends.
df["description"] = (
    df["description"]
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [6]:
# 5. Predefined list of AI skill keywords (transcribed from an image).
#    One entry per line; the order is the order in which matches are reported.
ai_skills = [
    "3D Reconstruction",
    "Alexa",
    "Algorithm Analysis",
    "Algorithm Development",
    "Artificial Intelligence (AI)",
    "Artificial Neural Networks",
    "Association Rules",
    "Audio Synthesis",
    "Autoencoders",
    "Automated Clustering",
    "Automated Feature Engineering",
    "Automated Reasoning",
    "Autoregressive Models",
    "Caffe",
    "Classification",
    "Cognitive Computing",
    "Common Lisp",
    "Computational Geometry",
    "Computational Intelligence",
    "Computational Linguistics",
    "Computer Vision",
    "Concept Drift Adaptation",
    "Conditional Image Generation",
    "Convolutional Neural Networks (CNN)",
    "Data Structures",
    "Deep Learning",
    "dSPACE",
    "Evolutionary Algorithms",
    "Expert Systems",
    "Facial Recognition",
    "Feature Extraction",
    "Feature Selection",
    "Fuzzy Logic",
    "Gaussian 03",
    "Generative Adversarial Imitation Learning",
    "Generative Design Optimization",
    "Generative Flow Models",
    "Generative Modeling",
    "Generative Neural Networks",
    "Generative Optimization",
    "Generative Pre-Training",
    "Generative Query Networks (GQNs)",
    "Generative Replay Memory",
    "Generative Synthesis",
    "Gesture Recognition",
    "Graph Embeddings",
    "Graph Networks",
    "Graph Theory",
    "Hyperparameter Optimization",
    "Hyperparameter Tuning",
    "IBM Watson",
    "Image Generation",
    "Image Inpainting",
    "Image Processing",
    "Image Synthesis",
    "Information Extraction",
    "Information Retrieval",
    "Intelligent Agents",
    "Jena",
    "Julia (Programming Language)",
    "Keras",
    "Knowledge Discovery",
    "Knowledge Representation and Reasoning",
    "Linked Data",
    "Lisp",
    "Machine Learning",
    "Meta-learning",
    "Microsoft Azure Machine Learning",
    "Model Compression",
    "Model Interpretation",
    "Model Training",
    "Music Generation",
    "Natural Language Generation",
    "Natural Language Processing (NLP)",
    "Natural Language Understanding",
    "Neural Network Architecture Design",
    "Neural Networks",
    "NLTK",
    "Object Recognition",
    "Ontologies",
    "OpenCV",
    "Pandas (Software)",
    "Parallel Algorithms",
    "Parsing",
    "Pattern Recognition",
    "Perl Automation",
    "Probabilistic Generative Models",
    "Probabilistic Programming",
    "Prompt Engineering",
    "PyTorch",
    "Question Answering",
    "RapidMiner",
    "Recommender Systems",
    "Reinforcement Learning",
    "Resource Description Framework (RDF)",
    "Scikit-Learn",
    "Semantic Technologies",
    "Semantic Web",
    "Sentiment Analysis",
    "Smalltalk",
    "Speech Recognition",
    "Statistical Inference",
    "Style Transfer",
    "Supervised Learning",
    "Support Vector Machine (SVM)",
    "Synthetic Data Generation",
    "TensorFlow",
    "Text Analytics",
    "Text Classification",
    "Text Generation",
    "Text Mining",
    "Text-to-Image Generation",
    "Theano",
    "Time Series Forecasting",
    "Unsupervised Learning",
    "Variational Autoencoders",
    "Variational Autoencoders (VAEs)",
    "Video Generation",
    "Web Mining",
    "Weka",
    "WordNet",
]

# Extract the AI skills mentioned in the description field
def extract_skills(description, skills=None):
    """Return the AI skills mentioned in a job description.

    Matching is case-insensitive and anchored on both sides so a skill is not
    found inside a longer word (``"Lisp"`` does not match ``"Lisping"``).

    Args:
        description: The job description text. Non-string values (for
            example a missing description stored as NaN) yield ``None``
            instead of raising ``TypeError`` inside ``re.search``.
        skills: Optional iterable of skill names to look for. Defaults to
            the module-level ``ai_skills`` list.

    Returns:
        The matched skill names joined with ``", "`` in the order they
        appear in ``skills``, or ``None`` when nothing matches.
    """
    if not isinstance(description, str):
        return None
    if skills is None:
        skills = ai_skills

    # Lookarounds instead of \b: a trailing \b after a skill that ends in ")"
    # (e.g. "Artificial Intelligence (AI)") only matches when a word character
    # follows, so such skills were never found in ordinary text.
    found_skills = [
        skill
        for skill in skills
        if re.search(rf'(?<!\w){re.escape(skill)}(?!\w)', description, re.IGNORECASE)
    ]
    return ", ".join(found_skills) if found_skills else None

# Run the skill extraction element-wise over every description.
df["skills"] = df["description"].map(extract_skills)

In [7]:
import os

# Resolve the output location relative to the current working directory.
# NOTE(review): this presumably expects the kernel to run from the notebook's
# own folder, two levels below the `Projekt` root - confirm before running
# from elsewhere.
current_dir = os.getcwd()
projekt_root = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir))
cleaned_data_dir = os.path.join(projekt_root, "cleaned_data")
cleaned_file_path = os.path.join(cleaned_data_dir, "cleaned_ai_ml_jobs.csv")

# Create the `cleaned_data` directory if it does not exist yet.
os.makedirs(cleaned_data_dir, exist_ok=True)

# Write the cleaned frame (without the index) and report where it went.
df.to_csv(cleaned_file_path, index=False, encoding="utf-8")
print(f"✅ 数据清理完成，文件已保存至 {cleaned_file_path}")


✅ 数据清理完成，文件已保存至 /mnt/g/Nextcloud/FSU_Cloud/Big Data/Projekt/cleaned_data/cleaned_ai_ml_jobs.csv
