In [1]:
import pandas as pd

# Load the raw data.
file_path = "./robotics_data.csv"  # replace with the path to your CSV file
df = pd.read_csv(file_path, encoding="utf-8", low_memory=False)

# 1. Standardise the column names (no `_`, PascalCase).
# The rename is positional, so the column count is checked first to fail with
# a readable message instead of pandas' generic "Length mismatch" error.
# NOTE(review): this assumes the CSV columns appear in exactly this order —
# confirm against the raw file.
expected_columns = [
    "Year",
    "Industry",
    "RobotsAdopted",
    "ProductivityGain",
    "CostSavings",
    "JobsDisplaced",
    "TrainingHours",
]
if len(df.columns) != len(expected_columns):
    raise ValueError(
        f"Expected {len(expected_columns)} columns in {file_path}, "
        f"found {len(df.columns)}: {list(df.columns)}"
    )
df.columns = expected_columns

# 2. Normalise the Industry labels (title case, no surrounding whitespace).
df["Industry"] = df["Industry"].str.title().str.strip()

# 3. Integer-valued fields: unparseable entries become <NA> (errors="coerce"),
# and the nullable "Int64" dtype keeps the column integer despite missing values.
for col in ("Year", "RobotsAdopted", "JobsDisplaced", "TrainingHours"):
    df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

# 4. ProductivityGain & CostSavings: coerce to numeric first, exactly like the
# integer fields, so a stray non-numeric string turns into NaN instead of
# leaving the column as object dtype; then keep 2 decimal places.
for col in ("ProductivityGain", "CostSavings"):
    df[col] = pd.to_numeric(df[col], errors="coerce").round(2)

# 5. Handle missing values: drop every row that has any missing field
# (including values coerced to missing above) and rebuild a contiguous index.
df = df.dropna().reset_index(drop=True)
# Alternative: fill with the per-industry mean instead of dropping, e.g.
# df["ProductivityGain"] = df["ProductivityGain"].fillna(
#     df.groupby("Industry")["ProductivityGain"].transform("mean")
# )

# Saving and previewing the cleaned frame is done in the next cell.


In [2]:
import os

# Working directory of the running kernel.
# NOTE(review): assumed to be the folder containing this notebook
# (`clean_data.ipynb`) — confirm, since os.getcwd() depends on how the kernel
# was started.
current_dir = os.getcwd()

# `Projekt` root directory: two levels above the working directory.
projekt_root = os.path.abspath(os.path.join(current_dir, "..", ".."))

# Path of the `cleaned_data` directory under the project root.
cleaned_data_dir = os.path.join(projekt_root, "cleaned_data")

# Make sure the `cleaned_data` directory exists.
os.makedirs(cleaned_data_dir, exist_ok=True)

# Build the output file path.
cleaned_file_path = os.path.join(cleaned_data_dir, "cleaned_robotics_data.csv")

# Save the cleaned frame to `Projekt/cleaned_data` (index column not written).
df.to_csv(cleaned_file_path, index=False, encoding="utf-8")
print(f"✅ 数据清理完成，文件已保存至 {cleaned_file_path}")

# 8. Preview the cleaned data.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.head())


✅ 数据清理完成，文件已保存至 /mnt/g/Nextcloud/FSU_Cloud/Big Data/Projekt/cleaned_data/cleaned_robotics_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              27 non-null     Int64  
 1   Industry          27 non-null     object 
 2   RobotsAdopted     27 non-null     Int64  
 3   ProductivityGain  27 non-null     float64
 4   CostSavings       27 non-null     float64
 5   JobsDisplaced     27 non-null     Int64  
 6   TrainingHours     27 non-null     Int64  
dtypes: Int64(4), float64(2), object(1)
memory usage: 1.7+ KB
None
   Year       Industry  RobotsAdopted  ProductivityGain  CostSavings  \
0  2015  Manufacturing            107              7.86       170.67   
1  2015     Healthcare            484             24.77       120.19   
2  2015      Logistics            263             20.74       152.53   
3  2016  Manufacturin