# 主要作用：对数据集进行预处理，保证后续代码可以直接使用
> 其实这里并不涉及太多的特征处理

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Load the raw IGV KPI dataset from disk.
raw_csv_path = '/home/yanglin/Federated-Time-Series-Forecasting/dataset/A032_IGVData_KPI.csv'
igv_data = pd.read_csv(raw_csv_path)
# Count the missing entries per column (the recorded run shows none).
missing_values = igv_data.isna().sum()
print(missing_values)



vehicle_id            0
position_x            0
position_y            0
position_z            0
heading               0
speed                 0
speed_command         0
mileage_distance      0
power_on_time         0
soc                   0
fuel_level            0
local_time            0
chassis_mode          0
estop                 0
charge_status         0
task_state_running    0
task_state_estop      0
task_state_lock       0
task_stage            0
current_task          0
error_code            0
target_location       0
vesselVisitID         0
mission_type          0
 container1_type      0
container2_type       0
vehicle_mode          0
missionID             0
dtype: int64


In [3]:
# Show the dtype pandas inferred for every column.
column_types = igv_data.dtypes
print(column_types)

vehicle_id             object
position_x            float64
position_y            float64
position_z            float64
heading               float64
speed                 float64
speed_command         float64
mileage_distance      float64
power_on_time         float64
soc                     int64
fuel_level              int64
local_time             object
chassis_mode            int64
estop                  object
charge_status           int64
task_state_running     object
task_state_estop       object
task_state_lock        object
task_stage             object
current_task           object
error_code             object
target_location        object
vesselVisitID          object
mission_type           object
 container1_type        int64
container2_type         int64
vehicle_mode            int64
missionID              object
dtype: object


In [4]:
# Split the columns by dtype: object columns are treated as categorical,
# numeric dtypes as numeric.
object_frame = igv_data.select_dtypes(include=['object'])
categorical_cols = object_frame.columns
numeric_cols = igv_data.select_dtypes(include=['number']).columns
# Number of distinct values in each categorical column.
cardinality = object_frame.nunique()
print(categorical_cols)
print("numeric_cols:\n", numeric_cols)
print("cardinality:\n", cardinality)

Index(['vehicle_id', 'local_time', 'estop', 'task_state_running',
       'task_state_estop', 'task_state_lock', 'task_stage', 'current_task',
       'error_code', 'target_location', 'vesselVisitID', 'mission_type',
       'missionID '],
      dtype='object')
numeric_cols:
 Index(['position_x', 'position_y', 'position_z', 'heading', 'speed',
       'speed_command', 'mileage_distance', 'power_on_time', 'soc',
       'fuel_level', 'chassis_mode', 'charge_status', ' container1_type',
       'container2_type', 'vehicle_mode'],
      dtype='object')
cardinality:
 vehicle_id                1
local_time            10856
estop                     2
task_state_running        2
task_state_estop          2
task_state_lock           2
task_stage                5
current_task              6
error_code               32
target_location          34
vesselVisitID             3
mission_type              5
missionID                56
dtype: int64


## 独热编码（暂时不用，后续可以使用）
因为 local_time 是时序特征，我们不对它做处理；其余的分类数据做独热编码


In [5]:

# Disabled for now: one-hot encoding of a hand-picked subset of the categorical
# columns, kept here for possible later use.
# NOTE(review): `OneHotEncoder(sparse=...)` was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name removed in 1.4 — confirm against the
# installed version before re-enabling this cell.
# selected_categorical_cols = ['task_stage', 'current_task', 'error_code', 'vesselVisitID', 'mission_type', 'target_location']
# # One-hot encode the categorical variables
# encoder = OneHotEncoder(sparse=False)
# encoded_categorical_data = encoder.fit_transform(igv_data[selected_categorical_cols])
# encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoder.get_feature_names_out(selected_categorical_cols))
# # Concatenate the encoded categorical variables with the original numeric variables
# processed_data = pd.concat([igv_data[numeric_cols].reset_index(drop=True), encoded_categorical_df], axis=1)
# # Print the first five rows of the combined data
# print(processed_data.head())


## 标签编码
>   其实可以采用标签编码与独热编码相结合的方式，后续做数据处理的时候可以考虑这种方式


In [6]:
# LabelEncoder's public home is sklearn.preprocessing; sklearn.calibration only
# exposes it as a side effect of its own internal import.
from sklearn.preprocessing import LabelEncoder

# TODO: the interpretation of the charge-status codes may be inaccurate.
# Collapse charge_status into a binary label: codes 1, 2, 3 and 5 become 1,
# every other value becomes 0.
igv_data['charge_status'] = igv_data['charge_status'].isin([1, 2, 3, 5]).astype('int64')

# Keep one fitted encoder per column so each column's integer codes can later
# be mapped back to the original values with inverse_transform. Re-fitting a
# single shared encoder would retain only the last column's mapping.
label_encoders = {}

# NOTE(review): every column in categorical_cols is encoded, which per the
# recorded output includes local_time and vehicle_id — confirm that replacing
# the timestamp strings with integer codes is intended.
for col in categorical_cols:
    label_encoder = LabelEncoder()
    igv_data[col] = label_encoder.fit_transform(igv_data[col])
    label_encoders[col] = label_encoder

## 导出文件

In [7]:
# Write the processed frame to a new CSV file, leaving out the index column.
processed_csv_path = '/home/yanglin/Federated-Time-Series-Forecasting/dataset/processed_A032_IGVData_KPI.csv'
igv_data.to_csv(processed_csv_path, index=False)

print("数据处理完成并已保存为 processed_A032_IGVData_KPI.csv")

数据处理完成并已保存为 processed_A032_IGVData_KPI.csv
