In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [7]:
# 加载数据
raw_data = pd.read_excel("type15clear.xls").head(13)  # 取前13条已评分数据
scores = pd.read_excel("scored_13.xlsx")      # 专家评分数据

In [9]:
# 确保列名一致
raw_data_columns = raw_data.columns.tolist()
for col in raw_data_columns:
    if "(" in col and ")" in col:
        indicator = col.split("(")[-1].split(")")[0]
        raw_data.rename(columns={col: indicator}, inplace=True)

In [10]:
# 特征工程：处理日期列
raw_data["C1"] = pd.to_datetime(raw_data["C1"])
raw_data["Year"] = raw_data["C1"].dt.year
raw_data["Month"] = raw_data["C1"].dt.month
raw_data.drop("C1", axis=1, inplace=True)  # 删除原始的日期列

In [11]:
# 定义定性特征和定量特征列名
qual_cols = ["C2", "C3", "C4", "C6", "C7", "C8", "C9", "C11", 
             "C13", "C14", "C15", "C16", "C17"]

In [12]:
# 确保这些列在数据中存在
available_columns = raw_data.columns.tolist()
qual_cols = [col for col in qual_cols if col in available_columns]

In [13]:
# 准备数据
X = raw_data[qual_cols]
y = scores  # 假设scores包含多个输出变量，且列名与X一致
# 拆分数据为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 特征预处理：处理缺失值并对特征进行编码
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                ('encoder', TargetEncoder())]), qual_cols)
    ])

In [14]:
from sklearn.preprocessing import OneHotEncoder  # 用于处理定性特征
# 定义特征处理器
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), qual_cols)  # 对类别特征进行独热编码
    ])

# 定义模型管道
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', MultiOutputRegressor(Ridge(alpha=1.0)))  # 使用岭回归模型
])

# 训练模型
pipeline.fit(X_train, y_train)


In [22]:
# 使用新数据进行预测
new_data = pd.read_excel("type15clear.xls").iloc[13:]  # 获取新数据

# 处理新数据的列名
new_data_columns = new_data.columns.tolist()
for col in new_data_columns:
    if "(" in col and ")" in col:
        indicator = col.split("(")[-1].split(")")[0]
        new_data.rename(columns={col: indicator}, inplace=True)
# 处理新数据的日期列
new_data["C1"] = pd.to_datetime(new_data["C1"])
new_data["Year"] = new_data["C1"].dt.year
new_data["Month"] = new_data["C1"].dt.month
new_data.drop("C1", axis=1, inplace=True)  # 删除原始日期列
# 使用训练好的模型进行预测
predictions = pipeline.predict(new_data[qual_cols])
# 输出预测结果
print(predictions)

[[66.29080979 69.25407571 64.3770259  ... 72.92149795 67.83889529
  61.94728171]
 [71.29792574 74.10295558 69.07679116 ... 65.05527933 67.65694585
  62.68800914]
 [75.68466639 78.99110165 70.88340802 ... 73.6972618  75.76669872
  63.24467792]
 ...
 [67.5304405  67.01496671 65.82367135 ... 74.37330728 72.33482491
  64.34638955]
 [64.5748017  66.81727188 66.55388858 ... 70.44470934 74.6007667
  64.05324481]
 [71.92766212 71.59660027 66.74045248 ... 72.70224868 71.5540148
  63.43250361]]


In [23]:
import pandas as pd
# 假设predictions是一个二维NumPy数组
predictions_df = pd.DataFrame(predictions)
# 显示DataFrame
print(predictions_df)

            0          1          2          3          4          5   \
0    66.290810  69.254076  64.377026  76.472820  68.423341  73.047327   
1    71.297926  74.102956  69.076791  67.937795  69.631294  81.989006   
2    75.684666  78.991102  70.883408  70.845268  71.621075  78.110467   
3    71.291970  74.930462  68.980244  72.972148  73.909190  71.561318   
4    70.917764  72.068777  65.242607  77.897753  74.033765  68.186888   
..         ...        ...        ...        ...        ...        ...   
936  70.444025  64.845781  64.940744  76.889446  69.244695  62.657497   
937  67.984890  66.591996  65.319291  79.376605  71.439584  60.920244   
938  67.530441  67.014967  65.823671  75.518331  70.804848  63.639822   
939  64.574802  66.817272  66.553889  70.842875  70.429315  65.545508   
940  71.927662  71.596600  66.740452  76.395871  70.514351  74.275628   

            6          7          8          9          10         11  \
0    60.292940  63.976700  75.886969  61.326980  6

In [24]:
import pandas as pd
# 假设 predictions 是你模型的预测结果（一个二维 NumPy 数组）
predictions_df = pd.DataFrame(predictions)
# 将预测结果保存为 Excel 文件
predictions_df.to_excel("predictions.xlsx", index=False, sheet_name="Predictions")