In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import lightgbm as lgb

# --------------------------
# 缺失值填充函数
# --------------------------
def bodyType_fill(train, test):
    train_value = train['bodyType'].mode()[0]
    train['bodyType'] = train['bodyType'].fillna(train_value)
    test['bodyType'] = test['bodyType'].fillna(train_value)

def fuelType_fill(train, test):
    train_value = train['fuelType'].mode()[0]
    train['fuelType'] = train['fuelType'].fillna(train_value)
    test['fuelType'] = test['fuelType'].fillna(train_value)

def gearbox_fill(train, test):
    train_value = train['gearbox'].mode()[0]
    train['gearbox'] = train['gearbox'].fillna(train_value)
    test['gearbox'] = test['gearbox'].fillna(train_value)

def power_fill(train, test):
    train_value = train['power'].median()
    train['power'] = train['power'].fillna(train_value)
    test['power'] = test['power'].fillna(train_value)

def kilometer_fill(train, test):
    train_value = train['kilometer'].mean()
    train['kilometer'] = train['kilometer'].fillna(train_value)
    test['kilometer'] = test['kilometer'].fillna(train_value)

def notRepairedDamage_fill(train, test):
    for df in [train, test]:
        df['notRepairedDamage'] = df['notRepairedDamage'].replace('-', np.nan)
        df['notRepairedDamage'] = df['notRepairedDamage'].astype('category').cat.codes
    mode_value = train['notRepairedDamage'].mode()[0]
    train['notRepairedDamage'] = train['notRepairedDamage'].fillna(mode_value)
    test['notRepairedDamage'] = test['notRepairedDamage'].fillna(mode_value)

# --------------------------
# 日期处理函数
# --------------------------
def process_dates(df, date_col, mode_date=None):
    date_str = df[date_col].astype(str).str.strip()
    valid_mask = (
        (date_str.str.len() == 8) &
        (date_str.str.isdigit()) &
        (date_str.str[4:6].between('01', '12')) &
        (date_str.str[6:8].between('01', '31'))
    )
    dates = pd.to_datetime(
        date_str.where(valid_mask, np.nan),
        format='%Y%m%d',
        errors='coerce'
    )
    if mode_date is not None:
        return dates.fillna(mode_date)
    else:
        mode_date = dates.mode()[0] if not dates.mode().empty else pd.NaT
        return dates.fillna(mode_date), mode_date

# --------------------------
# 主流程函数
# --------------------------
def load_data():
    train = pd.read_csv('used_car_train_20200313.csv', sep=' ')
    test = pd.read_csv('used_car_testB_20200421.csv', sep=' ')
    return train, test

def data_processing(train, test):
    # 缺失值处理
    notRepairedDamage_fill(train, test)
    bodyType_fill(train, test)
    fuelType_fill(train, test)
    gearbox_fill(train, test)
    power_fill(train, test)
    kilometer_fill(train, test)

    # 日期处理
    for col in ['creatDate', 'regDate']:
        train[col], mode_date = process_dates(train, col)
        test[col] = process_dates(test, col, mode_date)

    # 高频类别编码
    for col in ['brand', 'regionCode']:
        freq_map = train[col].value_counts(normalize=True)
        train[f'{col}_freq'] = train[col].map(freq_map)
        test[f'{col}_freq'] = test[col].map(freq_map)

    # 匿名特征标准化
    v_cols = [f'v_{i}' for i in range(15)]
    scaler = StandardScaler()
    train[v_cols] = scaler.fit_transform(train[v_cols])
    test[v_cols] = scaler.transform(test[v_cols])

    # 分箱处理
    kilometer_bins = pd.cut(train['kilometer'], bins=10, retbins=True)[1]
    train['kilometer_bin'] = pd.cut(train['kilometer'], bins=kilometer_bins, labels=False)
    test['kilometer_bin'] = pd.cut(test['kilometer'], bins=kilometer_bins, labels=False)

    return train, test

def feature_engineering(df):
    """核心修正点：确保在删除日期列前完成特征衍生"""
    if 'creatDate' in df.columns and 'regDate' in df.columns:
        df['car_age'] = (df['creatDate'].dt.year - df['regDate'].dt.year) + \
                        (df['creatDate'].dt.month - df['regDate'].dt.month)/12

    df['power'] = np.clip(df['power'], 0, 600)

    drop_cols = ['SaleID', 'name', 'regDate', 'creatDate',
                 'model', 'seller', 'offerType']
    return df.drop(columns=drop_cols, errors='ignore')

def lgb_model(train, test):
    y_train = np.log1p(train['price'])
    X_train = feature_engineering(train.drop(columns='price'))
    X_test = feature_engineering(test)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mae',
        'learning_rate': 0.01,
        'num_leaves': 31,
        'feature_fraction': 0.8,
        'bagging_freq': 5,
        'verbosity': -1
    }

    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    test_preds = np.zeros(X_test.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print(f"Fold {fold_+1}")
        trn_data = lgb.Dataset(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
        val_data = lgb.Dataset(X_train.iloc[val_idx], y_train.iloc[val_idx])

        model = lgb.train(
            params,
            trn_data,
            num_boost_round=10000,
            valid_sets=[trn_data, val_data],
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(500)]
        )
        test_preds += model.predict(X_test) / folds.n_splits

    return np.expm1(test_preds)

if __name__ == '__main__':
    train, test = load_data()
    train, test = data_processing(train, test)
    predictions = lgb_model(train, test)
    submission = pd.DataFrame({'SaleID': test['SaleID'], 'price': predictions})
    submission.to_csv('submission.csv', index=False)

Fold 1
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.156474	valid_1's l1: 0.160842
[1000]	training's l1: 0.14106	valid_1's l1: 0.147847
[1500]	training's l1: 0.134384	valid_1's l1: 0.143192
[2000]	training's l1: 0.129577	valid_1's l1: 0.140266
[2500]	training's l1: 0.125639	valid_1's l1: 0.138002
[3000]	training's l1: 0.122298	valid_1's l1: 0.136342
[3500]	training's l1: 0.119374	valid_1's l1: 0.135041
[4000]	training's l1: 0.116793	valid_1's l1: 0.133969
[4500]	training's l1: 0.114517	valid_1's l1: 0.133137
[5000]	training's l1: 0.112351	valid_1's l1: 0.132397
[5500]	training's l1: 0.11037	valid_1's l1: 0.131747
[6000]	training's l1: 0.1085	valid_1's l1: 0.131196
[6500]	training's l1: 0.106758	valid_1's l1: 0.130676
[7000]	training's l1: 0.105079	valid_1's l1: 0.130191
[7500]	training's l1: 0.103435	valid_1's l1: 0.12973
[8000]	training's l1: 0.101838	valid_1's l1: 0.129221
[8500]	training's l1: 0.100303	valid_1's l1: 0.128689
[9000]	training's l

In [8]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# 转换为LGBMRegressor接口
model = lgb.LGBMRegressor(
    boosting_type='gbdt',
    objective='regression',
    metric='mae',
    learning_rate=0.01,
    verbosity=-1
)

# 定义参数网格
param_grid = {
    'num_leaves': [31, 63],
    'max_depth': [5, 7],
    'min_data_in_leaf': [50, 100],
    'feature_fraction': [0.7, 0.8],
    'bagging_fraction': [0.8, 0.9],
    'bagging_freq': [5],
    'lambda_l1': [0, 0.1],
    'lambda_l2': [0, 0.1]
}

# 网格搜索
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2
)

train, test = load_data()
train, test = data_processing(train, test)

y_train = np.log1p(train['price'])
X_train = feature_engineering(train.drop(columns='price'))
grid_search.fit(X_train, y_train)
# 输出最优参数
print("Best parameters:", grid_search.best_params_)
print("Best CV score:", -grid_search.best_score_)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters: {'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.8, 'lambda_l1': 0.1, 'lambda_l2': 0, 'max_depth': 7, 'min_data_in_leaf': 50, 'num_leaves': 63}
Best CV score: 0.421072461202668


In [11]:
X_test = feature_engineering(test)
# 使用最优参数重新训练
best_params = grid_search.best_params_
best_params.update({
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',
    'verbosity': -1
})

# KFold训练与预测
folds = KFold(n_splits=5, shuffle=True, random_state=42)
test_preds = np.zeros(X_test.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    trn_data = lgb.Dataset(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
    val_data = lgb.Dataset(X_train.iloc[val_idx], y_train.iloc[val_idx])

    model = lgb.train(
        best_params,
        trn_data,
        valid_sets=[trn_data, val_data],
        num_boost_round=10000,
        callbacks=[lgb.early_stopping(100)]
    )

    test_preds += model.predict(X_test) / folds.n_splits

submission['price'] = np.expm1(test_preds)
submission.to_csv('tuned_submission.csv', index=False)

Early stopping, best iteration is:
[1316]	training's l1: 0.0838057	valid_1's l1: 0.12848
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1803]	training's l1: 0.0742988	valid_1's l1: 0.127087
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2227]	training's l1: 0.066963	valid_1's l1: 0.127277
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2330]	training's l1: 0.0655437	valid_1's l1: 0.125738
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1972]	training's l1: 0.0715485	valid_1's l1: 0.124573
