In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# 必要なライブラリのインポート
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [36]:
train=pd.read_csv('../Input/train.csv')
test=pd.read_csv('../Input/test.csv')
train['price'] = np.log(train['price'])

In [37]:
size_corrections = {
    "fullーsize": "full-size",
    "full−size": "full-size",
    "midーsize": "mid-size",
    "mid−size": "mid-size",
    "subーcompact": "sub-compact"
}

columns_to_encode = ['condition', 'title_status', 'transmission', 'drive', 'size']

train.drop(train[(train['condition'] == 'new') & (train['odometer'] != 0)].index, inplace=True)
train = train[~((train['year'] < 2000) & (train['fuel'] == "electric"))]

In [38]:
def preprocess(df):
    # Correct the year values
    df.loc[df['year'] >= 2999, 'year'] -= 1000
    
    # Make odometer values absolute
    df['odometer'] = df['odometer'].abs()
    
    # Extract number of cylinders and handle missing values
    df['cylinders'] = df['cylinders'].str.extract('(\d+)').astype(float)
    df['cylinders'].fillna(6.0, inplace=True)
    
    # Compute odometer per year
    df['odometer_per_year'] = df['odometer'] / ((df['year'].max() + 1) - df['year'])
    
    # Compute age of the car
    df['car_age'] = (df['year'].max() + 1) - df['year']
    
    # One-hot encode paint_color
    df = pd.get_dummies(df, columns=['paint_color'], prefix=['color'])
    
    # Fill missing values in fuel and label encode
    df['fuel'].fillna('gas', inplace=True)
    le = LabelEncoder()
    df['fuel'] = le.fit_transform(df['fuel'])
    
    # Fill missing values in 'state' column based on the 'region' column using the mapping
    region_to_state_mapping = df[df['state'].notnull()].groupby('region')['state'].first().to_dict()
    df['state'].fillna(df['region'].map(region_to_state_mapping), inplace=True)
    
    regions_with_missing_states = df[df['state'].isnull()]['region'].unique()
    regions_info_in_test = test[test['region'].isin(regions_with_missing_states)][['region', 'state']].drop_duplicates()
    # Create a mapping from the new dataset to fill missing states in the original dataset
    region_to_state_fill = regions_info_in_test.set_index('region').to_dict()['state']

    # Fill missing states in the original dataset using the mapping
    df['state'].fillna(df['region'].map(region_to_state_fill), inplace=True)
    df = pd.get_dummies(df, columns=['state'], prefix=['state'])
    
    manufacturer_counts = df['manufacturer'].value_counts().to_dict()
    df['manufacturer'] = df['manufacturer'].map(manufacturer_counts)
    df['type'].fillna(df['type'].mode()[0], inplace=True)
    df['title_status'].fillna(df['title_status'].mode()[0], inplace=True)
    df['size'].replace(size_corrections, inplace=True)
    for col in columns_to_encode:
        le = LabelEncoder()
        df[col + '_encoded'] = le.fit_transform(df[col])
        del df[col]
    
    df = pd.get_dummies(df, columns=['type'], prefix=['type'])
    del df['region']
    return df

In [39]:
train=preprocess(train)
test=preprocess(test)

In [40]:
X = train.drop('price', axis=1)  # 'target_column_name'はターゲット変数のカラム名に置き換えてください
y = train['price']

In [41]:
# ハイパーパラメーターチューニングのための関数定義
def objective(params):
    params = {
        'depth': int(params['depth']),
        'learning_rate': params['learning_rate'],
        'l2_leaf_reg': params['l2_leaf_reg'],
    }
    
    cat = CatBoostRegressor(iterations=500, **params)
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for train_idx, val_idx in kf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        cat.fit(X_train, y_train, verbose=0)
        preds = cat.predict(X_val)
        score = mean_squared_error(y_val, preds)
        scores.append(score)
    
    return {'loss': np.mean(scores), 'status': STATUS_OK}

In [42]:
# パラメータ空間の定義
space = {
    'depth': hp.quniform('depth', 6, 10, 1),
    'learning_rate' : hp.uniform('learning_rate', 0.01, 0.3),
    'random_strength': hp.quniform('random_strength', 0, 100, 1),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.01, 100.00),
    'od_type': hp.choice('od_type', ['IncToDec', 'Iter']),
    'od_wait': hp.quniform('od_wait', 10, 50, 1),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 10),
}

# ハイパーパラメーターチューニングの実行
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

100%|██████████| 100/100 [31:40<00:00, 19.01s/trial, best loss: 0.3468265623871432]


In [43]:
cat = CatBoostRegressor(iterations=1000, **best)
cat.fit(X, y)

# テストデータでの予測
preds_test = cat.predict(test)

y_pred_original = np.exp(preds_test)

submission = pd.DataFrame({
    'id': test['id'],  # 'id'はカラム名として指定します
    'prediction': y_pred_original  # 'prediction' は適切なカラム名に変更してください
})

submission.to_csv('submission_cat3.csv', index=False,header=False)



CatBoostError: C:/Go_Agent/pipelines/BuildMaster/catboost.git/catboost/private/libs/options/json_helper.h:173: Can't parse parameter "type" with value: 0