In [378]:
import gc
import os
import random

import lightgbm as lgb
import numpy as np
import pandas as pd
import seaborn as sns
import itertools
import unicodedata
import optuna
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
import numpy as np
import pandas as pd
import torch

from sklearn import datasets
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# TabPFNに必要なのは以下の一行
from tabpfn.scripts.transformer_prediction_interface import TabPFNClassifier
# Note: the shared `label_encoder` instance is created above, right after the LabelEncoder import.


In [379]:
# Read the train and test CSV files (in that order) from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Input/train.csv', '../Input/test.csv')
)


In [380]:
def clean_manufacturer(manufacturer):
    """Normalize a manufacturer name so spelling variants compare equal.

    The text is NFKC-normalized (full-width characters become half-width,
    compatibility symbols are expanded), lowercased, every non-alphanumeric
    character is replaced by a single space, and leading/trailing spaces are
    stripped.

    Args:
        manufacturer: The raw manufacturer name. Non-string values (for
            example ``None`` or a float NaN marking a missing entry) are
            returned unchanged instead of raising.

    Returns:
        The cleaned name as ``str``, or the input itself when it is not a string.
    """
    if not isinstance(manufacturer, str):
        # Missing values have no text to clean; leave them for the caller to handle.
        return manufacturer
    # Normalize BEFORE lowercasing: some characters only turn into cased
    # letters after NFKC (e.g. the trademark sign expands to "TM"), so the
    # opposite order would leave uppercase letters in the result.
    normalized = unicodedata.normalize('NFKC', manufacturer).lower()
    # Replace any non-alphanumeric character with a space.
    spaced = ''.join(char if char.isalnum() else ' ' for char in normalized)
    return spaced.strip()

In [381]:
def preprocess(df, reference_year=None):
    """Build the model feature table from a raw listings frame.

    Steps, in order:
      1. Shift ``year`` values of 2999 or later back by 1000.
      2. Extract the first run of digits from ``cylinders`` as a float.
      3. Add ``odometer_per_year``, ``manufacturer_cleaned`` and ``car_age``.
      4. One-hot encode ``paint_color``.
      5. Fill missing ``fuel`` with ``'gas'`` and every other missing value
         with the string ``'nan'`` (so a missing ``cylinders`` value becomes
         ``'nan'`` and that column ends up with object dtype).
      6. Label-encode the categorical columns.
      7. Drop ``manufacturer`` and ``type``.

    Args:
        df: Raw frame containing the columns referenced above. It is not
            modified; a copy is processed.
        reference_year: Year that ages are measured against. Defaults to
            ``df['year'].max() + 1`` (after the year correction), which makes
            every age at least 1. Pass the same value for every frame if the
            ages must be comparable across frames; a value equal to some
            row's year produces a zero age and therefore a division by zero
            in ``odometer_per_year``.

    Returns:
        A new DataFrame with the engineered and encoded features.

    Notes:
        The label encoder and the one-hot encoding are refit on every call,
        so integer codes and dummy columns are only consistent between two
        frames when both contain the same set of categories.
    """
    # Work on a copy so the caller's frame stays intact and the function can
    # be called again on the same raw data.
    df = df.copy()

    # Years of 2999 or later are shifted back by 1000.
    df.loc[df['year'] >= 2999, 'year'] -= 1000

    # Convert the cylinders text to a number (raw string avoids the invalid
    # "\d" escape warning); values without digits become NaN.
    df['cylinders'] = df['cylinders'].str.extract(r'(\d+)', expand=False).astype(float)

    if reference_year is None:
        reference_year = df['year'].max() + 1
    car_age = reference_year - df['year']

    # Distance driven per year of age.
    df['odometer_per_year'] = df['odometer'] / car_age
    # Cleansed manufacturer name.
    df['manufacturer_cleaned'] = df['manufacturer'].apply(clean_manufacturer)
    # Age of the car in years.
    df['car_age'] = car_age

    # One-hot encode paint_color.
    df = pd.get_dummies(df, columns=['paint_color'])

    # Missing fuel is treated as gas; all remaining gaps get the 'nan' marker.
    df['fuel'] = df['fuel'].fillna('gas')
    df = df.fillna('nan')

    # Label-encode each categorical column exactly once. 'type' is dropped
    # below, so it is not encoded.
    categorical_columns = [
        'region', 'manufacturer_cleaned', 'condition', 'fuel', 'title_status',
        'transmission', 'drive', 'size', 'state',
    ]
    for column in categorical_columns:
        df[column] = label_encoder.fit_transform(df[column].astype(str))

    # Remove columns that are not used as features.
    return df.drop(columns=['manufacturer', 'type'])

In [382]:
# Run the feature-engineering step on the training table.
train = preprocess(train)

# Separate the target column (price) from the feature columns.
y = train['price']
x = train.drop(columns='price')

# Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=3655,
)

In [383]:


# Print the container type, then the dtype of every column after preprocessing.
for report in (type(train), train.dtypes):
    print(report)

<class 'pandas.core.frame.DataFrame'>
id                        int64
region                    int32
year                      int64
condition                 int32
cylinders                object
fuel                      int32
odometer                  int64
title_status              int32
transmission              int32
drive                     int32
size                      int32
state                     int32
price                     int64
odometer_per_year       float64
manufacturer_cleaned      int32
car_age                   int64
paint_color_black         uint8
paint_color_blue          uint8
paint_color_brown         uint8
paint_color_custom        uint8
paint_color_green         uint8
paint_color_grey          uint8
paint_color_orange        uint8
paint_color_purple        uint8
paint_color_red           uint8
paint_color_silver        uint8
paint_color_white         uint8
paint_color_yellow        uint8
dtype: object


In [384]:
def mape(y_true, y_pred):
    """Calculate the Mean Absolute Percentage Error, in percent.

    Inputs are converted to float NumPy arrays and compared position by
    position, so lists are accepted and pandas objects are never aligned by
    index label (label alignment would silently introduce NaNs when the two
    inputs carry different indexes).

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true| * 100``. A zero in
        ``y_true`` follows NumPy division semantics (``inf`` or ``nan``).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.shape != predicted.shape:
        # Refuse to broadcast: mismatched shapes would give a meaningless score.
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {actual.shape} and {predicted.shape}"
        )
    percentage_errors = np.abs((actual - predicted) / actual) * 100
    return np.mean(percentage_errors)

In [385]:
# Impute missing cylinder counts with 2.0 (the rows are kept, not dropped).
# preprocess() marks missing values with the string 'nan'; pd.to_numeric with
# errors='coerce' turns that marker into a real NaN, which is then filled, and
# the column comes out as float. assign() builds new frames instead of writing
# into the objects returned by train_test_split, which avoids pandas'
# chained-assignment warning.
X_train = X_train.assign(
    cylinders=pd.to_numeric(X_train['cylinders'], errors='coerce').fillna(2.0)
)
X_test = X_test.assign(
    cylinders=pd.to_numeric(X_test['cylinders'], errors='coerce').fillna(2.0)
)

In [386]:
# Store the cylinder counts as floats in both the training and hold-out splits.
for split in (X_train, X_test):
    split['cylinders'] = split['cylinders'].astype(float)

In [387]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated MSE of an XGBoost regressor.

    Samples the booster type and the L1/L2 regularization strengths for every
    trial; tree-specific settings are sampled only for the tree-based boosters.
    The model is fit on the module-level ``X_train`` / ``y_train``.

    Args:
        trial: The Optuna trial used to draw hyperparameters.

    Returns:
        The average mean squared error over the three validation folds.
    """
    # Search space shared by every booster type.
    booster = trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart'])
    param = {
        'objective': 'reg:squarederror',
        'booster': booster,
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
    }

    # Tree-specific settings are only drawn for the tree-based boosters.
    if booster in ('gbtree', 'dart'):
        param.update({
            'max_depth': trial.suggest_int('max_depth', 1, 9),
            'eta': trial.suggest_float('eta', 1e-8, 1.0, log=True),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        })

    # Cross-validation: fit on two folds, score on the remaining one.
    splitter = KFold(n_splits=3, shuffle=True, random_state=42)
    fold_errors = []
    for fit_idx, holdout_idx in splitter.split(X_train):
        regressor = xgb.XGBRegressor(**param)
        regressor.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])

        holdout_pred = regressor.predict(X_train.iloc[holdout_idx])
        fold_errors.append(mean_squared_error(y_train.iloc[holdout_idx], holdout_pred))

    return np.mean(fold_errors)

# Hyperparameter search with Optuna. TPE is Optuna's default sampler; it is
# created explicitly here only so that it can be seeded, which makes the
# sequence of sampled trials repeatable between runs.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Show the best hyperparameters found.
print('Best hyperparameters:', study.best_params)

# Refit on the full training split with the best hyperparameters, using the
# same loss that objective() optimized.
best_model = xgb.XGBRegressor(objective='reg:squarederror', **study.best_params)
best_model.fit(X_train, y_train)

# Evaluate on the hold-out split and report the scores.
y_pred = best_model.predict(X_test)
print('Hold-out MSE:', mean_squared_error(y_test, y_pred))
print('Hold-out MAPE (%):', mape(y_test, y_pred))

[32m[I 2023-08-09 14:54:28,019][0m A new study created in memory with name: no-name-85af60d8-b8dd-4569-8105-054cf33df0de[0m
[32m[I 2023-08-09 14:54:28,537][0m Trial 0 finished with value: 84775184.777371 and parameters: {'booster': 'gblinear', 'lambda': 8.854536403717517e-07, 'alpha': 1.6746339969077177e-06}. Best is trial 0 with value: 84775184.777371.[0m
[32m[I 2023-08-09 14:54:30,150][0m Trial 1 finished with value: 70023043.80439377 and parameters: {'booster': 'gbtree', 'lambda': 0.002138990613021913, 'alpha': 0.00904338492019228, 'max_depth': 5, 'eta': 0.06127609715564408, 'gamma': 0.01371057089170352, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 70023043.80439377.[0m
[32m[I 2023-08-09 14:54:37,587][0m Trial 2 finished with value: 302238498.2449493 and parameters: {'booster': 'dart', 'lambda': 2.063555028220612e-07, 'alpha': 0.6540153463060839, 'max_depth': 6, 'eta': 1.3038984227725435e-07, 'gamma': 0.0004563279575031867, 'grow_policy': 'lossguide'}. Best is

Best hyperparameters: {'booster': 'dart', 'lambda': 1.5082879485481008e-08, 'alpha': 0.0781958677792696, 'max_depth': 3, 'eta': 0.12422420503047613, 'gamma': 0.07322456328721001, 'grow_policy': 'lossguide'}


In [388]:
# Apply the same feature engineering to the test table.
test = preprocess(test)

# Impute missing cylinder counts with 2.0, as done for the training data.
# preprocess() marks missing values with the string 'nan'; to_numeric turns
# that marker into NaN so it can be filled, and yields a float column.
test['cylinders'] = pd.to_numeric(test['cylinders'], errors='coerce').fillna(2.0)

# The one-hot columns are derived separately for each frame, so the test
# table can lack (or add) paint_color columns relative to the training data.
# Align to the exact training feature set and order before predicting;
# columns absent from the test table are filled with 0.
test_features = test.reindex(columns=X_train.columns, fill_value=0)
test_pred = best_model.predict(test_features)

In [392]:
# Pair every test id with its predicted value.
submission = test[['id']].assign(prediction=test_pred)

# Save as a CSV file without the index and without a header row
# (so the column names above do not appear in the output file).
submission.to_csv('submission.csv', header=False, index=False)