In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import seaborn as sb
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, KFold,GridSearchCV
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.preprocessing import StandardScaler,RobustScaler,LabelEncoder,PowerTransformer
from sklearn.ensemble import GradientBoostingRegressor,StackingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA



# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Let pandas render up to 5000 columns and 5000 rows before truncating output.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 5000)


In [None]:
# All competition files live in one Drive folder. Keeping the folder in a single
# constant means relocating the data requires editing one line instead of three.
DATA_DIR = '/content/drive/MyDrive/kaggle/kaggle_estyle/input/estyle-community-competition-2025'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Print the column dtypes and non-null counts of the training data.
train.info()

In [None]:
# Collect the non-object columns for which coercing to numeric produces more
# NaNs than the column already contains.
# NOTE(review): a column that is already numeric should pass through
# pd.to_numeric unchanged, so this list is expected to come out empty. If the
# goal was to find text columns holding unparseable values, the dtype test
# probably needs to be `== 'object'` — confirm the intent.
bad_cols = []
for column in train.columns:
    if train[column].dtype != 'object':
        coerced_nan_count = pd.to_numeric(train[column], errors='coerce').isna().sum()
        if coerced_nan_count > train[column].isna().sum():
            bad_cols.append(column)


In [None]:
# Show the column names collected in bad_cols.
print(bad_cols)

In [None]:
# Pairwise correlations between the numeric columns of the training data.
correlation_train = train.select_dtypes(include='number').corr()

# Draw the matrix as an annotated heatmap (one decimal per cell) with an
# enlarged font on a 50x35 figure.
sb.set(font_scale=2)
plt.figure(figsize=(50, 35))
heatmap_options = dict(
    annot=True,
    annot_kws={"size": 25},
    fmt='.1f',
    cmap='PiYG',
    linewidths=.5,
)
ax = sb.heatmap(correlation_train, **heatmap_options)

In [None]:
# Rank every numeric column by its correlation with TradePrice (highest first)
# and keep the names whose correlation lies strictly between 0.1 and 0.8 or is
# at most -0.1. Values of 0.8 and above are excluded, which also leaves out
# TradePrice itself (its self-correlation is 1.0).
price_corr = correlation_train['TradePrice'].sort_values(ascending=False)
corr_dict = price_corr.to_dict()
important_columns = [
    name
    for name, corr in corr_dict.items()
    if (0.1 < corr < 0.8) or corr <= -0.1
]
important_columns

In [None]:
# Disabled exploratory plot: a box plot of TradePrice per BuildingYear with a
# swarm plot overlaid and rotated, bold x tick labels. Uncomment to draw it.
# plt.figure(figsize=(40,20))
# sb.set(font_scale=1.5)
# sb.boxplot(x='BuildingYear', y="TradePrice", data=train)
# sb.swarmplot(x='BuildingYear', y="TradePrice", data=train, color=".25")
# plt.xticks(weight='bold',rotation=90)

In [None]:
# Stack the training rows on top of the test rows (columns are not re-sorted)
# so both sets can be inspected together, then preview the result.
frames_to_stack = (train, test)
train_test = pd.concat(frames_to_stack, axis=0, sort=False)
train_test.head()

In [None]:
# Allow up to 5000 rows to be displayed so the summary is not truncated.
pd.set_option('display.max_rows', 5000)

# Per-column missing-value count and dtype of the combined frame, side by side.
train_test_null_info = train_test.isnull().sum().to_frame('Count of NaN')
train_test_dtype_info = train_test.dtypes.to_frame('DataTypes')
train_tes_info = pd.concat(
    [train_test_null_info, train_test_dtype_info],
    axis=1,
)
train_tes_info

# Create Model

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Fix the feature columns explicitly: every column except the Id and the target.
feature_cols = [c for c in train.columns if c not in ['Id', 'TradePrice']]

# Take an explicit copy. X is overwritten column by column when the string
# columns are encoded, and writing into a frame derived from `train` is a
# chained assignment that pandas reports with SettingWithCopyWarning (hidden in
# this notebook by the global warnings filter).
X = train[feature_cols].copy()
y = train['TradePrice']


In [None]:
# Names of the feature columns stored with object dtype.
obj_cols = X.select_dtypes(include=['object']).columns

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Replace every object-dtype feature column, in both X and test, with integer
# codes from a LabelEncoder.
for col in obj_cols:  # the object-dtype feature columns
    # Missing values become their own category. Casting to str after filling
    # keeps the encoder from failing on a column that mixes strings with other
    # Python objects; for columns that hold only strings it changes nothing.
    train_values = X[col].fillna("__MISSING__").astype(str)
    test_values = test[col].fillna("__MISSING__").astype(str)

    # Fit on train and test together so a label that occurs only in the test
    # data still receives a code.
    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values], axis=0))

    X[col] = le.transform(train_values)
    test[col] = le.transform(test_values)


## データの状態チェック

In [None]:
# エンコード後の訓練データとテストデータの確認
print("=== X (訓練データ) の状態 ===")
print(X.head())
print("\nX の型情報:")
print(X.dtypes)
print("\nX の欠損値:")
print(X.isnull().sum()[X.isnull().sum() > 0])

print("\n" + "="*50)
print("=== test (テストデータ) の状態 ===")
print(test.head())
print("\ntest の型情報:")
print(test.dtypes)
print("\ntest の欠損値:")
print(test.isnull().sum()[test.isnull().sum() > 0])

In [None]:
# Check the test frame restricted to the feature columns used for submission.
sub_test_preview = test[feature_cols]
print("=== sub_test (提出用テストデータ) の確認 ===")
print(f"feature_cols の数: {len(feature_cols)}")
print(f"sub_test の shape: {sub_test_preview.shape}")
print("\n最初の5行:")
print(sub_test_preview.head())

# Verify that the training and test frames agree on each column's dtype.
print("\n=== 型の一致確認 ===")
for col in feature_cols[:10]:  # only the first 10 columns are checked
    if col not in X.columns or col not in test.columns:
        continue
    x_dtype, test_dtype = X[col].dtype, test[col].dtype
    print(f"{col}: X={x_dtype}, test={test_dtype}, 一致={x_dtype == test_dtype}")

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=52
)

# Report the shape of each of the four parts.
for label, part in (
    ('X_train Shape :', X_train),
    ('X_test Shape :', X_test),
    ('y_train Shape :', y_train),
    ('y_test Shape :', y_test),
):
    print(label, part.shape)

In [None]:
# LightGBM regressor: fit on the training split, then score on the held-out split.
lgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    feature_fraction=0.201,   # colsample_bytree is deliberately not passed as well
    bagging_fraction=0.65,    # subsample is deliberately not passed as well
    bagging_freq=5,           # subsample_freq is deliberately not passed as well
    n_jobs=-1,
    # if needed:
    # force_row_wise=True
)
lgb_regressor = lgb.LGBMRegressor(**lgb_params)
lgb_regressor.fit(X_train, y_train)
y_head = lgb_regressor.predict(X_test)

# Print R^2, MAE and MSE of the held-out predictions.
print('-'*10+'LGBM'+'-'*10)
for label, metric in (
    ('R square Accuracy: ', r2_score),
    ('Mean Absolute Error Accuracy: ', mean_absolute_error),
    ('Mean Squared Error Accuracy: ', mean_squared_error),
):
    print(label, metric(y_test, y_head))

In [None]:
# Predict on the test feature columns and pair every prediction with its Id.
sub_test = test[feature_cols]
test_ids = test['Id'].copy()
test_pred = lgb_regressor.predict(sub_test)

submission = pd.DataFrame({'Id': test_ids}).assign(TradePrice=test_pred)

In [None]:
# Write the submission to Drive as CSV, without the index column.
submission_path = '/content/drive/MyDrive/kaggle/kaggle_estyle/output/Sample_submit.csv'
submission.to_csv(submission_path, index=False)