In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from scipy import stats
from scipy.stats import norm

## Read the data

In [None]:
# Load the competition's training and test tables from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/estyle-community-competition-2025/train.csv",
        "../input/estyle-community-competition-2025/test.csv",
    )
)

In [None]:
# Summary statistics of the TradePrice column, shown as the cell output.
price_summary = train["TradePrice"].describe()
price_summary

In [None]:
# Histogram of TradePrice with a logarithmic x-axis (100 bins, no KDE overlay).
fig, ax = plt.subplots()
sns.histplot(train["TradePrice"], bins=100, kde=False, log_scale=(True, False), ax=ax)
ax.set_xlabel("TradePrice (log scale)")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

In [None]:
# Shape of the TradePrice distribution: skewness and kurtosis.
trade_price = train['TradePrice']
print("Skewness: %f" % trade_price.skew())
print("Kurtosis: %f" % trade_price.kurt())

In [None]:
# Scatter of Area against TradePrice to eyeball the raw relationship.
fig, ax = plt.subplots()
ax.scatter(train['Area'], train['TradePrice'])
ax.set_xlabel('Area', fontsize=13)
ax.set_ylabel('TradePrice', fontsize=13)
plt.show()


In [None]:
# Remove rows that pair a very large TotalFloorArea (> 4000) with a very low
# TradePrice (< 300000).
is_outlier = (train['TotalFloorArea'] > 4000) & (train['TradePrice'] < 300000)
train = train.drop(index=train.index[is_outlier])

# Re-plot after the removal to inspect the remaining points.
fig, ax = plt.subplots()
ax.scatter(x=train['TotalFloorArea'], y=train['TradePrice'])
ax.set_xlabel('TotalFloorArea', fontsize=13)
ax.set_ylabel('TradePrice', fontsize=13)
plt.show()

In [None]:
# Each diagnostic gets its own axes; sharing the implicit current axes would
# paint the box plot over the scatter and overwrite its labels.
fig, (ax_scatter, ax_box) = plt.subplots(1, 2, figsize=(14, 5))

# Check the relationship between Area and TradePrice after a log1p transform.
ax_scatter.scatter(np.log1p(train["Area"]), np.log1p(train["TradePrice"]))
ax_scatter.set_xlabel("log(Area)")
ax_scatter.set_ylabel("log(TradePrice)")
ax_scatter.set_title("Log-scaled Relationship")

# Compare the TradePrice distribution for each value of AreaIsGreaterFlag.
sns.boxplot(x="AreaIsGreaterFlag", y="TradePrice", data=train, ax=ax_box)
ax_box.set_title("TradePrice by AreaIsGreaterFlag")

fig.tight_layout()
plt.show()


In [None]:
# Box plot of log1p(TradePrice) for each FloorAreaRatio value.
var = 'FloorAreaRatio'
data = pd.concat([np.log1p(train['TradePrice']), train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=var, y="TradePrice", data=data, ax=ax)
ax.set_ylabel('log(TradePrice)')
# Rotate the tick labels so they stay readable. The trailing semicolon suppresses
# the cell's repr output, so the figure is rendered only once.
plt.xticks(rotation=90);

In [None]:
# Box plot of log1p(TradePrice) grouped by BuildingYear.
var = 'BuildingYear'
log_price = np.log1p(train['TradePrice'])
data = pd.concat([log_price, train[var]], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(data=data, x=var, y="TradePrice", ax=ax)
ax.set_ylabel('log(TradePrice)')
plt.xticks(rotation=90);

In [None]:
# Pairwise correlations between all numeric columns, drawn as a heatmap with
# the colour scale capped at 0.8.
numeric_train = train.select_dtypes(include='number')
corrmat = numeric_train.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax);

In [None]:
# Heatmap of the (up to) k numeric columns most correlated with TradePrice.
k = 50  # number of variables for heatmap
cols = corrmat.nlargest(k, 'TradePrice')['TradePrice'].index
# Reuse the pandas correlation matrix: it skips missing values pairwise, whereas
# np.corrcoef on the raw values yields NaN for every column that contains a NaN.
cm = corrmat.loc[cols, cols].values
sns.set(font_scale=1.25)
# Scale the canvas with the number of variables so the cell annotations stay legible.
side = max(8, 0.6 * len(cols))
plt.figure(figsize=(side, side))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

In [None]:
sns.set()
# Pair plot on a reproducible sample of at most 1000 rows: the fixed seed keeps
# the figure stable across runs, and the cap avoids a ValueError when the frame
# has fewer than 1000 rows.
numeric_train = train.select_dtypes(include=['number'])
pair_sample = numeric_train.sample(n=min(1000, len(numeric_train)), random_state=2024)
sns.pairplot(pair_sample)
plt.show()

In [None]:
# Per-column missing-value report: absolute count and fraction of rows,
# each sorted from largest to smallest; the first 40 rows are displayed.
null_counts = train.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / len(train)).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(40)

In [None]:
# Frequency of each distinct value in the Remarks column.
remarks_counts = train["Remarks"].value_counts()
print(remarks_counts)

In [None]:
# Standardise TradePrice with StandardScaler and print the ten smallest and ten
# largest scaled values to gauge how extreme the tails are.
price_column = train['TradePrice'].values.reshape(-1, 1)
saleprice_scaled = StandardScaler().fit_transform(price_column)
sort_order = saleprice_scaled[:, 0].argsort()
sorted_scaled = saleprice_scaled[sort_order]
low_range = sorted_scaled[:10]
high_range = sorted_scaled[-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)

In [None]:
# Histogram and normal probability plot of TradePrice.
# sns.distplot is deprecated, so the same picture is built from histplot
# (density histogram + KDE) with the fitted normal curve overlaid by hand.
trade_price = train['TradePrice'].dropna()  # norm.fit cannot handle NaN
hist_fig, hist_ax = plt.subplots()
# bins=50 keeps the bin count bounded on heavily skewed data.
sns.histplot(trade_price, bins=50, stat="density", kde=True, ax=hist_ax)
mu, sigma = norm.fit(trade_price)
x_min, x_max = hist_ax.get_xlim()
x_grid = np.linspace(x_min, x_max, 200)
hist_ax.plot(x_grid, norm.pdf(x_grid, mu, sigma), color="#282828", label="normal fit")
hist_ax.set_xlim(x_min, x_max)  # keep the histogram's original horizontal extent
hist_ax.legend()
# Probability plot against the normal distribution, on its own figure.
fig = plt.figure()
res = stats.probplot(train['TradePrice'], plot=plt)

In [None]:
# Separate the features from the TradePrice target; the test frame is copied.
y_train = train["TradePrice"]
X_train = train.drop(columns="TradePrice")
X_test = test.copy()

# Stack train and test features and keep only the numeric columns.
stacked = pd.concat([X_train, X_test])
X_concat = stacked.select_dtypes(include="number")

## No feature engineering

In [None]:
# Split the stacked numeric frame back into its train and test parts by position.
# X_concat was built as [train rows, test rows] and y_train has one entry per
# train row, so the first len(y_train) rows are the training features. This does
# not depend on train and test Ids being disjoint, and the cell can be re-run
# because it only reads X_concat and y_train.
n_train = len(y_train)
train_part = X_concat.iloc[:n_train]
test_part = X_concat.iloc[n_train:]

# Keep the Ids for the submission file, then drop them from the features.
X_train_id = train_part["Id"]
X_test_id = test_part["Id"]

# Missing values are replaced with the sentinel -9999.
X_train = train_part.drop(columns="Id").fillna(-9999)
X_test = test_part.drop(columns="Id").fillna(-9999)

# Features and target must stay aligned row for row.
assert X_train.index.equals(y_train.index), "X_train and y_train are misaligned"

## Training

In [None]:
# Hold out part of the training rows for validation (fixed seed).
X_train_, X_val, y_train_, y_val = train_test_split(
    X_train, y_train, random_state=2024
)

# Fit the regressor on log1p(price); predictions are mapped back with expm1.
# NOTE(review): learning_rate=1.0 is far above common settings — confirm it is intentional.
model = XGBRegressor(random_state=2024, learning_rate=1.0)
model.fit(X_train_, np.log1p(y_train_))
log_pred_val = model.predict(X_val)
y_pred_val = np.expm1(log_pred_val)

# RMSLE is the square root of the mean squared logarithmic error.
msle = mean_squared_log_error(y_val, y_pred_val)
score = msle ** 0.5
print(f"RMSLE score: {score:.5f}")

## Make a submission file

In [None]:
# Predict on the test features, undo the log1p transform, and write the
# Id/TradePrice pairs to benchmark.csv without the index column.
log_pred_test = model.predict(X_test)
y_pred_test = np.expm1(log_pred_test)
submission = pd.DataFrame({"Id": X_test_id, "TradePrice": y_pred_test})
submission.to_csv("./benchmark.csv", index=False)