In [None]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#### &#10140; *EDA*<br>

In [None]:
# Read the raw training CSV into a DataFrame.
trainData = pd.read_csv(filepath_or_buffer='./data/train.csv')

# Quick sanity check on the load: (rows, columns).
print('Data shape: ', trainData.shape)

In [None]:
# Remove the `Id` column from the feature table.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second execution no longer raises KeyError for the missing column.
trainData = trainData.drop(columns=['Id'], errors='ignore')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line after the report.
trainData.info()

### -- `Log-Transformation` -- 

In [None]:
# Histogram (with KDE overlay) of the target before transformation.
plt.figure(figsize=(10, 6))
sns.histplot(data=trainData, x='SalePrice', bins=30, kde=True)
plt.title('Distribution of SalePrice')
plt.xlabel('SalePrice')

# Overwrite the target with log(1 + SalePrice).
# NOTE: the column is replaced in place, so executing this cell a second time
# applies the logarithm again to already-transformed values.
trainData['SalePrice'] = np.log1p(trainData['SalePrice'])

# Same histogram, now on the transformed target.
plt.figure(figsize=(10, 6))
sns.histplot(data=trainData, x='SalePrice', bins=30, kde=True)
plt.title('Log-Transformed Distribution of SalePrice')
plt.xlabel('Log(SalePrice)')

### -- `Missing Data` -- 

In [None]:
# Null count per column, restricted to columns that actually have nulls,
# ordered from most to fewest missing entries.
missing = (
    trainData.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

# Show the 20 columns with the most missing values.
print(missing.head(20))

In [None]:
# Object columns whose missing entries are replaced by the literal string "None".
none_fill_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'BsmtCond',
    'MasVnrType',
]
# Numeric columns whose missing entries are replaced by 0.
zero_fill_cols = [
    'GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
    'BsmtHalfBath', 'MasVnrArea',
]

for col in none_fill_cols:
    trainData[col] = trainData[col].fillna("None")

for col in zero_fill_cols:
    trainData[col] = trainData[col].fillna(0)

# LotFrontage: fill each gap with the median LotFrontage of the same Neighborhood.
lot_frontage_by_hood = trainData.groupby("Neighborhood")["LotFrontage"]
trainData["LotFrontage"] = lot_frontage_by_hood.transform(
    lambda group: group.fillna(group.median())
)

# Electrical: fill gaps with the most frequent value of the column.
most_common_electrical = trainData['Electrical'].mode()[0]
trainData['Electrical'] = trainData['Electrical'].fillna(most_common_electrical)

# Show the ten columns with the highest remaining null counts.
remaining_nulls = trainData.isnull().sum().sort_values(ascending=False)
print(remaining_nulls.head(10))

### -- `Zero Data` -- 

In [None]:
# Number of cells equal to 0 per column, restricted to columns that contain
# at least one zero, ordered from most to fewest.
zero_value = (
    trainData.isin([0])
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

# Show the 30 columns with the most zeros.
print(zero_value.head(30))

In [None]:
# Fill missing KitchenAbvGr entries with the column's most frequent value.
# NOTE(review): fillna only replaces NaN, never 0. This cell sits under the
# "Zero Data" heading, so the intent may have been to replace zero values — confirm.
kitchen_mode = trainData['KitchenAbvGr'].mode()[0]
trainData['KitchenAbvGr'] = trainData['KitchenAbvGr'].fillna(kitchen_mode)

### -- `Categorical objects` -- 

In [None]:
# Names of all object-dtype columns; these are the ones to be one-hot encoded.
cat_cols = trainData.select_dtypes(include=["object"]).columns
print(cat_cols)

# Report how many distinct values each of those columns holds.
for col in cat_cols:
    print(f"{col}: {trainData[col].nunique()} унікальних значень")

# Replace every object column by its indicator (dummy) columns.
trainData = pd.get_dummies(data=trainData, columns=cat_cols)

### -- `Outliers` -- 

In [None]:
# Scatter plot of lot size against the target, used to eyeball outliers.
sns.scatterplot(data=trainData, x='LotArea', y='SalePrice')
plt.title("LotArea vs SalePrice")

# Keep only rows that are below both size thresholds.
# NOTE(review): a LotArea cut-off of 7500 looks low for an outlier filter —
# compare trainData.shape before and after this step and confirm the value.
trainData = trainData[
    (trainData['GrLivArea'] < 4000) & (trainData['LotArea'] < 7500)
]

### -- `Feature Engineering` -- 

In [None]:
# Derived features, added with assign() so a new frame is returned instead of
# writing columns into trainData directly: at this point trainData is the result
# of boolean-mask filtering, and column assignment on such a slice can raise
# pandas' SettingWithCopyWarning. assign() also overwrites existing columns of
# the same name, so the cell stays safe to re-run.
trainData = trainData.assign(
    # Sum of first floor, second floor and basement square footage.
    TotalSF=trainData['1stFlrSF'] + trainData['2ndFlrSF'] + trainData['TotalBsmtSF'],
    # Difference between the sale year and the construction year.
    Age=trainData['YrSold'] - trainData['YearBuilt'],
)

#### &#10140; *Preparing Data*<br>

In [None]:
# Persist the preprocessed training frame; the row index is not written.
trainData.to_csv(path_or_buf="processed_train_data.csv", index=False)

In [None]:
# Feature matrix: every column except the target. Target vector: SalePrice.
X = trainData.drop(columns=['SalePrice'])
y = trainData['SalePrice']

# Hold out 20% of the rows for validation; random_state fixes the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=55,
)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_log_error
import numpy as np

def rmsle(y_true_log, y_pred_log):
    """Return the root mean squared log error for values given in log1p space.

    Both arguments hold ``log1p(value)`` numbers. The mean squared log error of
    the back-transformed values is ``mean((log1p(t) - log1p(p)) ** 2)``, and
    ``log1p(expm1(v)) == v``, so the metric is simply the RMSE of the inputs.
    Computing it directly avoids the expm1 -> log1p round trip and the input
    validation that a library MSLE applies to the back-transformed values
    (which may reject a log-space prediction below 0).

    Args:
        y_true_log: array-like of true targets in log1p space.
        y_pred_log: array-like of predictions in log1p space, same number of
            elements as ``y_true_log``. Values are paired by position.

    Returns:
        The RMSLE as a NumPy float.

    Raises:
        ValueError: if the inputs are empty or differ in number of elements.
    """
    # Positional pairing: any pandas index on the inputs is deliberately ignored.
    true_log = np.asarray(y_true_log, dtype=float).ravel()
    pred_log = np.asarray(y_pred_log, dtype=float).ravel()

    if true_log.size != pred_log.size:
        raise ValueError(
            f"Inconsistent number of samples: {true_log.size} vs {pred_log.size}"
        )
    if true_log.size == 0:
        raise ValueError("rmsle requires at least one sample")

    return np.sqrt(np.mean(np.square(true_log - pred_log)))

# Hyper-parameters of the boosted-tree regressor, gathered in one mapping.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=3,
    gamma=0,
    subsample=0.6,
    colsample_bytree=0.6,
    reg_alpha=0.01,
    reg_lambda=2,
    random_state=42,
)
final_model = XGBRegressor(**xgb_params)

# Fit on the training split only.
final_model.fit(X_train, y_train)

# Predict the held-out split and score it with rmsle().
y_pred = final_model.predict(X_valid)
score = rmsle(y_valid, y_pred)
print(f"RMSLE: {score:.6f}")

In [None]:
# Load the test set and remove the `Id` column, as was done for the training data.
testData = pd.read_csv('./data/test.csv')
testData = testData.drop(columns=['Id'])

# --- Missing values: same rules as applied to the training data ---
# Object columns: missing entries become the literal string "None".
for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
            'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
            'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'BsmtCond',
            'MasVnrType']:
    testData[col] = testData[col].fillna("None")
# Numeric columns: missing entries become 0.
for col in ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1',
            'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
            'BsmtHalfBath', 'MasVnrArea']:
    testData[col] = testData[col].fillna(0)
# LotFrontage: median of the same Neighborhood (computed on the test rows).
testData["LotFrontage"] = testData.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
# Electrical / KitchenAbvGr: most frequent value of the column.
testData['Electrical'] = testData['Electrical'].fillna(testData['Electrical'].mode()[0])
testData['KitchenAbvGr'] = testData['KitchenAbvGr'].fillna(testData['KitchenAbvGr'].mode()[0])

# --- Categorical encoding ---
# Remaining gaps in any object column become "None" before one-hot encoding.
cat_cols = testData.select_dtypes(include=["object"]).columns
for col in cat_cols:
    testData[col] = testData[col].fillna("None")
testData = pd.get_dummies(testData, columns=cat_cols)

# --- Engineered features ---
# The training frame carries TotalSF and Age. They must be computed here as well:
# otherwise the reindex below creates both columns filled with 0 and the model
# scores every test row with TotalSF == 0 and Age == 0.
testData = testData.assign(
    TotalSF=testData['1stFlrSF'] + testData['2ndFlrSF'] + testData['TotalBsmtSF'],
    Age=testData['YrSold'] - testData['YearBuilt'],
)

# Align to the exact training column set and order: dummy columns absent from the
# test data are added as 0, columns unknown to the model are dropped.
X_test = testData.reindex(columns=X_train.columns, fill_value=0)
print("Test data shape: ", X_test.shape)
print("Sucssessfully prepared test data.")

In [None]:
# NOTE(review): this repeats the fit on the same X_train / y_train split used in
# the evaluation cell. If the intent was to train on all labelled rows before
# submitting, fit on X, y instead — confirm.
final_model.fit(X_train, y_train)

test_predictions = final_model.predict(X_test)

# Single source of truth for the output location, so the file that is written and
# the path reported below cannot drift apart (the message used to name
# './data/submission.csv' while the file written was './data/submission1.csv').
submission_path = './data/submission1.csv'

submission = pd.DataFrame({
    # Ids are re-read from the CSV because `Id` was dropped from the feature frame.
    'Id': pd.read_csv('./data/test.csv', usecols=['Id'])['Id'],
    # expm1 undoes the log1p applied to SalePrice before training.
    'SalePrice': np.expm1(test_predictions)
})
submission.to_csv(submission_path, index=False)
print(f"Predictions saved to '{submission_path}'")
print("Script completed successfully.")
