In [1]:
### 1: Import libraries and load the training data
import pandas as pd
import numpy as np
# Raw training table; later cells filter and impute it under the same name.
df_train = pd.read_csv(filepath_or_buffer=r'train.csv')

In [3]:
### 2: Preview the first five rows of the training data
df_train.iloc[:5]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
### 3: Count missing values per column and show the 20 largest counts
null_per_column = df_train.isna().sum()
null_per_column.sort_values(ascending=False).iloc[:20]

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
Id                 0
dtype: int64

In [7]:
### 4: How many rows have how many missing values
# `missing` holds, for every row, the number of empty cells in that row.
missing = df_train.isna().sum(axis='columns')
rows_per_missing_count = missing.value_counts()
rows_per_missing_count.sort_index()

1       1
2       4
3      52
4     411
5     511
6     332
7      38
8       2
9       5
10     36
11     55
12      6
15      3
16      4
Name: count, dtype: int64

In [9]:
### 5: Drop the rows and columns with the most missing values
sparse_columns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu']
# Keep only rows with fewer than 11 missing cells (counts come from `missing`).
has_few_gaps = missing < 11
df_train = df_train[has_few_gaps].drop(labels=sparse_columns, axis='columns')

In [11]:
### 6: Re-check missing values after the drops
remaining_nulls = df_train.isna().sum()
remaining_nulls.sort_values(ascending=False).iloc[:20]

LotFrontage     249
GarageYrBlt      34
GarageCond       34
GarageType       34
GarageFinish     34
GarageQual       34
BsmtFinType2     10
BsmtExposure     10
BsmtFinType1      9
BsmtCond          9
BsmtQual          9
MasVnrArea        8
Electrical        1
WoodDeckSF        0
PavedDrive        0
LowQualFinSF      0
GrLivArea         0
BsmtFullBath      0
BsmtHalfBath      0
FullBath          0
dtype: int64

In [13]:
### 7: Load the test file and check its missing values
df_test = pd.read_csv(filepath_or_buffer=r'test.csv')
test_null_counts = df_test.isna().sum()
test_null_counts.sort_values(ascending=False).iloc[:35]

PoolQC          1456
MiscFeature     1408
Alley           1352
Fence           1169
MasVnrType       894
FireplaceQu      730
LotFrontage      227
GarageYrBlt       78
GarageQual        78
GarageFinish      78
GarageCond        78
GarageType        76
BsmtCond          45
BsmtQual          44
BsmtExposure      44
BsmtFinType1      42
BsmtFinType2      42
MasVnrArea        15
MSZoning           4
BsmtHalfBath       2
Utilities          2
Functional         2
BsmtFullBath       2
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
KitchenQual        1
TotalBsmtSF        1
Exterior2nd        1
GarageCars         1
Exterior1st        1
GarageArea         1
SaleType           1
MiscVal            0
BedroomAbvGr       0
dtype: int64

In [15]:
### 8: Drop the same columns that were removed from the training data
dropped_in_train = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu']
df_test = df_test.drop(labels=dropped_in_train, axis='columns')

In [17]:
### 9: Uzupełnienie braków w pliku train za pomocą catboost
from catboost import CatBoostRegressor, CatBoostClassifier

def predict_missing_catboost(df, target_col, verbose=False, exclude_cols=None):
    """Fill the missing values of ``target_col`` with CatBoost predictions.

    Rows where ``target_col`` is present are used to train a model on the
    remaining columns; the model then predicts the rows where it is missing.
    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to impute. Mutated in place.
    target_col : str
        Column whose missing values should be filled.
    verbose : bool, default False
        Forwarded to CatBoost as its ``verbose`` setting.
    exclude_cols : list of str, optional
        Columns that must not be used as features (for example the final
        prediction target, to avoid leaking it into imputed predictors).
        Names that are not present in ``df`` are ignored. By default every
        column except ``target_col`` is a feature.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. It is returned unchanged when the column has
        no known or no missing values, or when fitting/predicting raises
        (the error is printed, not re-raised).
    """
    missing_mask = df[target_col].isnull()
    known = df[~missing_mask].copy()
    unknown = df[missing_mask].copy()

    # Either nothing to learn from or nothing to fill.
    if known.empty or unknown.empty:
        print(f'Brak danych do nauki: {target_col}')
        return df

    # A constant column needs no model: reuse its only value.
    unique_values = known[target_col].dropna().unique()
    if len(unique_values) == 1:
        single_value = unique_values[0]
        df[target_col] = df[target_col].fillna(single_value)
        print(f'{target_col}: tylko jedna wartość: {single_value}')
        return df

    non_features = [target_col] + [c for c in (exclude_cols or []) if c != target_col]
    y_train = known[target_col]
    X_train = known.drop(columns=non_features, errors='ignore')
    X_pred = unknown.drop(columns=non_features, errors='ignore')

    is_numeric = pd.api.types.is_numeric_dtype

    # Categorical features: every non-numeric column (object, string, category).
    # Going through ``object`` first lets 'missing' be inserted even when the
    # column is already a categorical without that level.
    cat_cols = [c for c in X_train.columns if not is_numeric(X_train[c])]
    for frame in (X_train, X_pred):
        for col in cat_cols:
            frame[col] = frame[col].astype(object).fillna('missing').astype('category')

    # Numeric features of any width: mark gaps with a sentinel value.
    num_cols = X_train.select_dtypes(include='number').columns.tolist()
    X_train[num_cols] = X_train[num_cols].fillna(-999)
    X_pred[num_cols] = X_pred[num_cols].fillna(-999)

    # Model
    fit_params = {
        'iterations': 100,
        'depth': 6,
        'learning_rate': 0.1,
        'verbose': verbose,
        'random_state': 42
    }

    if cat_cols:
        fit_params['cat_features'] = cat_cols

    # Numeric target -> regression, anything else -> classification.
    if is_numeric(df[target_col]):
        model = CatBoostRegressor(**fit_params)
    else:
        model = CatBoostClassifier(**fit_params)

    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_pred)
        # Classifier output can be column-shaped (n, 1); flatten it so the
        # assignment below always receives one value per missing row.
        df.loc[missing_mask, target_col] = np.ravel(y_pred)
        print(f'Uzupełniono: {target_col}')
    except Exception as e:
        # Best effort: report the failure and leave the column as it was.
        print(f'Błąd: {target_col}: {e}')

    return df
# Impute on a frame WITHOUT the target column. The test set has no SalePrice,
# so using it as a feature here would leak the target into the imputed
# training predictors and make train/test imputation inconsistent.
tabela = df_train.drop(columns=['SalePrice'])
columns_with_missing = tabela.columns[tabela.isnull().any()].tolist()
print(f"Wykryto {len(columns_with_missing)} kolumn z brakami: {columns_with_missing}")
for col in columns_with_missing:
    tabela = predict_missing_catboost(tabela, col)
# `tabela` is a separate frame, so copy the filled columns back into df_train
# (both frames share the same index, so rows align one-to-one).
if columns_with_missing:
    df_train[columns_with_missing] = tabela[columns_with_missing]

Wykryto 13 kolumn z brakami: ['LotFrontage', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']
Uzupełniono: LotFrontage
Uzupełniono: MasVnrArea
Uzupełniono: BsmtQual
Uzupełniono: BsmtCond
Uzupełniono: BsmtExposure
Uzupełniono: BsmtFinType1
Uzupełniono: BsmtFinType2
Uzupełniono: Electrical
Uzupełniono: GarageType
Uzupełniono: GarageYrBlt
Uzupełniono: GarageFinish
Uzupełniono: GarageQual
Uzupełniono: GarageCond


In [21]:
### 10: Fill missing values in the test set with CatBoost
# predict_missing_catboost is already defined in step 9; the verbatim copy that
# used to live in this cell only shadowed it, so the existing function is reused.
# `tabela` is an alias of df_test (not a copy): the function writes the
# predictions into the frame it receives, so df_test itself gets filled.
tabela = df_test
columns_with_missing = tabela.columns[tabela.isnull().any()].tolist()
print(f"Wykryto {len(columns_with_missing)} kolumn z brakami: {columns_with_missing}")
for col in columns_with_missing:
    tabela = predict_missing_catboost(tabela, col)

Wykryto 27 kolumn z brakami: ['MSZoning', 'LotFrontage', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'SaleType']
Uzupełniono: MSZoning
Uzupełniono: LotFrontage
Utilities: tylko jedna wartość: AllPub
Uzupełniono: Exterior1st
Uzupełniono: Exterior2nd
Uzupełniono: MasVnrArea
Uzupełniono: BsmtQual
Uzupełniono: BsmtCond
Uzupełniono: BsmtExposure
Uzupełniono: BsmtFinType1
Uzupełniono: BsmtFinSF1
Uzupełniono: BsmtFinType2
Uzupełniono: BsmtFinSF2
Uzupełniono: BsmtUnfSF
Uzupełniono: TotalBsmtSF
Uzupełniono: BsmtFullBath
Uzupełniono: BsmtHalfBath
Uzupełniono: KitchenQual
Uzupełniono: Functional
Uzupełniono: GarageType
Uzupełniono: GarageYrBlt
Uzupełniono: GarageFinish
Uzupełniono: GarageCars
Uzupeł

In [29]:
### 11: Check how many columns of each dtype df_train has
train_dtypes = df_train.dtypes
train_dtypes.value_counts()

object     37
int64      35
float64     3
Name: count, dtype: int64

In [31]:
### 12: Check how many columns of each dtype df_test has
test_dtypes = df_test.dtypes
test_dtypes.value_counts()

object     37
int64      26
float64    11
Name: count, dtype: int64

In [33]:
### 13: Find the columns whose dtype differs between train and test
# Only columns present in both frames can be compared.
shared_columns = [name for name in df_train.columns if name in df_test.columns]
diff_types = [
    (name, df_train[name].dtype, df_test[name].dtype)
    for name in shared_columns
    if df_train[name].dtype != df_test[name].dtype
]

# Show the results
for name, train_dtype, test_dtype in diff_types:
    print(f"{name}: train={train_dtype}, test={test_dtype}")

BsmtFinSF1: train=int64, test=float64
BsmtFinSF2: train=int64, test=float64
BsmtUnfSF: train=int64, test=float64
TotalBsmtSF: train=int64, test=float64
BsmtFullBath: train=int64, test=float64
BsmtHalfBath: train=int64, test=float64
GarageCars: train=int64, test=float64
GarageArea: train=int64, test=float64


In [35]:
### 14: Align the dtypes of the test columns with the training data
# These columns are int64 in train but float64 in test.
cols_to_convert = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea'
]

for col in cols_to_convert:
    # The gaps in these columns were filled with regressor predictions, which
    # are generally fractional. astype('int64') alone truncates toward zero
    # (1.9 -> 1), so round to the nearest integer first.
    df_test[col] = df_test[col].round().astype('int64')

In [37]:
### 15: Stack df_train on top of df_test and preview the result
frames_to_stack = (df_train, df_test)
df_all = pd.concat(frames_to_stack, axis=0, ignore_index=True)
df_all.iloc[:5]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000.0


In [39]:
### 16: List the object-typed columns
object_frame = df_all.select_dtypes(include=['object'])
obj_cols = list(object_frame.columns)
print(obj_cols)

['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']


In [65]:
### 17: Cast object columns to category, then one-hot encode them
object_columns = df_all.select_dtypes(include='object').columns
# astype returns a new frame, so df_all itself stays untouched.
df_wsio = df_all.astype({name: 'category' for name in object_columns})

cat_cols = df_wsio.select_dtypes(include='category').columns
df_wsio = pd.get_dummies(df_wsio, columns=cat_cols, drop_first=False)
# Keep only the first occurrence of any repeated column name.
is_repeated = df_wsio.columns.duplicated()
df_wsio = df_wsio.loc[:, ~is_repeated]
df_wsio.dtypes.value_counts()

bool       229
int64       34
float64      4
Name: count, dtype: int64

In [67]:
### 18: Split back into training and test sets
# Rows are told apart by Id: below 1461 -> training, 1461 and above -> test.
is_train_row = df_wsio['Id'] < 1461
is_test_row = df_wsio['Id'] >= 1461

df_trening = df_wsio[is_train_row].drop(columns=['Id'])
df_testing = df_wsio[is_test_row].drop(columns=['Id', 'SalePrice'])

In [77]:
### 19: Train the model, predict and save the results to a file
from xgboost import XGBRegressor

# 1) Training data: the target is modelled on the log1p scale
y_full = np.log1p(df_trening['SalePrice'].astype(float))
X_full = df_trening.drop(columns=['SalePrice']).copy()

# Cast bool -> uint8
bool_cols = X_full.select_dtypes(include='bool').columns
if len(bool_cols) > 0:
    X_full = X_full.astype({name: 'uint8' for name in bool_cols})

# 2) Test data, with the same bool -> uint8 cast
X_test_final = df_testing.copy()
bool_cols_test = X_test_final.select_dtypes(include='bool').columns
if len(bool_cols_test) > 0:
    X_test_final = X_test_final.astype({name: 'uint8' for name in bool_cols_test})

# 3) Model (parameters from the best result)
xgb_params = dict(
    n_estimators=8000,
    learning_rate=0.03,
    max_depth=3,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=5,
    reg_alpha=0.1,
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=42,
    n_jobs=-1,
)
model = XGBRegressor(**xgb_params)

# 4) Train on the WHOLE training set
model.fit(X_full, y_full, verbose=False)

# 5) Predictions are on the log1p scale -> expm1 brings back original prices
y_pred_log = model.predict(X_test_final)
y_pred_prices = np.expm1(y_pred_log)

# 6) Submission
test_ids = df_wsio.loc[df_wsio['Id'] >= 1461, 'Id']
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': y_pred_prices})

submission.to_csv('submission.csv', index=False)
print("Plik submission.csv zapisany")