In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import zipfile

In [2]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

house_prices_file_path = r'C:\Users\sasha\Downloads\house-prices-advanced-regression-techniques\train.csv'
house_prices_data = pd.read_csv(house_prices_file_path) 

y = house_prices_data.SalePrice
X = house_prices_data
X = X.drop(columns=X.columns[-1])

num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object', 'category']).columns

for col in cat_features:
    X[col] = LabelEncoder().fit_transform(X[col])

for col in num_features:
    X[col + "_was_missing"] = X[col].isna().astype(int)  # Создаем бинарный столбец
    X[col].fillna(X[col].median(), inplace=True)  # Заполняем пропуски

train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = 0)


In [7]:
# compare MAE with differing values of max_leaf_nodes
for max_leaf_nodes in [5, 50, 500, 5000, 50000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))
# forest_model = RandomForestRegressor(random_state=1)
# forest_model.fit(train_X, train_y)
# melb_preds = forest_model.predict(val_X)
# print(mean_absolute_error(val_y, melb_preds))

Max leaf nodes: 5  		 Mean Absolute Error:  28648
Max leaf nodes: 50  		 Mean Absolute Error:  18120
Max leaf nodes: 500  		 Mean Absolute Error:  17074
Max leaf nodes: 5000  		 Mean Absolute Error:  17069
Max leaf nodes: 50000  		 Mean Absolute Error:  17069


In [17]:
for max_leaf_nodes in range(100,500):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))


Max leaf nodes: 100  		 Mean Absolute Error:  17262
Max leaf nodes: 101  		 Mean Absolute Error:  17253
Max leaf nodes: 102  		 Mean Absolute Error:  17261
Max leaf nodes: 103  		 Mean Absolute Error:  17246
Max leaf nodes: 104  		 Mean Absolute Error:  17238
Max leaf nodes: 105  		 Mean Absolute Error:  17231
Max leaf nodes: 106  		 Mean Absolute Error:  17210
Max leaf nodes: 107  		 Mean Absolute Error:  17195
Max leaf nodes: 108  		 Mean Absolute Error:  17195
Max leaf nodes: 109  		 Mean Absolute Error:  17183
Max leaf nodes: 110  		 Mean Absolute Error:  17177
Max leaf nodes: 111  		 Mean Absolute Error:  17171
Max leaf nodes: 112  		 Mean Absolute Error:  17159
Max leaf nodes: 113  		 Mean Absolute Error:  17156
Max leaf nodes: 114  		 Mean Absolute Error:  17135
Max leaf nodes: 115  		 Mean Absolute Error:  17130
Max leaf nodes: 116  		 Mean Absolute Error:  17139
Max leaf nodes: 117  		 Mean Absolute Error:  17138
Max leaf nodes: 118  		 Mean Absolute Error:  17139
Max leaf nod

KeyboardInterrupt: 

In [9]:
def get_mae_depth(max_depth, train_X, val_X, train_y, val_y):
    model = RandomForestRegressor(max_depth=max_depth, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

for max_depth in [1, 2, 3, 10, 20]:
    my_mae = get_mae_depth(max_depth, train_X, val_X, train_y, val_y)
    print("Max depth: %d  \t\t Mean Absolute Error:  %d" %(max_depth, my_mae))
# forest_model = RandomForestRegressor(random_state=1)
# forest_model.fit(train_X, train_y)
# melb_preds = forest_model.predict(val_X)
# print(mean_absolute_error(val_y, melb_preds))

Max depth: 1  		 Mean Absolute Error:  38003
Max depth: 2  		 Mean Absolute Error:  30097
Max depth: 3  		 Mean Absolute Error:  25177
Max depth: 10  		 Mean Absolute Error:  16930
Max depth: 20  		 Mean Absolute Error:  17003


In [13]:
mae_depth = []
for max_depth in range(7,40):
    my_mae = get_mae_depth(max_depth, train_X, val_X, train_y, val_y)
    mae_depth.append([max_depth, my_mae])
    print("Max depth: %d  \t\t Mean Absolute Error:  %d" %(max_depth, my_mae))
    print(mae_depth)
# forest_model = RandomForestRegressor(random_state=1)
# forest_model.fit(train_X, train_y)
# melb_preds = forest_model.predict(val_X)
# print(mean_absolute_error(val_y, melb_preds))

Max depth: 7  		 Mean Absolute Error:  17737
[[7, 17737.70149843917]]
Max depth: 8  		 Mean Absolute Error:  17380
[[7, 17737.70149843917], [8, 17380.70975571814]]
Max depth: 9  		 Mean Absolute Error:  17228
[[7, 17737.70149843917], [8, 17380.70975571814], [9, 17228.890926939337]]
Max depth: 10  		 Mean Absolute Error:  16930
[[7, 17737.70149843917], [8, 17380.70975571814], [9, 17228.890926939337], [10, 16930.663404447092]]
Max depth: 11  		 Mean Absolute Error:  17031
[[7, 17737.70149843917], [8, 17380.70975571814], [9, 17228.890926939337], [10, 16930.663404447092], [11, 17031.828645967566]]
Max depth: 12  		 Mean Absolute Error:  17020
[[7, 17737.70149843917], [8, 17380.70975571814], [9, 17228.890926939337], [10, 16930.663404447092], [11, 17031.828645967566], [12, 17020.324447416842]]
Max depth: 13  		 Mean Absolute Error:  16877
[[7, 17737.70149843917], [8, 17380.70975571814], [9, 17228.890926939337], [10, 16930.663404447092], [11, 17031.828645967566], [12, 17020.324447416842], [13

KeyboardInterrupt: 

In [21]:
max_leaf_nodes = 150
forest_model = RandomForestRegressor(random_state=0, max_leaf_nodes = max_leaf_nodes)
forest_model.fit(train_X, train_y)

house_prices_test_file_path = r'C:\Users\sasha\Downloads\house-prices-advanced-regression-techniques\test.csv'
house_prices_test_data = pd.read_csv(house_prices_test_file_path)

# y_test = house_prices_test_data.SalePrice
X_test = house_prices_test_data
# X_test = X.drop(columns=X.columns[-1])

num_features = X_test.select_dtypes(include=['int64', 'float64']).columns
cat_features = X_test.select_dtypes(include=['object', 'category']).columns

for col in cat_features:
    X_test[col] = LabelEncoder().fit_transform(X_test[col])

for col in num_features:
    X_test[col + "_was_missing"] = X_test[col].isna().astype(int)  # Создаем бинарный столбец
    X_test[col].fillna(X_test[col].median(), inplace=True)  # Заполняем пропуски
    
predict = forest_model.predict(X_test)

In [22]:
type(predict)

numpy.ndarray

In [30]:
df = pd.DataFrame({
    "Id": np.arange(1461, 1460 + len(predict) + 1),
    "SalePrice": predict
})

# Сохраняем в CSV
csv_filename = "submission.csv"
df.to_csv(csv_filename, index=False)
zip_filename = "submission.zip"
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(csv_filename)