In [101]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

import pandas as pd
import numpy as np

In [102]:
data = pd.read_csv('data_for_model.csv')
data

Unnamed: 0,link,post_time,is_apartment,floor,floors_count,rooms,total_meters,price_per_m2,price,year_of_construction,...,window_view_Во двор,window_view_На улицу,window_view_На улицу и двор,district_Дзержинский,district_Индустриальный,district_Кировский,district_Ленинский,district_Мотовилихинский,district_Орджоникидзевский,district_Свердловский
0,https://perm.cian.ru/sale/flat/293792898/,0,False,2,5,1.0,32.0,109375,3500000,1973.0,...,True,False,False,False,True,False,False,False,False,False
1,https://perm.cian.ru/sale/flat/292418262/,0,False,2,27,1.0,27.0,92592,2500000,2016.0,...,True,False,False,False,False,False,False,False,True,False
2,https://perm.cian.ru/sale/flat/287711385/,0,False,3,9,1.0,25.7,141634,3640000,2023.0,...,False,True,False,False,False,False,False,True,False,False
3,https://perm.cian.ru/sale/flat/292754714/,0,False,1,16,1.0,37.7,98143,3700000,2011.0,...,True,False,False,True,False,False,False,False,False,False
4,https://perm.cian.ru/sale/flat/289457307/,0,False,8,9,1.0,28.0,75714,2120000,1991.0,...,False,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3143,https://perm.cian.ru/sale/flat/294005202/,1,False,2,5,4.0,61.4,83876,5150000,1973.0,...,False,False,True,False,False,False,False,False,False,True
3144,https://perm.cian.ru/sale/flat/292951362/,21,False,1,5,4.0,62.0,83870,5200000,1968.0,...,True,False,False,False,False,False,False,False,False,True
3145,https://perm.cian.ru/sale/flat/293782136/,8,False,8,9,1.0,43.4,142857,6200000,2014.0,...,False,True,False,True,False,False,False,False,False,False
3146,https://perm.cian.ru/sale/flat/292988411/,24,False,7,16,4.0,81.2,81280,6600000,1992.0,...,False,False,True,False,False,False,False,True,False,False


In [103]:
X = data.drop(columns=['link', 'price'])
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [105]:
# Initialize CatBoost Regressor
cat_model = CatBoostRegressor(
    loss_function='MAE', 
    iterations=2000, 
    learning_rate=0.05, 
    depth=3, 
    verbose=200
)

# Define the k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store results
mae_scores = []

# Loop through each fold
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train the model on the current fold
    cat_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=100)
    
    # Predict on the validation data
    y_pred = cat_model.predict(X_val_fold)
    
    # Calculate MAE for the current fold
    mae = mean_absolute_error(y_val_fold, y_pred)
    mae_scores.append(mae)

# Calculate the average MAE across all folds
average_mae = np.mean(mae_scores)

print(f"Average MAE across all folds: {average_mae:.2f}")

0:	learn: 2284249.1993347	test: 2200976.7409118	best: 2200976.7409118 (0)	total: 11.5ms	remaining: 22.9s
200:	learn: 199512.5595117	test: 212690.5217280	best: 212690.5217280 (200)	total: 335ms	remaining: 2.99s
400:	learn: 151294.7433845	test: 183193.2471688	best: 183193.2471688 (400)	total: 650ms	remaining: 2.59s
600:	learn: 122379.1482499	test: 169093.3811374	best: 169093.3811374 (600)	total: 888ms	remaining: 2.07s
800:	learn: 106904.5515479	test: 160468.1327769	best: 160460.4343683 (799)	total: 1.13s	remaining: 1.68s
1000:	learn: 98934.7452210	test: 155238.8554572	best: 155238.8554572 (1000)	total: 1.33s	remaining: 1.33s
1200:	learn: 91576.6589176	test: 152032.1189922	best: 152032.1189922 (1200)	total: 1.54s	remaining: 1.02s
1400:	learn: 85877.9751893	test: 149796.7665032	best: 149785.8273229 (1398)	total: 1.76s	remaining: 753ms
1600:	learn: 82170.0512585	test: 148718.0656358	best: 148718.0656358 (1600)	total: 1.98s	remaining: 494ms
1800:	learn: 78416.5249798	test: 147333.1916521	bes

In [117]:
threshold_percentage = 10  # e.g., 10%

# Predict prices for the entire dataset
predicted_prices = cat_model.predict(X)

# Calculate the difference between predicted and actual prices in terms of percentage
price_difference_percentage = (predicted_prices - y) / y * 100

# Find indices where the real price is below the predicted price by more than the threshold
underpriced_mask = price_difference_percentage > threshold_percentage

# Construct a DataFrame with the results
results_df = pd.DataFrame({
    'Link': data['link'][underpriced_mask],
    'Actual Price': y[underpriced_mask],
    'Predicted Price': predicted_prices[underpriced_mask],
    'Difference (%)': price_difference_percentage[underpriced_mask]
})

# Display or save the results
results_df

Unnamed: 0,Link,Actual Price,Predicted Price,Difference (%)
322,https://perm.cian.ru/sale/flat/286135070/,7459300,8576059.0,14.971363
325,https://perm.cian.ru/sale/flat/287479058/,11750000,13172390.0,12.105424
338,https://perm.cian.ru/sale/flat/288974730/,2100000,2314904.0,10.233518
647,https://perm.cian.ru/sale/flat/285871513/,7759800,8676518.0,11.813682
1772,https://perm.cian.ru/sale/flat/270457443/,3100000,4499586.0,45.147921
2161,https://perm.cian.ru/sale/flat/284978757/,700000,2414789.0,244.969875
2163,https://perm.cian.ru/sale/flat/292045196/,2490000,2932421.0,17.767928
3019,https://perm.cian.ru/sale/flat/290782592/,1300000,1687640.0,29.818429
3034,https://perm.cian.ru/sale/flat/293311437/,1690000,1922159.0,13.737241
3123,https://perm.cian.ru/sale/flat/293171734/,15000000,16858160.0,12.387733
