# PyCaret test

In [1]:
import pandas as pd

#____loading data____
# The house dataset is the active input. The commented-out line selects the
# "app" dataset instead.
# NOTE(review): the alternative path starts with './data' while the active one
# starts with '../data' -- confirm which directory is correct before switching.
data_path = '../data/clean_house.csv'
# data_path = './data/clean_app.csv'
df = pd.read_csv(data_path)

In [2]:
# ____Outliers handling_____
import numpy as np 

# Tukey's rule: a price is treated as an outlier when it lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
Q1 = df["price"].quantile(0.25)
Q3 = df["price"].quantile(0.75)
IQR = Q3 - Q1

max_value = Q3 + (1.5 * IQR)
min_value = Q1 - (1.5 * IQR)

# Keep only the rows whose price lies inside [min_value, max_value].
# `between` is inclusive on both ends and evaluates to False for NaN, so rows
# with a missing price are removed too.
# Rows are filtered directly instead of first overwriting outlier prices with
# NaN and then dropping them: assigning NaN into the column would upcast it if
# price is stored as integers, which pandas has deprecated (PDEP-6).
within_fences = df["price"].between(min_value, max_value)

# .copy() makes the result an independent frame, so later in-place edits and
# column assignments do not trigger SettingWithCopyWarning.
df = df.loc[within_fences].copy()

# Preprocessing

In [3]:
# _____dropping unnecessary columns____
# for house and app
# Columns removed from the frame before modelling.
columns_to_drop = [
    'property_id',
    'latitude',
    'longitude',
    'property_type',
    'type_of_sale',
    'fully_equipped_kitchen',
    'locality_name',
    'main_city',
]

# Raises KeyError if any listed column is absent (for example when this cell
# is run a second time on the already-trimmed frame).
df = df.drop(columns=columns_to_drop)

In [4]:
# for app only

# app_columns_to_drop = ['surface_of_good']

# df.drop(app_columns_to_drop, axis=1, inplace=True)

In [5]:
# _____imputing missing values for swimmingpool____
from sklearn.impute import SimpleImputer
import numpy as np

# NaN entries in swimming_pool are replaced with the constant 0.
# The column is selected with a list so the imputer receives (and returns)
# a 2-D, single-column table.
pool_column = ['swimming_pool']
constant_imputer = SimpleImputer(
    strategy='constant',
    fill_value=0,
    missing_values=np.nan,
)
df[pool_column] = constant_imputer.fit_transform(df[pool_column])

In [6]:
# _____one-hot encoding for kitchen_type, state_of_building, property_subtype, province____
# One get_dummies call handles all four columns. Each source column is replaced
# by indicator columns named "<column>_<value>", emitted in the order the
# columns are listed here.
categorical_columns = [
    "kitchen_type",
    "state_of_building",
    "property_subtype",
    "province",
]
df = pd.get_dummies(df, columns=categorical_columns, prefix=categorical_columns)

# Creating a training and test set

In [7]:
# _____defining target and features____

target_column = 'price'

# Target vector: the price column.
y = df[target_column]

# Feature matrix: every column except the target.
X = df.drop(columns=[target_column])

In [8]:
from pycaret.regression import RegressionExperiment

# Seed passed to PyCaret as session_id so repeated runs use the same randomness.
SESSION_ID = 120
# Value passed as numeric_imputation; per the PyCaret docs a number here is
# used as the constant that replaces missing numeric values.
NUMERIC_FILL_VALUE = -1

s = RegressionExperiment()
# Left as a bare expression so the cell still displays the setup summary and
# the experiment repr.
s.setup(
    X,
    target=y,
    session_id=SESSION_ID,
    numeric_imputation=NUMERIC_FILL_VALUE,
)

Unnamed: 0,Description,Value
0,Session id,120
1,Target,price
2,Target type,Regression
3,Original data shape,"(10763, 51)"
4,Transformed data shape,"(10763, 51)"
5,Transformed train set shape,"(7534, 51)"
6,Transformed test set shape,"(3229, 51)"
7,Numeric features,12
8,Rows with missing values,99.9%
9,Preprocess,True


<pycaret.regression.oop.RegressionExperiment at 0x7f0b57b238e0>

In [9]:
# Run PyCaret's model comparison on the experiment configured above and keep
# the model it returns in `best` for the evaluation cells below.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so in
# that run `best` was never assigned -- re-run to completion before using it.
best = s.compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,70656.9658,9046507415.4225,95069.8531,0.6822,0.2813,0.2318,0.299
gbr,Gradient Boosting Regressor,72892.8664,9395499721.3778,96902.0942,0.6697,0.2854,0.2382,0.132
et,Extra Trees Regressor,71264.9496,9515556684.7252,97526.4828,0.6656,0.2856,0.2315,0.328
knn,K Neighbors Regressor,94680.0484,15361625088.0,123892.8641,0.4614,0.3579,0.3101,0.017
ada,AdaBoost Regressor,110786.8846,17789137830.8933,133280.752,0.3779,0.4305,0.4376,0.099
dt,Decision Tree Regressor,95666.935,17785167829.7432,133294.875,0.3746,0.39,0.3007,0.015
huber,Huber Regressor,109888.4368,20974639690.5894,144780.4452,0.2645,0.4106,0.3552,0.091
en,Elastic Net,99487.0511,41858259627.7914,166665.9572,-0.537,0.3766,0.3416,0.013
br,Bayesian Ridge,83998.3635,42937816617.5941,154381.3887,-0.5899,0.3157,0.2688,0.012
ridge,Ridge Regression,84034.2814,43290283973.5096,154710.2549,-0.6032,0.3159,0.2688,0.011


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Open PyCaret's evaluation view for the model stored in `best`.
# Requires the compare_models cell to have finished, otherwise `best` is undefined.
s.evaluate_model(best)

In [None]:
# Residuals plot for `best`, to check how prediction errors are distributed.
s.plot_model(best, plot = 'residuals')

In [None]:
# PyCaret's 'feature' plot for `best` (feature importance, per the PyCaret docs).
s.plot_model(best, plot = 'feature')

---
---