In [1]:
import pyarrow
import pandas as pd
from pycaret.regression import *

In [2]:
# Load
df = pd.read_parquet('../data/after_2008_crisis.parquet', engine='pyarrow')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6076289 entries, 0 to 6076288
Data columns (total 9 columns):
 #   Column            Dtype         
---  ------            -----         
 0   price             int64         
 1   date_of_transfer  datetime64[ns]
 2   property_type     object        
 3   is_new            bool          
 4   duration          object        
 5   city              object        
 6   district          object        
 7   county            object        
 8   ppdcategory_type  object        
dtypes: bool(1), datetime64[ns](1), int64(1), object(6)
memory usage: 376.7+ MB


In [3]:
# Initialize setup
exp_clf = setup(data=df, target='price', verbose=False, normalize=True, n_jobs = 1)

In [4]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [None]:
# Compare models
# options:
# budget_time = 300 //in seconds
try:
    best_model = compare_models(fold=5, include = ['lr', 'lasso', 'ridge', 'lightgbm', 'ada', 'lar', 'par', 'knn'])
except ValueError as e:
    print(e)
    best_model = None

print(best_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,45390.1046,3989541161.7193,63162.7605,0.5857,0.3642,0.4159,45.39
lr,Linear Regression,49928.3218,4585506933.336,67716.3029,0.5238,0.3913,0.4605,22.012
lasso,Lasso Regression,49928.3772,4585505667.0658,67716.2936,0.5238,0.3913,0.4604,177.746
ridge,Ridge Regression,49928.3558,4585505710.7676,67716.2939,0.5238,0.3913,0.4605,21.062
lar,Least Angle Regression,49928.3554,4585505713.0378,67716.2939,0.5238,0.3913,0.4605,20.858
par,Passive Aggressive Regressor,49218.7465,4722507672.0924,68720.4572,0.5096,0.3882,0.4145,33.914
ada,AdaBoost Regressor,56590.8108,5388218160.7603,73403.4916,0.4405,0.4458,0.6025,126.844


Processing:   0%|          | 0/37 [00:00<?, ?it/s]

In [None]:
# Use Linear Regression if no valid model is found
if best_model is not None:
    tuned_model = tune_model(best_model)

    # Predict and evaluate
    predictions = predict_model(tuned_model)
    evaluate_model(tuned_model)

    # Save the model
    save_model(tuned_model, 'model')
else:
    print("No valid model found, using Linear Regression instead.")
    # Initialize regression setup
    exp_reg = setup(data=df, target='target', verbose=True, normalize=True)
    
    # Create and tune Linear Regression model
    lr_model = create_model('lr')
    tuned_lr_model = tune_model(lr_model)

    # Predict and evaluate
    predictions = predict_model(tuned_lr_model)
    evaluate_model(tuned_lr_model)

    # Save the Linear Regression model
    save_model(tuned_lr_model, 'linear_regression_model')


Fitting 10 folds for each of 2 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Linear Regression,0.0,0.0,0.0,1.0,0.0001,0.0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Transformation Pipeline and Model Successfully Saved


In [28]:
# Prepare new data for prediction
new_data = pd.DataFrame({
    'name': ['David', 'Eva'],
    'age': [28, 22],
})

# Make predictions
predictions = predict_model(tuned_model, data=new_data)
print(predictions)


    name  age  prediction_label
0  David   28          0.580868
1    Eva   22          0.478918
