In [1]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import pandas as pd
import project_libs as libs
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import warnings


In [2]:

# Load the raw CSV through the project helper. Judging by the output captured
# under this cell, the helper also prints the frame's shape, column list and
# info summary.
data = libs.read_csv_to_dataframe(
    "data/data.csv",
)

# Keep `data` as the pristine original and do all later edits on a deep copy.
df = data.copy(deep=True)

shape (4600, 18)
----------------------------------------------------------------------------------------------------
List of columns
['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city', 'statezip', 'country']
----------------------------------------------------------------------------------------------------
Data info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   i

In [3]:
# Build the modelling frame from the untouched `data` frame instead of mutating
# `df` in place. The previous in-place drop raised KeyError when this cell was
# run a second time, because 'date' and 'country' had already been removed.
# `columns=` alone selects the axis, so the redundant `axis=1` is gone.
df = data.drop(columns=['date', 'country'])

# Confirm the remaining columns and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          4600 non-null   float64
 1   bedrooms       4600 non-null   float64
 2   bathrooms      4600 non-null   float64
 3   sqft_living    4600 non-null   int64  
 4   sqft_lot       4600 non-null   int64  
 5   floors         4600 non-null   float64
 6   waterfront     4600 non-null   int64  
 7   view           4600 non-null   int64  
 8   condition      4600 non-null   int64  
 9   sqft_above     4600 non-null   int64  
 10  sqft_basement  4600 non-null   int64  
 11  yr_built       4600 non-null   int64  
 12  yr_renovated   4600 non-null   int64  
 13  street         4600 non-null   object 
 14  city           4600 non-null   object 
 15  statezip       4600 non-null   object 
dtypes: float64(4), int64(9), object(3)
memory usage: 575.1+ KB


In [4]:
# Column the models are trained to predict.
target_column = 'price'

# The three object-dtype (string) columns left in `df`.
# NOTE(review): presumably these are the columns the pipeline helper encodes
# (it is called with encoding_method='label') — confirm against project_libs.
features = [
    'street',
    'city',
    'statezip',
]

In [5]:
# Define the list of models to run in the pipeline.
# Each entry is (display name, estimator class, parameter dict).
# NOTE(review): the parameter dict is presumably forwarded to the estimator
# constructor by libs.run_ml_pipeline — confirm against project_libs.
#
# Every estimator whose fitting involves randomness gets a fixed seed so the
# reported scores are reproducible across kernel restarts. Previously none of
# them were seeded.
# NOTE(review): the train/test split inside libs.run_ml_pipeline may need its
# own seed as well — not visible from this notebook.
RANDOM_STATE = 42

models = [
    ('Random Forest', RandomForestRegressor, {'n_estimators': 100, 'random_state': RANDOM_STATE}),
    ('Gradient Boosting', GradientBoostingRegressor, {'n_estimators': 100, 'random_state': RANDOM_STATE}),
    ('Ridge Regression', Ridge, {'alpha': 1.0}),
    # NOTE(review): the captured run output shows coordinate-descent warnings,
    # which likely come from Lasso/ElasticNet — consider scaling the features
    # or raising max_iter.
    ('Lasso Regression', Lasso, {'alpha': 1.0}),
    ('ElasticNet Regression', ElasticNet, {'alpha': 1.0, 'l1_ratio': 0.5}),
    ('Decision Tree', DecisionTreeRegressor, {'max_depth': 5, 'random_state': RANDOM_STATE}),
    ('Extra Trees', ExtraTreesRegressor, {'n_estimators': 100, 'random_state': RANDOM_STATE}),
    ('KNN', KNeighborsRegressor, {'n_neighbors': 5}),
    # The optimizer restarts draw random starting points, so this is seeded too.
    ('Gaussian Process', GaussianProcessRegressor,
     {'n_restarts_optimizer': 10, 'alpha': 0.1, 'random_state': RANDOM_STATE}),
]


In [6]:
# Run every configured model through the project pipeline helper with
# cross-validation switched off. The helper hands back the split data plus a
# per-model score table (`result`).
# NOTE(review): `features` is presumably the set of columns that get
# label-encoded — confirm against project_libs.
(X_train, X_test, y_train, y_test, result) = libs.run_ml_pipeline(
    df,
    target_column,
    features,
    models,
    encoding_method='label',
    use_cross_validation=False,
)

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
dtype: int64
No missing values found. No imputation needed.
____________________________________________________________________________________________________
____________________________________________________________________________________________________
Data Frame shape (4600, 16)
((3680, 15), (920, 15), (3680,), (920,))
____________________________________________________________________________________________________
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Model: ElasticNet Regression
Training Score: 0.53
Test Score: 0.03
----------------------------------------
Model: Decision Tree
Training Score: 0.64
Test Score: 0.02
----------------------------------------
Model: Extra Trees
Training Score: 1.00
Test Score: 0.04
----------------------------------------
Model: KNN
Training Score: 0.60
Test Score: 0.02
----------------------------------------
Model: Gaussian Process
Training Score: 0.97
Test Score: -0.33
----------------------------------------


In [8]:
# Display the per-model score table returned by libs.run_ml_pipeline
# (columns: Model Name, Training Score, Test Score).
result

Unnamed: 0,Model Name,Training Score,Test Score
0,Random Forest,0.945109,0.042081
1,Gradient Boosting,0.814809,0.0508
2,Ridge Regression,0.549335,0.03293
3,Lasso Regression,0.549346,0.032903
4,ElasticNet Regression,0.531793,0.033116
5,Decision Tree,0.639415,0.020934
6,Extra Trees,0.999996,0.039435
7,KNN,0.601328,0.019875
8,Gaussian Process,0.974471,-0.330156
