## Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

## Import the data

In [3]:
houses = pd.read_csv('houses_to_rent.csv')

In [4]:
houses.head()

Unnamed: 0.1,Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,0,1,240,3,3,4,-,acept,furnished,R$0,"R$8,000","R$1,000",R$121,"R$9,121"
1,1,0,64,2,1,1,10,acept,not furnished,R$540,R$820,R$122,R$11,"R$1,493"
2,2,1,443,5,5,4,3,acept,furnished,"R$4,172","R$7,000","R$1,417",R$89,"R$12,680"
3,3,1,73,2,2,1,12,acept,not furnished,R$700,"R$1,250",R$150,R$16,"R$2,116"
4,4,1,19,1,1,0,-,not acept,not furnished,R$0,"R$1,200",R$41,R$16,"R$1,257"


## Clean the data

In [5]:
# replace the R$'s
import re
replace_col = ['total','hoa','rent amount','property tax', 'fire insurance']

#clean chars and convers to int
for replaces in replace_col:
  houses[replaces].replace(to_replace="[^0-9]", value="", regex=True, inplace=True)
  houses[replaces] = pd.to_numeric(houses[replaces])

In [6]:
#convert the "-" from floor to zero
houses['floor'].replace(to_replace="[^0-9]", value=0, regex=True, inplace=True)
houses['floor'] = houses['floor'].astype(int)

In [7]:
#drop the first column
houses.drop('Unnamed: 0',axis = 1,inplace = True)

In [8]:
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6080 entries, 0 to 6079
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   city            6080 non-null   int64  
 1   area            6080 non-null   int64  
 2   rooms           6080 non-null   int64  
 3   bathroom        6080 non-null   int64  
 4   parking spaces  6080 non-null   int64  
 5   floor           6080 non-null   int64  
 6   animal          6080 non-null   object 
 7   furniture       6080 non-null   object 
 8   hoa             5878 non-null   float64
 9   rent amount     6080 non-null   int64  
 10  property tax    6053 non-null   float64
 11  fire insurance  6080 non-null   int64  
 12  total           6080 non-null   int64  
dtypes: float64(2), int64(9), object(2)
memory usage: 617.6+ KB


# Missing Values

In [10]:
houses.isnull().mean() * 100

city              0.000000
area              0.000000
rooms             0.000000
bathroom          0.000000
parking spaces    0.000000
floor             0.000000
animal            0.000000
furniture         0.000000
hoa               3.322368
rent amount       0.000000
property tax      0.444079
fire insurance    0.000000
total             0.000000
dtype: float64

## Get Dummies

In [11]:
houses = pd.get_dummies(houses, drop_first=True)

### Multiple Imputations by Chained Equations(MICE)

In [12]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

MICE_imputer = IterativeImputer()
houses.iloc[:, :] = MICE_imputer.fit_transform(houses)

# Clean the data II

In [13]:
## remove duplicate rows
houses = houses.drop_duplicates(keep=False)

In [14]:
## since the area is skewed, let's break the area into groups
houses['area_groups'] =  pd.cut(houses.area, [5,30,60,100,200,400,25000], include_lowest=True)


In [15]:
## since the hoa is skewed, let's break the area into groups
houses['hoa'] = np.where(houses['hoa'] < 0, 0, houses['hoa'])
houses['hoa_groups'] = pd.cut(houses.hoa, [-100,250,700,1200,1500,3500,4000,220000], include_lowest=True)

In [16]:
## since the property tax is skewed, let's break the area into groups
houses['property tax'] = np.where(houses['property tax'] < 0, 0, houses['property tax'])
houses['prop_tax_groups'] = pd.cut(houses['property tax'], [-100,35,147,459,1467,366300], include_lowest=True)

## Scaling

In [17]:
# get dummies again
houses = pd.get_dummies(houses, drop_first=True)

X = houses.drop('total', axis=1).values
y = houses['total'].values

In [18]:
from sklearn.preprocessing import StandardScaler

st_scale = StandardScaler()
X = st_scale.fit_transform(X)

## Train - Test sep


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = .33, random_state=0)

## Import & Define the Models


In [52]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train,y_train)

LinearRegression()

In [54]:
pred = lr.predict(X_test)
pred

array([2476.22128626, 2143.06825994, 3620.28633418, ..., 7845.30232713,
       9149.37911136, 3775.56483998])

## Predict with Pycaret

In [46]:
from pycaret.regression import *
exp_reg102 = setup(data = houses, target = 'total', session_id=123,
                  normalize = True, transformation = True, transform_target = True, 
                  combine_rare_levels = True, rare_level_threshold = 0.05,
                  remove_multicollinearity = True, multicollinearity_threshold = 0.95) 
                  


Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Transform Target,True
2,Transform Target Method,box-cox
3,Original Data,"(5755, 28)"
4,Missing Values,False
5,Numeric Features,24
6,Categorical Features,3
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [47]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,315.5464,43625220.3048,3316.4994,0.8204,0.0625,0.0266
1,Extra Trees Regressor,265.4558,44846950.0343,3316.2773,0.8162,0.0593,0.0161
2,Random Forest,277.2476,44903673.4752,3328.554,0.8162,0.0636,0.0182
3,CatBoost Regressor,259.2803,44088232.1811,3430.6596,0.8121,0.0556,0.0145
4,Support Vector Machine,436.5553,44886515.1856,3446.1253,0.8119,0.0812,0.0432
5,Light Gradient Boosting Machine,308.8527,44392861.8469,3453.0368,0.8112,0.0643,0.0199
6,Extreme Gradient Boosting,329.3873,43619821.887,3435.1727,0.8094,0.0662,0.0283
7,Huber Regressor,587.3521,44897998.9387,3574.1021,0.8057,0.1278,0.076
8,Random Sample Consensus,660.7238,45042967.3238,3666.4528,0.7984,0.1261,0.0799
9,TheilSen Regressor,679.4514,45086964.9743,3697.4173,0.796,0.1274,0.08


In [49]:
tuned_extra_tree = tune_model('et')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,212.8554,3107122.0,1762.7031,0.8923,0.0582,0.0187
1,94.2556,37303.15,193.1402,0.998,0.0207,0.0147
2,672.3558,114986300.0,10723.1653,0.2413,0.1325,0.0191
3,102.7386,49976.26,223.5537,0.9976,0.0253,0.0155
4,129.8522,121915.9,349.1645,0.9947,0.0287,0.0167
5,126.8153,173267.3,416.2539,0.9926,0.0297,0.0156
6,1022.8964,330190100.0,18171.1327,0.0687,0.195,0.0195
7,125.1912,266519.3,516.2551,0.9884,0.0458,0.0174
8,132.1677,254741.2,504.7189,0.9896,0.0298,0.0155
9,104.2024,78240.7,279.7154,0.9961,0.0324,0.017


In [50]:
plot_model(tuned_extra_tree, plot = 'parameter')


Unnamed: 0,Parameters
bootstrap,False
ccp_alpha,0
criterion,mse
max_depth,70
max_features,auto
max_leaf_nodes,
max_samples,
min_impurity_decrease,0
min_impurity_split,
min_samples_leaf,2
