In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

In [3]:
path = '../../dataset/houses_to_rent_v2.zip'
house_rent_df = pd.read_csv(path)

In [4]:
# original column name is suck, let's rename it
house_rent_df = house_rent_df.rename(columns={
    'parking spaces': 'parking_spaces',
    'hoa (R$)': 'homeowners_association',
    'rent amount (R$)': 'rent_amount',
    'property tax (R$)': 'property_tax',
    'fire insurance (R$)': 'fire_insurance',
    'total (R$)': 'total_cost'
})

In [5]:
house_rent_df.head()

Unnamed: 0,city,area,rooms,bathroom,parking_spaces,floor,animal,furniture,homeowners_association,rent_amount,property_tax,fire_insurance,total_cost
0,São Paulo,70,2,1,1,7,acept,furnished,2065,3300,211,42,5618
1,São Paulo,320,4,4,0,20,acept,not furnished,1200,4960,1750,63,7973
2,Porto Alegre,80,1,1,1,6,acept,not furnished,1000,2800,0,41,3841
3,Porto Alegre,51,2,1,0,2,acept,not furnished,270,1112,22,17,1421
4,São Paulo,25,1,1,0,1,not acept,not furnished,0,800,25,11,836


In [6]:
# transform string into Decoded Columns
one_hot_columns = ['city', 'animal', 'furniture']

encoder = OneHotEncoder(sparse_output=False)   # make sure we get a NumPy array

# Fit and transform the categorical columns
encoded_array = encoder.fit_transform(house_rent_df[one_hot_columns])

# Get the new column names
encoded_cols = encoder.get_feature_names_out(one_hot_columns)

# Now shapes will match: encoded_array.shape[1] == len(encoded_cols)
encoded_df = pd.DataFrame(encoded_array, columns=encoded_cols, index=house_rent_df.index)

# Merge back into original df
house_rent_df = pd.concat([house_rent_df.drop(columns=one_hot_columns), encoded_df], axis=1)

In [7]:
house_rent_df['floor'] = house_rent_df['floor'].replace({'-': 0 }).astype(int) # remove - symbol

In [8]:
house_rent_df.head(10)

Unnamed: 0,area,rooms,bathroom,parking_spaces,floor,homeowners_association,rent_amount,property_tax,fire_insurance,total_cost,city_Belo Horizonte,city_Campinas,city_Porto Alegre,city_Rio de Janeiro,city_São Paulo,animal_acept,animal_not acept,furniture_furnished,furniture_not furnished
0,70,2,1,1,7,2065,3300,211,42,5618,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,320,4,4,0,20,1200,4960,1750,63,7973,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2,80,1,1,1,6,1000,2800,0,41,3841,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,51,2,1,0,2,270,1112,22,17,1421,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,25,1,1,0,1,0,800,25,11,836,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
5,376,3,3,7,0,0,8000,834,121,8955,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
6,72,2,1,0,7,740,1900,85,25,2750,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
7,213,4,4,4,4,2254,3223,1735,41,7253,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
8,152,2,2,1,3,1000,15000,250,191,16440,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
9,35,1,1,0,2,590,2300,35,30,2955,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    house_rent_df.drop(columns=['total_cost']),
    house_rent_df['total_cost'],
    test_size=0.2,
    random_state=42)

In [10]:
model = LinearRegression()

model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [11]:
y_train_pred = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_train

np.float64(6.81268260894009)

In [12]:
y_test_pred = model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
rmse_test

np.float64(1.0475802326467942)

In [19]:
score = cross_val_score(model, X_test, y_test, scoring='neg_root_mean_squared_error', cv=5)
score = -score.mean()
score

np.float64(1.061746216168309)

In [21]:
score = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
score = -score.mean()
score

np.float64(5.139738664568823)

In [15]:
# let's predict one row and compare with real value
some_data = X_test.iloc[:1]
some_label = y_test.iloc[:1]

print("Predicted:", model.predict(some_data))
print("Label:", list(some_label))

Predicted: [3708.83412627]
Label: [3709]


In [16]:
# let's predict all test set and compare with real values and compare error % in separate column
y_pred = model.predict(X_test)
results_df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test})
results_df['Error'] = results_df['Predicted'] - results_df['Actual']
results_df['Error %'] = (results_df['Error'] / results_df['Actual']).abs() * 100
results_df

Unnamed: 0,Predicted,Actual,Error,Error %
1964,3708.834126,3709,-0.165874,0.004472
5267,2634.739430,2635,-0.260570,0.009889
9630,2919.978901,2920,-0.021099,0.000723
5503,1730.372032,1730,0.372032,0.021505
1683,22025.905817,22030,-4.094183,0.018585
...,...,...,...,...
1103,4862.192659,4862,0.192659,0.003963
10165,1907.963701,1908,-0.036299,0.001902
4845,1585.802458,1586,-0.197542,0.012455
5603,3801.942161,3802,-0.057839,0.001521


In [17]:
results_df.to_csv('house_rent_predictions.csv', index=False)