In [None]:
# TODO: check the book and provide a list of improvements

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

In [None]:
# Load the rental listings; pandas infers the compression from the .zip extension
path = '../../dataset/houses_to_rent_v2.zip'
house_rent_df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# The raw headers contain spaces and "(R$)" suffixes; map them to snake_case names
column_renames = {
    'parking spaces': 'parking_spaces',
    'hoa (R$)': 'homeowners_association',
    'rent amount (R$)': 'rent_amount',
    'property tax (R$)': 'property_tax',
    'fire insurance (R$)': 'fire_insurance',
    'total (R$)': 'total_cost',
}
house_rent_df = house_rent_df.rename(columns=column_renames)

In [None]:
# Preview the first rows of the frame (head() shows five rows by default)
house_rent_df.head()

In [None]:
# One-hot encode the categorical string columns
one_hot_columns = ['city', 'animal', 'furniture']

# sparse_output=False makes the encoder return a dense NumPy array
encoder = OneHotEncoder(sparse_output=False)

# Learn the categories and produce one indicator column per (column, category) pair
categorical_part = house_rent_df[one_hot_columns]
encoded_array = encoder.fit_transform(categorical_part)

# Names of the generated indicator columns, in the same order as the array columns
encoded_cols = encoder.get_feature_names_out(one_hot_columns)

# Reuse the original index so the encoded rows line up with the source rows
encoded_df = pd.DataFrame(
    data=encoded_array,
    index=house_rent_df.index,
    columns=encoded_cols,
)

# Swap the raw categorical columns for their encoded counterparts
remaining_part = house_rent_df.drop(columns=one_hot_columns)
house_rent_df = pd.concat([remaining_part, encoded_df], axis=1)

In [None]:
# Replace the '-' placeholder with 0, then cast the whole column to int
floor_as_int = house_rent_df['floor'].replace({'-': 0}).astype(int)
house_rent_df['floor'] = floor_as_int

# Keep only rows with floor <= 51; anything higher is treated as an outlier
house_rent_df = house_rent_df.loc[house_rent_df['floor'] <= 51]

In [None]:
# Summary statistics per column, used to spot suspicious minimum/maximum values
house_rent_df.describe()

In [None]:
# area - outliers detected -> remove all rows > 2000
# rooms - max of 13 rooms checked - OK
# bathroom - max of 10 bathrooms checked - OK
# parking spaces - max value of 12 checked - OK
# floor - 51 is OK!
# HOA - what is this? Homeowners Association; rename the column -> remove all rows > 15000
# rent amount - min 450 / max 4500 -> needs scaling!
# property tax - 313700 seems to be an outlier! -> remove all rows > 12500 -> scale!
# fire insurance - 677 seems to be an outlier! -> scale, or use IQR to detect outliers? Scale!
# total - check the max value?

In [None]:
# Drop rows whose values are too large; a row must pass every limit to be kept
within_limits = (
    (house_rent_df['area'] <= 2000)                         # area: keep <= 2000
    & (house_rent_df['homeowners_association'] <= 15000)    # HOA fee: keep <= 15000
    & (house_rent_df['property_tax'] <= 12500)              # property tax: keep <= 12500
    & (house_rent_df['total_cost'] < 54430)                 # total: strict, 54430 itself is removed
)
house_rent_df = house_rent_df[within_limits]

In [None]:
# Feature engineering

# Work on an explicit copy: at this point the frame is the result of boolean
# filtering, and adding columns to such a slice can raise SettingWithCopyWarning.
house_rent_df = house_rent_df.copy()

# is_house: True when floor is 0
house_rent_df['is_house'] = house_rent_df['floor'] == 0

# no_hoa_fee: True when the homeowners association fee is 0
house_rent_df['no_hoa_fee'] = house_rent_df['homeowners_association'] == 0

# no_property_tax: True when the property tax is 0.
# The flag goes into its own column; writing it back into 'property_tax' would
# replace the amounts with booleans and corrupt property_tax_per_area below.
house_rent_df['no_property_tax'] = house_rent_df['property_tax'] == 0

# total_category: total_cost cut into 5 equal-width bins labelled 0..4;
# temporary column used only to stratify the train/test split
house_rent_df['total_category'] = pd.cut(house_rent_df['total_cost'], bins=5, labels=range(5))

# "<something>_per_area" ratios: new column name -> source column divided by area
per_area_sources = {
    'rooms_per_area': 'rooms',
    'bathrooms_per_area': 'bathroom',
    'parking_spaces_per_area': 'parking_spaces',
    'property_tax_per_area': 'property_tax',
    'rent_amount_per_area': 'rent_amount',
    'fire_insurance_per_area': 'fire_insurance',
    'hoa_per_area': 'homeowners_association',
}
for feature_name, source_col in per_area_sources.items():
    house_rent_df[feature_name] = house_rent_df[source_col] / house_rent_df['area']

# bathrooms_per_rooms
# NOTE(review): yields inf/NaN if a row has rooms == 0 - confirm the data never has that
house_rent_df['bathrooms_per_rooms'] = house_rent_df['bathroom'] / house_rent_df['rooms']

# floor_priority: floors 0-3 -> 0, 4-10 -> 1, 11-20 -> 2, above 20 -> 3
# (intervals are right-inclusive, so the -1 lower edge keeps floor 0 in the first bin)
house_rent_df['floor_priority'] = pd.cut(house_rent_df['floor'], bins=[-1, 3, 10, 20, np.inf], labels=[0, 1, 2, 3])

In [None]:
# Number of rows in each total_category bin
house_rent_df['total_category'].value_counts()

In [None]:
# Show the first 10 rows of the frame
house_rent_df.head(10)

In [None]:
# Share of all rows that falls into each total_category bin
house_rent_df['total_category'].value_counts().div(len(house_rent_df))

In [None]:
# Pairwise correlations, then every column's correlation with total_cost, strongest first
corr_matrix = house_rent_df.corr()
total_cost_corr = corr_matrix['total_cost']
total_cost_corr.sort_values(ascending=False)

In [None]:
# Stratified 80/20 split on total_category, seeded for reproducibility
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of positional indices
train_index, test_index = next(split.split(house_rent_df, house_rent_df['total_category']))
strat_train_set = house_rent_df.iloc[train_index]
strat_test_set = house_rent_df.iloc[test_index]

In [None]:
# Check the stratification: share of each total_category bin in the test set
strat_test_set['total_category'].value_counts().div(len(strat_test_set))

In [None]:
# Share of each total_category bin in the training set, for comparison with the test set
strat_train_set['total_category'].value_counts().div(len(strat_train_set))

In [None]:
# Separate features from the target; total_category was only a stratification helper
# NOTE(review): fee columns such as rent_amount, homeowners_association and
# fire_insurance stay in X - if total_cost is their sum this leaks the target; confirm.
target_col = 'total_cost'
non_feature_cols = [target_col, 'total_category']

X_train = strat_train_set.drop(columns=non_feature_cols)
y_train = strat_train_set[target_col]

X_test = strat_test_set.drop(columns=non_feature_cols)
y_test = strat_test_set[target_col]

In [None]:
# Fit a linear regression on the training features (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# RMSE on the training set: square root of the mean squared error
y_train_pred = model.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
rmse_train

In [None]:
# RMSE on the held-out test set: square root of the mean squared error
y_test_pred = model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
rmse_test

In [None]:
# 5-fold cross-validation on the training set; the scorer returns negated RMSE,
# so flip the sign to get positive RMSE values per fold
neg_rmse_per_fold = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
score = -neg_rmse_per_fold
score

In [None]:
# Average RMSE across the cross-validation folds
mean_score = np.mean(score)
mean_score

In [None]:
# Sanity check: predict the first test row and print it next to its true value
some_data = X_test.head(1)
some_label = y_test.head(1)

single_prediction = model.predict(some_data)
print("Predicted:", single_prediction)
print("Label:", list(some_label))

In [None]:
# Predict the whole test set and tabulate predictions against the true values
y_pred = model.predict(X_test)
results_df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test})

# Signed error, and its absolute size as a percentage of the actual value
signed_error = results_df['Predicted'] - results_df['Actual']
results_df['Error'] = signed_error
results_df['Error %'] = signed_error.div(results_df['Actual']).abs().mul(100)
results_df

In [None]:
# Save the comparison table to CSV without the index column
output_path = 'house_rent_predictions.csv'
results_df.to_csv(output_path, index=False)