# Multiple Regression Model For Rent Price



## Data Preprocessing


Read Data

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

df = pd.read_csv("data/condos.csv")
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4575 entries, 0 to 4574
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Title        4541 non-null   object 
 1   Baths        4312 non-null   float64
 2   Floor        934 non-null    float64
 3   Usable area  3816 non-null   float64
 4   Beds         4367 non-null   float64
 5   Rent Price   4541 non-null   float64
 6   Location     4541 non-null   object 
 7   Url          4575 non-null   object 
dtypes: float64(5), object(3)
memory usage: 286.1+ KB
None
                                               Title  Baths  Floor  \
0  The Montane Penthouse Unit Modern Japanese Sty...    2.0   33.0   
1  Pacific Plaza Towers North – Fully-Furnished 3...    3.0   17.0   
2  2 Bedroom For Lease in West Gallery Place, Tag...    2.0   18.0   
3                  For Lease: East Gallery Place BGC    2.0    9.0   
4  LEASE TO OWN Condo For Sale The Trion Towers R...    1.0   50.

Handle Missing Data

Baths = normal shaped (mean/median)


In [3]:
df['Baths'].describe()

count    4312.000000
mean        1.948980
std         4.653994
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max       297.000000
Name: Baths, dtype: float64

Floor = right-skewed (median)


In [4]:
df['Floor'].describe()

count     934.000000
mean       31.205567
std       127.725492
min         1.000000
25%        12.000000
50%        22.000000
75%        35.000000
max      3708.000000
Name: Floor, dtype: float64

Usable area = normal shaped (mean)

In [5]:
df['Usable area'].describe()

count    3816.000000
mean      111.145807
std        73.037839
min         1.000000
25%        59.000000
50%       100.000000
75%       134.000000
max       700.000000
Name: Usable area, dtype: float64

Show total null

In [6]:
df.isnull().sum()

Title            34
Baths           263
Floor          3641
Usable area     759
Beds            208
Rent Price       34
Location         34
Url               0
dtype: int64

Remove/Replace null

In [7]:
df.dropna(subset=['Rent Price'], inplace=True)
df['Baths'] = df['Baths'].fillna(df['Baths'].mean())
df['Floor'] = df['Floor'].fillna(df['Floor'].median())
df['Usable area'] = df['Usable area'].fillna(df['Usable area'].mean())

In [8]:
df.isnull().sum()

Title            0
Baths            0
Floor            0
Usable area      0
Beds           174
Rent Price       0
Location         0
Url              0
dtype: int64

In [9]:
df = df.drop_duplicates()

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3509 entries, 0 to 4560
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Title        3509 non-null   object 
 1   Baths        3509 non-null   float64
 2   Floor        3509 non-null   float64
 3   Usable area  3509 non-null   float64
 4   Beds         3371 non-null   float64
 5   Rent Price   3509 non-null   float64
 6   Location     3509 non-null   object 
 7   Url          3509 non-null   object 
dtypes: float64(5), object(3)
memory usage: 246.7+ KB


In [11]:
df['Beds'] = df['Beds'].fillna(0)

In [12]:
df['Beds'] = df['Beds'].astype(int)
df['Baths'] = df['Baths'].astype(int)
df['Floor'] = df['Floor'].astype(int)

In [13]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 3509 entries, 0 to 4560
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Title        3509 non-null   object 
 1   Baths        3509 non-null   int64  
 2   Floor        3509 non-null   int64  
 3   Usable area  3509 non-null   float64
 4   Beds         3509 non-null   int64  
 5   Rent Price   3509 non-null   float64
 6   Location     3509 non-null   object 
 7   Url          3509 non-null   object 
dtypes: float64(2), int64(3), object(3)
memory usage: 246.7+ KB


Unnamed: 0,Title,Baths,Floor,Usable area,Beds,Rent Price,Location,Url
0,The Montane Penthouse Unit Modern Japanese Sty...,2,33,79.0,2,95000.0,"Metro Manila, Taguig, BGC",https://www.dotproperty.com.ph/ads/2-bedroom-c...
1,Pacific Plaza Towers North – Fully-Furnished 3...,3,17,300.0,3,280000.0,"Metro Manila, Taguig, BGC",https://www.dotproperty.com.ph/ads/3-bedroom-c...
2,"2 Bedroom For Lease in West Gallery Place, Tag...",2,18,136.0,2,240000.0,"Metro Manila, Taguig, BGC",https://www.dotproperty.com.ph/ads/2-bedroom-c...
3,For Lease: East Gallery Place BGC,2,9,137.0,2,195000.0,"Metro Manila, Taguig, BGC",https://www.dotproperty.com.ph/ads/2-bedroom-c...
4,LEASE TO OWN Condo For Sale The Trion Towers R...,1,50,42.34,1,100000.0,"Metro Manila, Taguig, BGC",https://www.dotproperty.com.ph/ads/1-bedroom-c...


Clean the data

In [14]:
df_clean = df.drop(columns=['Title', 'Location', 'Url', 'Floor'])

In [15]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3509 entries, 0 to 4560
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Baths        3509 non-null   int64  
 1   Usable area  3509 non-null   float64
 2   Beds         3509 non-null   int64  
 3   Rent Price   3509 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 137.1 KB


Split data into 2

In [16]:
X = df_clean.drop(columns=['Rent Price'])

y = df_clean['Rent Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Model Training

In [17]:
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [18]:
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred) * 100
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("MSE:", mse)
print("R²:", r2)
print("MAPE:", mape, "%")

MAE: 31581.627580998607
MSE: 2344705936.6010013
R²: 0.5873078061174517
MAPE: 40.70529558102486 %
