In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score

In [71]:
data = pd.read_csv('/content/clean_data-kaggle.csv')

In [72]:
data.head()

Unnamed: 0,Location,Price (RM),Rooms,Bathrooms,Car Parks,Property Type,Size (sqft),Furnishing
0,KLCC,1250000,2,3.0,2.0,Serviced Residence,1335.0,Fully Furnished
1,Dutamas,1030000,3,4.0,2.0,Condominium,1875.0,Partly Furnished
2,Bukit Jalil,900000,4,3.0,2.0,Condominium,1513.0,Partly Furnished
3,Taman Tun Dr Ismail,5350000,4,5.0,4.0,Bungalow,7200.0,Partly Furnished
4,Taman Tun Dr Ismail,2600000,5,4.0,4.0,Semi-detached House,3600.0,Partly Furnished


In [73]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29032 entries, 0 to 29031
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       29031 non-null  object 
 1   Price (RM)     29032 non-null  int64  
 2   Rooms          29032 non-null  object 
 3   Bathrooms      29032 non-null  float64
 4   Car Parks      29032 non-null  float64
 5   Property Type  29032 non-null  object 
 6   Size (sqft)    28996 non-null  float64
 7   Furnishing     29032 non-null  object 
dtypes: float64(3), int64(1), object(4)
memory usage: 1.8+ MB


In [74]:
data['Rooms'] = data['Rooms'].replace({'studio': 0, '20 Above': 20, 'Studio':0}).astype(int)
data.dropna()

Unnamed: 0,Location,Price (RM),Rooms,Bathrooms,Car Parks,Property Type,Size (sqft),Furnishing
0,KLCC,1250000,2,3.0,2.0,Serviced Residence,1335.0,Fully Furnished
1,Dutamas,1030000,3,4.0,2.0,Condominium,1875.0,Partly Furnished
2,Bukit Jalil,900000,4,3.0,2.0,Condominium,1513.0,Partly Furnished
3,Taman Tun Dr Ismail,5350000,4,5.0,4.0,Bungalow,7200.0,Partly Furnished
4,Taman Tun Dr Ismail,2600000,5,4.0,4.0,Semi-detached House,3600.0,Partly Furnished
...,...,...,...,...,...,...,...,...
29027,Seputeh,750000,3,2.0,1.0,Condominium,915.0,Partly Furnished
29028,KL Sentral,1400000,3,3.0,2.0,Condominium,1544.0,Fully Furnished
29029,KL Eco City,880000,1,1.0,1.0,Condominium,650.0,Partly Furnished
29030,Sri Hartamas,2700000,4,6.0,3.0,Condominium,3973.0,Partly Furnished


In [75]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29032 entries, 0 to 29031
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       29031 non-null  object 
 1   Price (RM)     29032 non-null  int64  
 2   Rooms          29032 non-null  int64  
 3   Bathrooms      29032 non-null  float64
 4   Car Parks      29032 non-null  float64
 5   Property Type  29032 non-null  object 
 6   Size (sqft)    28996 non-null  float64
 7   Furnishing     29032 non-null  object 
dtypes: float64(3), int64(2), object(3)
memory usage: 1.8+ MB


In [76]:
label_encoders = {}
for column in ['Location', 'Property Type', 'Furnishing']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

In [77]:
data.head()

Unnamed: 0,Location,Price (RM),Rooms,Bathrooms,Car Parks,Property Type,Size (sqft),Furnishing
0,40,1250000,2,3.0,2.0,16,1335.0,0
1,26,1030000,3,4.0,2.0,12,1875.0,1
2,11,900000,4,3.0,2.0,12,1513.0,1
3,73,5350000,4,5.0,4.0,9,7200.0,1
4,73,2600000,5,4.0,4.0,15,3600.0,1


In [78]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29032 entries, 0 to 29031
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       29032 non-null  int64  
 1   Price (RM)     29032 non-null  int64  
 2   Rooms          29032 non-null  int64  
 3   Bathrooms      29032 non-null  float64
 4   Car Parks      29032 non-null  float64
 5   Property Type  29032 non-null  int64  
 6   Size (sqft)    28996 non-null  float64
 7   Furnishing     29032 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 1.8 MB


In [79]:
print(data.isnull().sum())

Location          0
Price (RM)        0
Rooms             0
Bathrooms         0
Car Parks         0
Property Type     0
Size (sqft)      36
Furnishing        0
dtype: int64


In [80]:
# Fill missing 'Size (sqft)' values with the median value
median_size = data['Size (sqft)'].median()
data['Size (sqft)'].fillna(median_size, inplace=True)

In [81]:
print(data.isnull().sum())

Location         0
Price (RM)       0
Rooms            0
Bathrooms        0
Car Parks        0
Property Type    0
Size (sqft)      0
Furnishing       0
dtype: int64


In [82]:
X = data.drop(columns=['Price (RM)'])
y = data['Price (RM)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [83]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [84]:
#Model Training
model = LinearRegression()
model.fit(X_train, y_train)

In [85]:
#Evaluate Model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [86]:
#Output
print(f"MAE: {mae}")
print(f"R²: {r2}")

MAE: 821070.516482289
R²: 0.4503039888172008
