In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, LabelEncoder
import warnings
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


warnings.filterwarnings("ignore")

In [4]:
df = pd.read_csv("../data/raw/CR-advertise-urls.csv")
df.head(10)

Unnamed: 0,propertyID,address,propertyType,priceAUD,priceUSD,publishedDate,lastUpdateDate,buildingSize,bedrooms,bathrooms,parkingSpaces,features,agentName,agency,agentPhone,agentEmail,description
0,310103426772,"[""Atenas"", "" Atenas"", "" Alajuela"", "" Atenas"", ...",Unit,"AUD $1,226",USD $800,Published on: 11 Nov 2025,Last updated on: 13 Nov 2025,80.00 m2,1 bedroom(s),1 bathroom(s),1.0,"[""Outdoor swimming pool"", ""Pool"", ""Remote Gara...",Cynthia Wunderink,Coldwell Banker - Pura Vida Paradise,+506 6043 5090,,Acogedor y funcional apartamento ideal para un...
1,310103425359,"[""Commercial Space for Lease \u00e2\u20ac\u201...",Other,"AUD $55,526","USD $36,225",Published on: 11 Nov 2025,Last updated on: 11 Nov 2025,"1,575.00 m2",,,,,Michelle Zapata Zapata Valerio,DÃºo CR Real Estate,+506 8486 4767,,"Commercial Space for Lease Corner Location, Hi..."
2,310103390700,"[""8"", "" Trinidad"", "" Moravia"", "" San Jose"", "" ...",Unit,"AUD $1,073",USD $700,Published on: 08 Nov 2025,Last updated on: 11 Nov 2025,50.00 m2,3 bedroom(s),1 bathroom(s),1.0,"[""Remote Garage""]",Ingrid Picado Marchena,CENTURY 21 Grano de Oro,+506 7017 8424,,Comfortable apartment for rent in El Roble Con...
3,310103369947,"[""Centro s/n"", "" Turrialba"", "" Turrialba"", "" C...",Unit,AUD $460,USD $300,Published on: 07 Nov 2025,Last updated on: 11 Nov 2025,300.00 m2,,,,,Roy Solano,CENTURY 21 La Campiña,+506 8830 4140,,"?Details: 2 bedrooms, 1 bathroom, living-dinin..."
4,310103369856,"[""La Plaza s/n"", "" Turrialba"", "" Turrialba"", ""...",House,AUD $690,USD $450,Published on: 07 Nov 2025,Last updated on: 11 Nov 2025,160.00 m2,,,,,Roy Solano,CENTURY 21 La Campiña,+506 8830 4140,,Prime central location right across from the P...
5,310103346251,"[""Apartment For Rent in Rohrmoser"", "" Rohrmose...",Apartment,"AUD $2,299","USD $1,500",Published on: 06 Nov 2025,Last updated on: 11 Nov 2025,103.00 m2,2 bedroom(s),3 bathroom(s),,"[""Indoor Furnished""]",Raquel Sandino,Vision Urbana Realty,+506 8491 9696,,This apartment with 2 bedrooms and 3 bathrooms...
6,310103328109,"[""House For Rent in Lindora"", "" Lindora"", "" Sa...",House,"AUD $6,131","USD $4,000",Published on: 05 Nov 2025,Last updated on: 11 Nov 2025,350.00 m2,5 bedroom(s),5 bathroom(s),,"[""Outdoor swimming pool"", ""Garden"", ""Pool""]",Raquel Sandino,Vision Urbana Realty,+506 8491 9696,,This house with 5 bedrooms and 5 bathrooms is ...
7,310103328105,"[""Apartment For Rent in Rohrmoser"", "" Rohrmose...",Apartment,"AUD $2,299","USD $1,500",Published on: 05 Nov 2025,Last updated on: 11 Nov 2025,103.00 m2,2 bedroom(s),3 bathroom(s),,,Raquel Sandino,Vision Urbana Realty,+506 8491 9696,,This apartment with 2 bedrooms and 3 bathrooms...
8,310103328108,"[""House For Rent in Pozos"", "" Pozos"", "" San Jo...",House,"AUD $3,985","USD $2,600",Published on: 05 Nov 2025,Last updated on: 11 Nov 2025,278.00 m2,3 bedroom(s),3 bathroom(s),,"[""Outdoor swimming pool"", ""Garden"", ""Pool""]",Raquel Sandino,Vision Urbana Realty,+506 8491 9696,,This house with 3 bedrooms and 3 bathrooms is ...
9,310103309501,"[""Loft Amueblado de 1 Habitaci\u00f3n en Aveni...",Apartment,"AUD $3,832","USD $2,500",Published on: 04 Nov 2025,Last updated on: 11 Nov 2025,143.00 m2,1 bedroom(s),2 bathroom(s),,"[""Indoor Furnished"", ""Outdoor swimming pool"", ...",Raquel Sandino,Vision Urbana Realty,+506 8491 9696,,This apartment with one bedroom and 2 bathroom...


# Features

In [5]:
import json
import pandas as pd



def parse_features(x):
    if pd.isna(x) or x == "Other":
        return ["Other Features"]
    try:
        return json.loads(x)
    except:
        return ["Other Features"]



df['features'] = df['features'].apply(parse_features)


mlb = MultiLabelBinarizer()

output = mlb.fit_transform(df['features'])
output_df = pd.DataFrame(output, columns=mlb.classes_, index=df.index)
df = pd.concat([df, output_df], axis=1)

In [6]:
vc = df['agency'].value_counts()
rare = vc[vc < 10].index
df['agency'] = df['agency'].replace(rare, 'Other')
df['agency'].value_counts()

agency
PROPIEDADES LEITON                      126
Other                                   111
Live Love Costa Rica                     84
The Agency | Costa Rica                  53
Vision Urbana Realty                     52
RE/MAX CENTRAL                           30
Ram Realtors Costa Rica - Newmark CA     24
Realty ONE Group Costa Rica              22
Overseas Realty Communities              19
CENTURY 21 El Farolito                   17
Domus Verum Real Estate                  16
Coldwell Banker - Pura Vida Paradise     15
RE/MAX OCCIDENTE                         15
DÃºo CR Real Estate                      13
Costa Rica Properties Real Estate        13
Overseas Realty                          13
CENTURY 21 Luxa                          12
Name: count, dtype: int64

In [7]:

imputer = SimpleImputer(strategy='constant', fill_value='Other')
df['propertyType'] = imputer.fit_transform(df[['propertyType']]).ravel()

encoder = LabelEncoder()
df['propertyType'] = encoder.fit_transform(df['propertyType'])

In [8]:
df['buildingSize(m2)'] = df.buildingSize.str.replace(" m2", "").str.replace(",", "").astype(float)
df.drop(columns=['buildingSize',], inplace=True)

In [9]:
df['priceAUD'] = df['priceAUD'].str.replace(",", "").str.replace("AUD $", "")
df['priceUSD'] = df['priceUSD'].str.replace(",", "").str.replace("USD $", "")
df = df[df['priceUSD'].str.isnumeric() == True]

df['priceAUD'] = df['priceAUD'].replace("Other", "0")
df['priceUSD'] = df['priceAUD'].replace("Other", "0")

df['bedrooms'] = df['priceAUD'].replace("Other", "0")
df['bathrooms'] = df['priceAUD'].replace("Other", "0")

df['bedrooms'] = df['bedrooms'].str.replace(" bedroom(s)", "").dropna()
df['bathrooms'] = df['bathrooms'].str.replace(" bathroom(s)", "").dropna()

df['priceAUD'] = pd.to_numeric(df['priceAUD'])
df['priceUSD'] = pd.to_numeric(df['priceUSD'])

In [10]:

imputer = KNNImputer(n_neighbors=3)
df[['buildingSize(m2)', 'priceAUD', 'priceUSD', 'bedrooms', 'bathrooms']] = imputer.fit_transform(df[['buildingSize(m2)', 'priceAUD', 'priceUSD', 'bedrooms', 'bathrooms']])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 644 entries, 0 to 644
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   propertyID             644 non-null    int64  
 1   address                634 non-null    object 
 2   propertyType           644 non-null    int64  
 3   priceAUD               644 non-null    float64
 4   priceUSD               644 non-null    float64
 5   publishedDate          634 non-null    object 
 6   lastUpdateDate         634 non-null    object 
 7   bedrooms               644 non-null    float64
 8   bathrooms              644 non-null    float64
 9   parkingSpaces          70 non-null     float64
 10  features               644 non-null    object 
 11  agentName              629 non-null    object 
 12  agency                 634 non-null    object 
 13  agentPhone             634 non-null    object 
 14  agentEmail             0 non-null      float64
 15  description

In [12]:
df.drop(columns=['propertyID', 'parkingSpaces', 'agentEmail', 'agentPhone', 'description', 'agentName', 'address', 'publishedDate', 'lastUpdateDate', 'agentPhone', 'features'], inplace=True)

In [13]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['agency'] = encoder.fit_transform(df['agency'])

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 644 entries, 0 to 644
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   propertyType           644 non-null    int64  
 1   priceAUD               644 non-null    float64
 2   priceUSD               644 non-null    float64
 3   bedrooms               644 non-null    float64
 4   bathrooms              644 non-null    float64
 5   agency                 644 non-null    int64  
 6   AirConditioning        644 non-null    int64  
 7   Balcony                644 non-null    int64  
 8   Barbecue area          644 non-null    int64  
 9   Garden                 644 non-null    int64  
 10  Heating                644 non-null    int64  
 11  Indoor Furnished       644 non-null    int64  
 12  Other Features         644 non-null    int64  
 13  Outdoor swimming pool  644 non-null    int64  
 14  Pool                   644 non-null    int64  
 15  Remote Gara

In [15]:
X = df.drop(columns=['priceAUD', 'priceUSD'])
y = df[['priceAUD', 'priceUSD']]

scaler = StandardScaler()
scaled = scaler.fit_transform(X)

x_train, x_test, y_train, y_test = train_test_split(scaled, y, test_size=0.2, random_state=123)

In [16]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [17]:
y_pred = model.predict(x_test)


In [18]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


print('R2 Score: ', r2_score(y_pred=y_pred, y_true=y_test))
print("Mean Absolute ERROR: ", mean_absolute_error(y_pred=y_pred, y_true=y_test))
print("Mean Squared ERROR: ", mean_squared_error(y_pred=y_pred, y_true=y_test))

R2 Score:  1.0
Mean Absolute ERROR:  4.1963662519011384e-11
Mean Squared ERROR:  3.330421513722804e-21


In [19]:
from sklearn.svm import SVR


model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('R2 Score: ', r2_score(y_pred=y_pred, y_true=y_test))
print("Mean Absolute ERROR: ", mean_absolute_error(y_pred=y_pred, y_true=y_test))
print("Mean Squared ERROR: ", mean_squared_error(y_pred=y_pred, y_true=y_test))


R2 Score:  1.0
Mean Absolute ERROR:  4.1963662519011384e-11
Mean Squared ERROR:  3.330421513722804e-21
