## Step 1: Data cleaning

In [16]:
import pandas as pd
import numpy as np

# Load the raw property data from disk.
df = pd.read_csv('../data/property_data.csv')

# Clean in one chained, re-runnable pipeline (each step returns a new frame).
df = (
    df
    # Remove exact duplicate rows.
    .drop_duplicates()
    # Drop the 'Raw num:' and 'ID number' columns
    # (presumably row identifiers rather than features -- TODO confirm).
    .drop(columns=['Raw num:', 'ID number'])
    # Keep only numeric columns.
    .select_dtypes(include=[np.number])
    # Discard rows with no price: filling the regression target with 0 would
    # create fake "0 euro" listings in both the training and the test data.
    .dropna(subset=['Price of property in euro'])
)

# Replace the remaining missing values (feature columns only) with 0.
df = df.fillna(0)

df.head()

Unnamed: 0,Zip code,Kitchen,Price of property in euro,Number of bedrooms,Living area,Terrace,Terrace area,Garden,Garden area,Surface of the land(or plot of land),Number of facades,Swimming pool
0,1170.0,1.0,749000.0,5.0,205.0,1.0,14.0,1.0,152.0,220.0,2.0,0.0
1,1933.0,1.0,755000.0,5.0,241.0,1.0,33.0,1.0,300.0,500.0,4.0,0.0
2,1300.0,1.0,435000.0,4.0,185.0,1.0,30.0,1.0,526.0,748.0,4.0,0.0
3,1420.0,1.0,320000.0,3.0,88.0,1.0,20.0,1.0,250.0,405.0,3.0,0.0
4,4420.0,1.0,319000.0,3.0,194.0,1.0,34.0,1.0,82.0,197.0,2.0,0.0


## Step 2: Data formatting (feature/target separation and train/test split)

In [19]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictor columns.
target_name = 'Price of property in euro'
y = df[target_name]                 # Target
X = df.drop(columns=[target_name])  # Features

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Step 3: Model selection

### Linear Regression Model

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Build the linear regression model and fit it on the training split
# (fit() returns the estimator, so construction and training can be chained).
linear_reg = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred = linear_reg.predict(X_test)

# Score the predictions with mean squared error and report it.
mse = mean_squared_error(y_test, y_pred)
print(f"Linear Regression MSE: {mse}")


Linear Regression MSE: 160943362629.21606


### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Build the decision tree regressor (seeded via random_state) and fit it
# on the training split; fit() returns the estimator, so the calls chain.
tree_reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict the target for the held-out rows.
# Note: y_pred and mse are reassigned here, replacing any earlier values.
y_pred = tree_reg.predict(X_test)

# Score the predictions with mean squared error and report it.
mse = mean_squared_error(y_test, y_pred)
print(f"Decision Tree MSE: {mse}")
