In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from lls import LinearLeastSquare

In [2]:
data = pd.read_csv("HousePrice.csv")

In [3]:
# Clean data
data = data[data['Area'].apply(lambda x: str(x).isnumeric())]  # Keep rows where 'Area' is numeric
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

In [4]:
# Update dollar price to July 2023 exchange rate
data['Price'] = pd.to_numeric(data['Price'], errors='coerce')  # Convert 'Price' to numeric
data['Price(USD)'] = data['Price'] / 480000


In [5]:
# Display the 5 most expensive houses' addresses and prices
data_expensive = data.sort_values(by='Price', ascending=False)
print("5 Most Expensive Houses:")
print(data_expensive[['Address', 'Price']].head())


5 Most Expensive Houses:
         Address         Price
1606  Zaferanieh  9.240000e+10
1704      Abazar  9.100000e+10
405      Lavasan  8.500000e+10
770   Ekhtiarieh  8.160000e+10
1249    Niavaran  8.050000e+10


In [6]:
# Prepare features for training
data['Parking'] = data['Parking'].replace([True, False], [1, 0])
data['Warehouse'] = data['Warehouse'].replace([True, False], [1, 0])
data['Elevator'] = data['Elevator'].replace([True, False], [1, 0])


In [7]:
# Define X and Y
X = data[['Area', 'Room', 'Parking', 'Warehouse', 'Elevator']].to_numpy().astype(int)
Y = data[['Price']].to_numpy().reshape(-1, 1)  # Ensure Y is a column vector


In [8]:
# Check data after processing
print(data.head())

  Area  Room  Parking  Warehouse  Elevator         Address         Price  \
0   63     1        1          1         1         Shahran  1.850000e+09   
1   60     1        1          1         1         Shahran  1.850000e+09   
2   79     2        1          1         1          Pardis  5.500000e+08   
3   95     2        1          1         1   Shahrake Qods  9.025000e+08   
4  123     2        1          1         1  Shahrake Gharb  7.000000e+09   

     Price(USD)  
0   3854.166667  
1   3854.166667  
2   1145.833333  
3   1880.208333  
4  14583.333333  


In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression

# Step 1: Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42, shuffle=True)

In [10]:
# Step 2: Fit the LLS model on the training dataset
lls = LinearLeastSquare()
lls.fit(X_train, Y_train)

array([[ 8.12538300e+07],
       [-1.57270010e+08],
       [-3.45057867e+08],
       [-2.52829904e+09],
       [-3.52786972e+07]])

In [11]:
# Step 3: Evaluate the model on the test dataset using MAE, MSE, and RMSE
Y_pred_lls = lls.predict(X_test)

In [12]:
# Define evaluation functions
def mean_absolute_error_custom(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

def mean_squared_error_custom(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def root_mean_squared_error_custom(y_true, y_pred):
    return np.sqrt(mean_squared_error_custom(y_true, y_pred))


In [14]:
# Calculate metrics for LLS model
mae_lls = mean_absolute_error_custom(Y_test, Y_pred_lls)
mse_lls = mean_squared_error_custom(Y_test, Y_pred_lls)
rmse_lls = root_mean_squared_error_custom(Y_test, Y_pred_lls)

print("LLS Model Evaluation:")
print("MAE:", mae_lls)
print("MSE:", mse_lls)
print("RMSE:", rmse_lls)

LLS Model Evaluation:
MAE: 3145859531.486994
MSE: 3.944041367901528e+19
RMSE: 6280160322.715917


Compare result with Scikit-Learn's results

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [16]:
x_train , x_test , y_train , y_test = train_test_split(X,Y , test_size=0.2 , random_state=40)
x_train.shape , y_train.shape , x_test.shape , y_test.shape 

((2593, 5), (2593, 1), (649, 5), (649, 1))

In [17]:
linear_linreg = LinearRegression()
linear_linreg.fit(x_train , y_train)
y_pred_linreg = linear_linreg.predict(x_test)

In [18]:
MAE_linreg = mean_absolute_error(y_test , y_pred_linreg)
MSE_linreg = mean_squared_error(y_test , y_pred_linreg)
RMSE_linreg = np.sqrt(MSE_linreg)

print("LinearRegression MAE =" , MAE_linreg)
print("LinearRegression MSE =" , MSE_linreg)
print("LinearRegression RMSE =" , RMSE_linreg)

LinearRegression MAE = 2924869589.0485244
LinearRegression MSE = 4.074285732083931e+19
LinearRegression RMSE = 6383013185.074844


In [19]:
# RidgeCV
from sklearn.linear_model import RidgeCV  # Linear least squares with l2 regularization

rg = RidgeCV()
rg.fit(x_train , y_train)
y_pred_rg= rg.predict(x_test)

MAE_ridge = mean_absolute_error(y_test , y_pred_rg)
MSE_ridge = mean_squared_error(y_test , y_pred_rg)
RMSE_ridge = np.sqrt(MSE_ridge)


print("RidgeCV MAE =" , MAE_ridge)
print("RidgeCV MSE =" , MSE_ridge)
print("RidgeCV RMSE =" , RMSE_ridge)

RidgeCV MAE = 2921432814.4555564
RidgeCV MSE = 4.076564555644759e+19
RidgeCV RMSE = 6384798004.357506
