In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error


In [None]:
# Read the listings CSV and discard the columns that are not used further on.
# drop() raises KeyError for unknown labels, so the spellings below
# ('adress', 'flooPlan') have to match the CSV header exactly.
unused_columns = ['title', 'adress', 'type', 'purpose', 'flooPlan', 'url', 'lastUpdated']
raw_listings = pd.read_csv('/content/property_listing_data_in_Bangladesh.csv')
data = raw_listings.drop(columns=unused_columns)
data.head()

Unnamed: 0,beds,bath,area,price
0,3,4,"2,200 sqft",50 Thousand
1,3,4,"1,400 sqft",30 Thousand
2,3,4,"1,950 sqft",30 Thousand
3,3,3,"2,000 sqft",35 Thousand
4,3,4,"1,650 sqft",25 Thousand


In [None]:
# Show (rows, columns) of the frame after the unused columns were dropped.
data.shape

(7557, 4)

In [None]:
# Keep only the first run of digits found in the text form of each 'beds'
# entry; entries whose text contains no digit at all become 0.
digit_matches = (re.search(r'\d+', str(entry)) for entry in data['beds'])
beds_values = [int(found.group()) if found else 0 for found in digit_matches]
data['beds'] = beds_values
data['beds']

0       3
1       3
2       3
3       3
4       3
       ..
7552    4
7553    3
7554    2
7555    3
7556    4
Name: beds, Length: 7557, dtype: int64

In [None]:
# Keep only the first run of digits found in the text form of each 'bath'
# entry; entries whose text contains no digit at all become 0.
digit_matches = (re.search(r'\d+', str(entry)) for entry in data['bath'])
bath_values = [int(found.group()) if found else 0 for found in digit_matches]
data['bath'] = bath_values

data['bath']

0       4
1       4
2       4
3       3
4       4
       ..
7552    4
7553    2
7554    2
7555    4
7556    4
Name: bath, Length: 7557, dtype: int64

In [None]:
def parse_area(value):
    """Convert one raw 'area' entry to a float.

    Strings such as '2,200 sqft' are handled by removing thousands
    separators and reading the first space-separated token. Values that are
    already numeric are passed through unchanged, so re-running this cell on
    the converted column does not wipe it out (previously every float failed
    the string check and became 0.0).

    Args:
        value: A raw cell value; a string, a number, or a missing value.

    Returns:
        The area as a float, or 0.0 when the value is missing or its first
        token is not a plain non-negative decimal number.
    """
    if isinstance(value, str):
        first_token = value.replace(',', '').split(' ')[0]
        # Allow at most one decimal point; everything else must be digits.
        if first_token.replace('.', '', 1).isdigit():
            return float(first_token)
        return 0.0
    if isinstance(value, (int, float, np.integer, np.floating)) and not pd.isna(value):
        return float(value)
    # NOTE(review): missing/unparseable areas are encoded as 0.0, which is
    # indistinguishable from a real zero downstream.
    return 0.0


area_values = [parse_area(value) for value in data['area']]
data['area'] = area_values


data['area']

0       2200.0
1       1400.0
2       1950.0
3       2000.0
4       1650.0
         ...  
7552    3600.0
7553     900.0
7554    1000.0
7555    3600.0
7556    2600.0
Name: area, Length: 7557, dtype: float64

In [None]:
# Numeric value of each unit word that may follow the number in a price.
PRICE_UNITS = {'Thousand': 1_000, 'Lakh': 100_000, 'Crore': 10_000_000}


def parse_price(value):
    """Convert one raw 'price' entry to a float amount.

    The number is multiplied by its unit instead of having zeros appended as
    text. Appending zeros is only correct for whole numbers: '1.5 Lakh'
    became '1.500000', i.e. 1.5 instead of 150000. 'Crore' is recognised as
    well, and values that are already numeric are passed through so that
    re-running this cell on the converted column does not zero it.

    Args:
        value: A raw cell value such as '50 Thousand', '1.5 Lakh', '25,000',
            a number, or a missing value.

    Returns:
        The price as a float, or 0.0 when the value is missing or is not a
        plain non-negative decimal number with an optional known unit.
    """
    if isinstance(value, str):
        text = value.replace(',', '').strip()
        multiplier = 1
        for unit, factor in PRICE_UNITS.items():
            if text.endswith(unit):
                text = text[:-len(unit)].strip()
                multiplier = factor
                break
        # Allow at most one decimal point; everything else must be digits.
        if text.replace('.', '', 1).isdigit():
            # round() removes binary float noise,
            # e.g. 2.3 * 100000 == 229999.99999999997.
            return round(float(text) * multiplier, 2)
        return 0.0
    if isinstance(value, (int, float, np.integer, np.floating)) and not pd.isna(value):
        return float(value)
    # NOTE(review): missing/unparseable prices are encoded as 0.0, which is
    # indistinguishable from a real zero downstream.
    return 0.0


price_values = [parse_price(value) for value in data['price']]
data['price'] = price_values


In [None]:
# Preview the first rows now that the text columns have been converted to numbers.
data.head()

Unnamed: 0,beds,bath,area,price
0,3,4,2200.0,50000.0
1,3,4,1400.0,30000.0
2,3,4,1950.0,30000.0
3,3,3,2000.0,35000.0
4,3,4,1650.0,25000.0


In [None]:
# Rescale every numeric column, including the target 'price', with a
# min-max scaler and write the scaled values back into `data`.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/validation/test split performed later in this notebook, so the
# held-out rows influence the scaling. Consider fitting on training rows only.
# NOTE(review): because 'price' is scaled too, every loss computed from
# these values is in scaled units rather than in the original currency.
scaler = MinMaxScaler()
# Alternative scaler, currently disabled:
# scaler = StandardScaler()
numeric_columns = ['beds', 'bath', 'area', 'price']
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

data.head()

Unnamed: 0,beds,bath,area,price
0,0.057143,0.333333,0.060897,0.024999
1,0.057143,0.333333,0.035256,0.014999
2,0.057143,0.333333,0.052885,0.014999
3,0.057143,0.222222,0.054487,0.017499
4,0.057143,0.333333,0.043269,0.012499


In [None]:
def compute_cost(X, y, weights):
    """Return the halved mean squared error of the linear model on (X, y).

    Args:
        X: Feature matrix with one row per sample.
        y: Target values, one per row of ``X``.
        weights: Coefficients applied to the columns of ``X``.

    Returns:
        The sum of squared residuals of ``np.dot(X, weights)`` against ``y``,
        divided by ``2 * len(y)``.
    """
    residuals = np.dot(X, weights) - y
    return np.sum(np.square(residuals)) / (2 * len(y))

In [None]:
def gradient_descent(X, y, weights, learning_rate, num_iterations):
    """Fit linear-model weights with full-batch gradient descent.

    Args:
        X: Feature matrix with one row per sample.
        y: Target values, one per row of ``X``.
        weights: Starting coefficients; the array passed in is not modified.
        learning_rate: Step size applied to the gradient.
        num_iterations: Number of update steps to perform.

    Returns:
        A tuple ``(weights, costs)``: the coefficients after the last step
        and a list with the value of ``compute_cost`` after every step.
    """
    n_samples = len(y)
    cost_history = []

    for _ in range(num_iterations):
        residuals = np.dot(X, weights) - y
        step = learning_rate * (np.dot(X.T, residuals) / n_samples)
        weights = weights - step
        # Cost is recorded with the weights *after* this update.
        cost_history.append(compute_cost(X, y, weights))

    return weights, cost_history


In [None]:
# 60/20/20 split: hold out 20% of the rows for testing, then a quarter of the
# remaining 80% (another 20% of the total) for validation.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)

# For each split: features are every column except 'price', target is 'price'.
X_train, y_train = train_data.drop(columns='price').values, train_data['price'].values
X_val, y_val = val_data.drop(columns='price').values, val_data['price'].values
X_test, y_test = test_data.drop(columns='price').values, test_data['price'].values


In [None]:
def with_intercept(frame):
    """Return the feature matrix of ``frame`` with a leading column of ones.

    Args:
        frame: A split of the dataset that still contains the 'price' column.

    Returns:
        A 2-D array: a column of ones followed by every column of ``frame``
        except 'price'.
    """
    features = frame.drop('price', axis=1).values
    return np.hstack((np.ones((features.shape[0], 1)), features))


# Build the design matrices from the split frames rather than from
# X_train/X_val/X_test themselves: prepending to the existing arrays made
# every re-run of this cell stack one more column of ones.
X_train = with_intercept(train_data)
X_val = with_intercept(val_data)
X_test = with_intercept(test_data)

# Start from all-zero coefficients, one per column including the intercept.
weights = np.zeros(X_train.shape[1])


In [None]:
# Hyperparameters for the gradient-descent run.
learning_rate = 0.01
num_iterations = 100

# Fit on the training split. `weights` is replaced by the fitted coefficients
# and `costs` receives the training cost recorded after every iteration.
weights, costs = gradient_descent(
    X_train,
    y_train,
    weights,
    learning_rate=learning_rate,
    num_iterations=num_iterations,
)



In [None]:
# Report the losses epoch by epoch. Evaluating the final `weights` inside the
# loop printed the same two numbers on every line, so training is replayed
# here one gradient step at a time and each epoch is scored with the weights
# it actually produced. The replay starts from zeros, matching the
# initialisation used before training, and leaves `weights` untouched.
epoch_weights = np.zeros(X_train.shape[1])
for epoch in range(num_iterations):
    epoch_weights, step_costs = gradient_descent(X_train, y_train, epoch_weights, learning_rate, 1)
    train_cost = step_costs[0]
    val_cost = compute_cost(X_val, y_val, epoch_weights)
    print(f"Epoch {epoch+1}: Training Loss = {train_cost}, Validation Loss = {val_cost}")


Epoch 1: Training Loss = 0.0002650224893794115, Validation Loss = 0.00016571343809800143
Epoch 2: Training Loss = 0.0002650224893794115, Validation Loss = 0.00016571343809800143
Epoch 3: Training Loss = 0.0002650224893794115, Validation Loss = 0.00016571343809800143
Epoch 4: Training Loss = 0.0002650224893794115, Validation Loss = 0.00016571343809800143
Epoch 5: Training Loss = 0.0002650224893794115, Validation Loss = 0.00016571343809800143
Epoch 6: Training Loss = 0.0002650224893794115, Validation Loss = 0.00016571343809800143
Epoch 7: Training Loss = 0.0002650224893794115, Validation Loss = 0.00016571343809800143
Epoch 8: Training Loss = 0.0002650224893794115, Validation Loss = 0.00016571343809800143
Epoch 9: Training Loss = 0.0002650224893794115, Validation Loss = 0.00016571343809800143
Epoch 10: Training Loss = 0.0002650224893794115, Validation Loss = 0.00016571343809800143
Epoch 11: Training Loss = 0.0002650224893794115, Validation Loss = 0.00016571343809800143
Epoch 12: Training 

In [None]:
# Score the fitted weights on the held-out test split. compute_cost divides
# the summed squared error by 2 * m, so this value is half of the test MSE.
test_cost = compute_cost(X_test, y_test, weights)
print("Test Loss:", test_cost)

Test Loss: 0.0005213189566025826


In [None]:
# Test-set error via scikit-learn. The targets were rescaled earlier in the
# notebook, so both figures are in scaled units.
y_pred = X_test @ weights
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)


Mean Squared Error (MSE): 0.0010426379132051653
Root Mean Squared Error (RMSE): 0.032289904199380416


In [None]:
# Same test-set metrics computed by hand with NumPy, as a cross-check of the
# scikit-learn figures.
y_pred = X_test.dot(weights)
mse = np.square(y_pred - y_test).mean()
rmse = np.sqrt(mse)

print("Mean Squared Error (MSE) on Test Set:", mse)
print("Root Mean Squared Error (RMSE) on Test Set:", rmse)


Mean Squared Error (MSE) on Test Set: 0.0010426379132051653
Root Mean Squared Error (RMSE) on Test Set: 0.032289904199380416
