## Preprocessing

In [1]:
import re

import pandas as pd

def remove_non_numeric_and_convert_to_float(value):
    """Parse a numeric string that may carry units or other decoration.

    Every character other than a digit or a dot is stripped before the
    remainder is handed to ``float``. A falsy ``value`` (for example an
    empty string) yields ``None``.
    """
    return float(re.sub(r'[^\d.]', '', value)) if value else None


# Load the four listing exports and stack them into one frame. A fresh
# 0..n-1 index avoids duplicate labels across files; exact duplicate rows and
# rows with any missing field are discarded.
data_1 = pd.read_csv('bina_az_02102023.csv')
data_2 = pd.read_csv('bina_az_new.csv')
data_3 = pd.read_csv('bina_az_old.csv')
data_4 = pd.read_csv('bina_az_25102023.csv')
frames = (
    pd.concat([data_1, data_2, data_3, data_4], ignore_index=True)
    .drop_duplicates()
    .dropna()
)

# Remove rows whose 'seller_type' cell is the literal column name (presumably
# header lines repeated inside the exports -- confirm against the raw files).
# This has to happen before the integer casts below. `.copy()` makes the
# result an independent frame so the column assignments that follow do not
# write to a slice (SettingWithCopyWarning).
frames = frames[frames['seller_type'] != 'seller_type'].copy()

# Flag descriptions mentioning "m." or "metro" (case-insensitive). The pattern
# is a raw string: the previous non-raw 'm\.' is an invalid escape sequence.
frames['is_near_metro'] = (
    frames['description'].str.contains(r'm\.|metro', case=False).astype(int)
)

# 'flat_number' is split on ' / ' into two integer columns.
frames[['flat', 'total_flat']] = (
    frames['flat_number'].str.split(' / ', expand=True).astype(int)
)

# Numeric columns arrive as text and are cast here.
frames['area_converted'] = frames['area'].apply(remove_non_numeric_and_convert_to_float)
frames['room_count'] = frames['room_count'].astype(int)
frames['price'] = frames['price'].str.replace(' ', '').astype(int)

# Binary encodings. `Series.map` leaves NaN for any value missing from the
# mapping, so unexpected categories surface as NaN rather than raising.
yes_no_codes = {'var': 1, 'yoxdur': 0}
frames['documents_encoded'] = frames['documents'].map(yes_no_codes)
frames['is_repair_encoded'] = frames['is_repair'].map(yes_no_codes)
frames['seller_type_encoded'] = frames['seller_type'].map({'vasitəçi (agent)': 0, 'mülkiyyətçi': 1})
frames['category_encoded'] = frames['category'].map({'Yeni tikili': 0, 'Köhnə tikili': 1})

# Keep only the model inputs plus the target, then drop rows that became
# identical once the unused columns were removed.
model_columns = [
    'is_near_metro',
    'seller_type_encoded',
    'flat',
    'total_flat',
    'room_count',
    'area_converted',
    'category_encoded',
    'documents_encoded',
    'is_repair_encoded',
    'price',
]
frames = frames[model_columns].drop_duplicates(ignore_index=True)
# frames.to_excel('frames.xlsx', index=False)

## Modelling (XGBoost)

In [3]:
import warnings
import joblib
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
# Silence FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Every column except 'price' is a feature; 'price' is the regression target.
data = frames
X = data.drop(columns=['price'])
y = data['price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient-boosted trees: 100 trees of depth 3, squared-error objective.
model = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    objective='reg:squarederror')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
# RMSE is taken as the square root of the MSE. The `squared=False` shortcut
# was deprecated in scikit-learn 1.4 and dropped in later releases, whereas
# this form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)

print("Model performance metrics")
print("-----------------------")
print(f"R-squared: {r2:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print("-----------------------")
# joblib.dump(model, 'xgb.pkl')

Model perforamnce metrics
-----------------------
R-squared: 0.71
Root Mean Squared Error: 94855.12
Mean Absolute Error: 47748.10
-----------------------


## Neural network

In [15]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat from run to run.
keras.utils.set_random_seed(42)

# Every column except 'price' is a feature; 'price' is the regression target.
data = frames
X = data.drop(columns=['price'])
y = data['price']

# Split the data into training and testing sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fully connected regressor: two ReLU hidden layers and one linear output unit.
model = keras.Sequential([
    keras.layers.Input(shape=(X_train.shape[1],)),  # One input per feature column
    keras.layers.Dense(128, activation='relu'),  # Hidden layer with 128 units and ReLU activation
    keras.layers.Dense(64, activation='relu'),   # Hidden layer with 64 units and ReLU activation
    keras.layers.Dense(1)  # Output layer for regression
])

# Optimise mean squared error; MAE is reported alongside it as a metric.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# `epochs` is only an upper bound: training halts once the validation loss has
# not improved for 50 consecutive epochs, and the weights from the best epoch
# are restored. Without this the cell runs all 10000 epochs regardless of
# whether validation loss is still improving.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=50,
    restore_best_weights=True,
)

history = model.fit(
    X_train,
    y_train,
    epochs=10000,
    batch_size=32,
    validation_split=0.2,  # fraction of the training data held out for validation
    callbacks=[early_stopping],
    verbose=2,
)

# Make predictions; flatten the (n, 1) network output to match the 1-D target.
y_pred = model.predict(X_test).ravel()

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model performance metrics")
print("-----------------------")
print(f"R-squared: {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print("-----------------------")


Epoch 1/10000
568/568 - 1s - loss: 328227225600.0000 - mae: 193521.5469 - val_loss: 42244894720.0000 - val_mae: 135888.5781 - 1s/epoch - 2ms/step
Epoch 2/10000
568/568 - 1s - loss: 329565503488.0000 - mae: 131774.2344 - val_loss: 36771012608.0000 - val_mae: 117440.5469 - 885ms/epoch - 2ms/step
Epoch 3/10000
568/568 - 1s - loss: 301008748544.0000 - mae: 119897.7422 - val_loss: 34993180672.0000 - val_mae: 109625.9219 - 790ms/epoch - 1ms/step
Epoch 4/10000
568/568 - 1s - loss: 319097667584.0000 - mae: 107842.9688 - val_loss: 32223926272.0000 - val_mae: 98919.2812 - 853ms/epoch - 2ms/step
Epoch 5/10000
568/568 - 1s - loss: 275867009024.0000 - mae: 106116.6484 - val_loss: 31593693184.0000 - val_mae: 95134.0078 - 800ms/epoch - 1ms/step
Epoch 6/10000
568/568 - 1s - loss: 259320414208.0000 - mae: 99024.6719 - val_loss: 31264335872.0000 - val_mae: 92709.6797 - 748ms/epoch - 1ms/step
Epoch 7/10000
568/568 - 1s - loss: 225723629568.0000 - mae: 96486.7891 - val_loss: 31509026816.0000 - val_mae: 92

In [16]:
# Write the trained network to disk. The `.h5` suffix presumably selects
# Keras' HDF5 format -- confirm against the installed Keras version.
h5_model_path = 'neural_network_model.h5'
model.save(h5_model_path)

  saving_api.save_model(


In [18]:
# Write the trained network to disk under a `.keras` file name.
keras_model_path = 'my_model.keras'
model.save(keras_model_path)