## 1) Real Sum Predictor

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor


# Load and preprocessing
# "Unnamed: 0" is presumably an index column written by an earlier export
# (TODO confirm). Drop it when present; errors="ignore" keeps the cell
# runnable on a CSV that does not contain it instead of raising KeyError.
df = pd.read_csv("final_df2.csv")
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Remove extreme outliers: keep rows whose realSum is at or below the
# 99th percentile.
# NOTE(review): the cut-off is computed on the full table, i.e. before the
# train/test split further down, so held-out rows are trimmed with it —
# confirm this is intended.
upper_limit = df["realSum"].quantile(0.99)
df = df[df["realSum"] <= upper_limit]


# Target and features
X = df.drop(columns=["realSum"])
y = np.log1p(df["realSum"])  # train on log(1 + price); np.expm1 inverts it


# Feature groups: column names handed to each branch of the ColumnTransformer.

# Text-valued columns (one-hot encoded by the preprocessor).
categorical_features = ["room_type", "day_type", "city", "country"]

# Flag columns (passed through unchanged).
boolean_features = ["room_shared", "room_private", "host_is_superhost"]

# Numeric columns (passed through unchanged), grouped by theme.
numeric_features = [
    "person_capacity", "multi", "biz",
    "cleanliness_rating", "guest_satisfaction_overall",
    "bedrooms",
    "dist", "metro_dist",
    "attr_index_norm", "rest_index_norm",
    "lat", "lng",
]


# The model: LightGBM regressor with a fixed seed.
model = LGBMRegressor(
    n_estimators=1000, learning_rate=0.05,
    max_depth=-1, num_leaves=50,
    random_state=42,
)

# Preprocessing: one-hot encode the categorical columns; numeric and boolean
# columns are passed through as they are.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
        ("bool", "passthrough", boolean_features),
    ]
)

# Pipeline: preprocessing followed by the regressor, fitted and applied as
# a single estimator.
pipeline = Pipeline([("preprocess", preprocessor), ("model", model)])


# Train / test split: hold out 20% of the rows; the fixed random_state keeps
# the partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training: fit the preprocessing and the regressor on the training rows.
pipeline.fit(X_train, y_train)


# Evaluation on the held-out rows. The model works in log1p space, so both
# the targets and the predictions are mapped back with expm1 before scoring.
y_true = np.expm1(y_test)
y_pred_log = pipeline.predict(X_test)
y_pred = np.expm1(y_pred_log)

mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, y_pred)
median_error = np.median(np.abs(y_true - y_pred))

print(
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"Median Error: {median_error:.2f}",
    f"R²: {r2:.3f}",
    sep="\n",
)

# Tolerance accuracy (±10%): share of predictions whose error relative to the
# true value is strictly below the tolerance.
tolerance = 0.10
relative_error = np.abs(y_pred - y_true) / y_true
tolerance_acc = np.mean(relative_error < tolerance)
print(f"Tolerance Accuracy (±10%): {tolerance_acc:.2%}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1762
[LightGBM] [Info] Number of data points in the train set: 15955, number of used features: 93
[LightGBM] [Info] Start training from score 5.267918




MAE: 43.64
RMSE: 75.08
Median Error: 25.00
R²: 0.770
Tolerance Accuracy (±10%): 36.63%


In [None]:
# A single hand-written listing used as a smoke test for the fitted pipeline.
sample = pd.DataFrame(
    [
        dict(
            room_type="Entire home/apt",
            room_shared=False,
            room_private=False,
            person_capacity=4,
            host_is_superhost=True,
            multi=0,
            biz=1,
            cleanliness_rating=9.5,
            guest_satisfaction_overall=95,
            bedrooms=2,
            dist=2.1,
            metro_dist=0.4,
            attr_index_norm=1.2,
            rest_index_norm=1.1,
            lat=48.8566,
            lng=2.3522,
            day_type="weekend",
            city="Paris",
            country="France",
        )
    ]
)

# The pipeline was trained on log1p(realSum); expm1 brings the prediction
# back to the original scale.
log_price = pipeline.predict(sample)[0]
price = np.expm1(log_price)

print(f"Predicted price: {price:.2f}")


Predicted price: 292.17




## 2) City Predictor

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed, to_categorical


# Load data and preprocess
# "Unnamed: 0" is presumably an index column written by an earlier export
# (TODO confirm). errors="ignore" drops it when present and avoids a KeyError
# when the CSV does not contain it.
df = pd.read_csv("final_df2.csv")
df = df.drop(columns=["Unnamed: 0"], errors="ignore")


# Keep top 20 cities for speed: only rows whose city is among the 20 most
# frequent values remain.
top_cities = df['city'].value_counts().head(20).index
df = df[df['city'].isin(top_cities)]


# Features and target: the city is predicted from the price and a few
# room/host attributes.
feature_columns = ['realSum', 'person_capacity', 'bedrooms',
                   'room_shared', 'room_private', 'host_is_superhost']
flag_columns = ['room_shared', 'room_private', 'host_is_superhost']

# Select the columns and cast the flags to int in one expression. astype
# returns a new frame, so nothing is assigned into a slice of df — the earlier
# `X.loc[...] = ...` assignment on that slice is what made pandas emit its
# copy-vs-view warning.
X = df[feature_columns].astype({column: int for column in flag_columns})

y = df['city']


# Encode target: city label -> integer id -> one-hot row.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)
num_classes = len(le.classes_)
y_categorical = to_categorical(y_encoded, num_classes=num_classes)


# Train/test split: 20% held out, stratified on the integer labels, with a
# fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_categorical,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)


# Scale numeric features: the scaler is fitted on the training rows only and
# then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fix the random seeds before any layer is created so that weight
# initialisation, dropout masks and batch shuffling repeat from run to run
# (the train/test split above is already seeded; training was not).
set_random_seed(42)

# neural network: two 64-unit ReLU layers with dropout after the first, and a
# softmax output with one unit per city class.
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# categorical_crossentropy matches the one-hot targets built with to_categorical.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# Training loop: 10% of the training rows are set aside for validation.
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=30,
    batch_size=64,
    validation_split=0.1,
    verbose=2,
)


# Evaluation: turn the predicted probabilities and the one-hot test targets
# back into class ids, then score them.
y_pred_prob = model.predict(X_test_scaled)
y_pred = y_pred_prob.argmax(axis=1)
y_true = y_test.argmax(axis=1)

cm = confusion_matrix(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted')
acc = accuracy_score(y_true, y_pred)

print(f"\nAccuracy: {acc:.3f}")
print(f"Weighted F1: {f1:.3f}")
print("Confusion Matrix:", cm, sep="\n")


# Top-5 accuracy: a row counts as correct when its true class id is among the
# top_k classes with the highest predicted probability.
top_k = 5
top_k_preds = np.argsort(y_pred_prob, axis=1)[:, -top_k:]
# Compare every candidate column against the true label at once instead of
# looping over the rows in Python; the result is one boolean per row.
top_k_correct = (top_k_preds == y_true[:, None]).any(axis=1)
top_k_accuracy = np.mean(top_k_correct)
print(f"Top-{top_k} accuracy: {top_k_accuracy:.3f}")



  X.loc[:, ['room_shared','room_private','host_is_superhost']] = X[['room_shared','room_private','host_is_superhost']].astype(int)
  X.loc[:, ['room_shared','room_private','host_is_superhost']] = X[['room_shared','room_private','host_is_superhost']].astype(int)
  X.loc[:, ['room_shared','room_private','host_is_superhost']] = X[['room_shared','room_private','host_is_superhost']].astype(int)


Epoch 1/30
388/388 - 4s - 10ms/step - accuracy: 0.2921 - loss: 2.3470 - val_accuracy: 0.3365 - val_loss: 2.0414
Epoch 2/30
388/388 - 1s - 3ms/step - accuracy: 0.3306 - loss: 2.0658 - val_accuracy: 0.3586 - val_loss: 1.9694
Epoch 3/30
388/388 - 1s - 2ms/step - accuracy: 0.3420 - loss: 2.0073 - val_accuracy: 0.3623 - val_loss: 1.9239
Epoch 4/30
388/388 - 1s - 2ms/step - accuracy: 0.3431 - loss: 1.9852 - val_accuracy: 0.3583 - val_loss: 1.9147
Epoch 5/30
388/388 - 1s - 2ms/step - accuracy: 0.3473 - loss: 1.9714 - val_accuracy: 0.3637 - val_loss: 1.9054
Epoch 6/30
388/388 - 1s - 2ms/step - accuracy: 0.3499 - loss: 1.9615 - val_accuracy: 0.3641 - val_loss: 1.9055
Epoch 7/30
388/388 - 1s - 2ms/step - accuracy: 0.3467 - loss: 1.9556 - val_accuracy: 0.3626 - val_loss: 1.8975
Epoch 8/30
388/388 - 1s - 2ms/step - accuracy: 0.3507 - loss: 1.9479 - val_accuracy: 0.3662 - val_loss: 1.8969
Epoch 9/30
388/388 - 1s - 2ms/step - accuracy: 0.3529 - loss: 1.9442 - val_accuracy: 0.3673 - val_loss: 1.8897


In [None]:
# A single hand-written listing; the columns are given in the same order as
# the training features.
sample = pd.DataFrame(
    [
        dict(
            realSum=120,
            person_capacity=2,
            bedrooms=1,
            room_shared=0,
            room_private=0,
            host_is_superhost=1,
        )
    ]
)

# Apply the already-fitted scaler, then take the probability row for the
# one sample.
sample_scaled = scaler.transform(sample)
sample_pred_prob = model.predict(sample_scaled)[0]

# Get top 5 city indices: sort ascending, reverse, keep the first five
# (most probable first).
top_5_indices = np.argsort(sample_pred_prob)[::-1][:5]
top_5_probs = sample_pred_prob[top_5_indices]
top_5_cities = le.inverse_transform(top_5_indices)

print("\nTop 5 predicted cities for sample listing:")
for city, prob in zip(top_5_cities, top_5_probs):
    print(f"{city}: {prob:.3f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step

Top 5 predicted cities for sample listing:
Athens: 0.221
Kipseli: 0.204
Budapest VI. keruelet: 0.141
Vyronas: 0.122
Rome: 0.098
