In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

np.random.seed(42)
pd.set_option("display.max_rows", 80)
pd.set_option("display.width", 1920)
pd.set_option("display.float_format", "{:20,.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
plt.rcParams["figure.dpi"] = 150

google_api = ""

df = pd.read_csv("./normalized_nybnb.csv").astype(np.float32)
X, y = df.drop(columns=["High Review Score"]), df["High Review Score"]

ros = RandomOverSampler(random_state=42)
X, y = ros.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

target_names = ["Low", "High"]
n_features = X_train.shape[1]

classifiers = {
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(solver="liblinear", multi_class="ovr"),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
}
for cls in classifiers.keys():
    classifiers[cls].fit(X_train, y_train)
    print(cls, classifiers[cls])


Naive Bayes GaussianNB()
KNN KNeighborsClassifier()
Logistic Regression LogisticRegression(multi_class='ovr', solver='liblinear')
Decision Tree DecisionTreeClassifier()
Random Forest RandomForestClassifier()


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from pickle import load

def read_scaler() -> StandardScaler:
    return load(open('scaler.pkl', 'rb'))
scaler = read_scaler()

In [3]:
unnormed_X = pd.DataFrame(
    scaler.inverse_transform(
        X[
            [
                "Host Listings Count",
                "Host Total Listings Count",
                "Calculated host listings count",
                "Latitude",
                "Longitude",
                "Accommodates",
                "Bathrooms",
                "Bedrooms",
                "Beds",
                "Price",
                "Weekly Price",
                "Monthly Price",
                "Security Deposit",
                "Cleaning Fee",
                "Guests Included",
                "Extra People",
                "Minimum Nights",
                "Maximum Nights",
                "Number of Reviews",
                "Reviews per Month",
                "Host_Time",
                "Review Time Span",
            ]
        ],
        copy=None,
    ),
    columns=[
        "Host Listings Count",
        "Host Total Listings Count",
        "Calculated host listings count",
        "Latitude",
        "Longitude",
        "Accommodates",
        "Bathrooms",
        "Bedrooms",
        "Beds",
        "Price",
        "Weekly Price",
        "Monthly Price",
        "Security Deposit",
        "Cleaning Fee",
        "Guests Included",
        "Extra People",
        "Minimum Nights",
        "Maximum Nights",
        "Number of Reviews",
        "Reviews per Month",
        "Host_Time",
        "Review Time Span",
    ],
)
unnormed_X["High Review Score"] = df["High Review Score"]

In [4]:
unnormed_X.head(3)

Unnamed: 0,Host Listings Count,Host Total Listings Count,Calculated host listings count,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Price,Weekly Price,Monthly Price,Security Deposit,Cleaning Fee,Guests Included,Extra People,Minimum Nights,Maximum Nights,Number of Reviews,Reviews per Month,Host_Time,Review Time Span,High Review Score
0,1.0,1.0,1.0,40.74,-74.0,2.0,1.0,1.0,1.0,110.0,770.0,3300.0,200.0,75.0,1.0,0.0,8.0,1125.0,3.0,0.29,319.0,280.0,0.0
1,1.0,1.0,1.0,40.74,-74.0,2.0,1.0,-0.0,1.0,120.0,840.0,3600.0,120.0,75.0,1.0,0.0,3.0,1125.0,12.0,0.6,1335.0,579.0,1.0
2,1.0,1.0,1.0,40.75,-74.01,2.0,1.0,1.0,1.0,199.0,1393.0,5970.0,600.0,120.0,1.0,25.0,6.0,12.0,14.0,0.2,2416.0,2105.0,1.0


In [5]:
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, GMapOptions
from bokeh.plotting import gmap

# output_file("gmap.html")
output_notebook()
center_lat = (unnormed_X["Latitude"].max() + unnormed_X["Latitude"].min()) / 2
center_lon = (unnormed_X["Longitude"].max() + unnormed_X["Longitude"].min()) / 2
map_options = GMapOptions(lat=center_lat, lng=center_lon, map_type="roadmap", zoom=11)

# For GMaps to function, Google requires you obtain and enable an API key:
#
#     https://developers.google.com/maps/documentation/javascript/get-api-key
#
# Replace the value below with your personal API key:
p = gmap(google_api, map_options, title="New York")

red_X = unnormed_X[unnormed_X['High Review Score'] == 1]
blue_X = unnormed_X[unnormed_X['High Review Score'] == 0]



p.circle(x="lon", y="lat", size=2, color="blue", legend_label='Low Review Score', alpha=0.2, source=ColumnDataSource(
    data=dict(lat=blue_X["Latitude"],
              lon=blue_X["Longitude"])
))

p.circle(x="lon", y="lat", size=2, color="red", legend_label='High Review Score', alpha=0.2, source=ColumnDataSource(
    data=dict(lat=red_X["Latitude"],
              lon=red_X["Longitude"])
))


show(p)

In [6]:
import warnings
from tqdm.notebook import tqdm



def plot_overlay(name):
    cls = classifiers[name]
    max_lat, min_lat = unnormed_X["Latitude"].max(), unnormed_X["Latitude"].min()
    max_lon, min_lon = unnormed_X["Longitude"].max(), unnormed_X["Longitude"].min()
    resolution = 100
    lats = np.linspace(min_lat, max_lat, resolution)
    lons = np.linspace(min_lon, max_lon, resolution)
    red_dots = {"lat": [], "lon": []}
    blue_dots = {"lat": [], "lon": []}
    dummy_data = np.expand_dims(X.mean(axis=0), 0)
    u_mean, s_var = scaler.mean_, scaler.var_
    for lat in tqdm(lats):
        for lon in lons:
            dummy_data[0, 3] = (lat - u_mean[3]) / s_var[3]
            dummy_data[0, 4] = (lon -  u_mean[4]) / s_var[4]
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                pred = cls.predict(dummy_data)
            if pred < 0.5:
                blue_dots["lat"].append(lat)
                blue_dots["lon"].append(lon)
            else:
                red_dots["lat"].append(lat)
                red_dots["lon"].append(lon)

    print(len(blue_dots["lat"]), len(red_dots["lat"]))
    w, h = (max_lat - min_lat) / resolution, (max_lon - min_lon) / resolution
    p = gmap(google_api, map_options, title=name)

    source = ColumnDataSource(data=blue_dots)

    # p.rect(x="lon", y="lat", width=10, height=10, fill_color="blue", fill_alpha=0.2, source=source)
    p.square(x="lon", y="lat", color="blue", legend_label='Low Review Score', alpha=0.1, source=source)

    source = ColumnDataSource(data=red_dots)

    # p.hex_tile(q="lon", r="lat", color="red", alpha=0.2, source=source)
    p.square(x="lon", y="lat", color="red", legend_label='High Review Score', alpha=0.1, source=source)

    show(p)

In [7]:
plot_overlay('Naive Bayes')

  0%|          | 0/100 [00:00<?, ?it/s]

0 10000


In [8]:
plot_overlay('KNN')

  0%|          | 0/100 [00:00<?, ?it/s]

2467 7533


In [9]:
plot_overlay('Logistic Regression')

  0%|          | 0/100 [00:00<?, ?it/s]

6896 3104


In [10]:
plot_overlay('Decision Tree')

  0%|          | 0/100 [00:00<?, ?it/s]

6457 3543


In [11]:
plot_overlay('Random Forest')

  0%|          | 0/100 [00:00<?, ?it/s]

10000 0
