# Data Preparation
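Prepare the training or testing data set for model training and inference: select and round the relevant listing columns, tag each row with the preparation name, snapshot timestamp, job id and run id, and append the result to the matching prepared table.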

In [0]:
import time

In [0]:
# Destination table name (inside workspace.rental_predictions) per data set.
table_names = {
    "train": "prepared_training_sets",
    "test": "prepared_testing_sets",
}

# Optional WHERE clause appended to the base query, keyed by preparation name.
query_adjustments = {
    "unmodified": "",
    "manhattan_only": "where District = 'Manhattan'",
    "manhattan_only_no_marble_hill": "where District = 'Manhattan' and Neighborhood != 'Marble Hill'",
}

# Per data set: [table alias whose Price column is selected, extra JOIN clause].
test_set_query = {
    "test": ["s", "join workspace.rental_predictions.sample_submission as s on (s.ID = t.ID)"],
    "train": ["t", ""],
}


def _widget_or_default(name, default):
    """Return the value of notebook widget *name*, or *default* if it is not defined."""
    return dbutils.widgets.get(name) if name in dbutils.widgets.getAll() else default


def _require_int(name, value):
    """Return *value* as an int, raising a ValueError that names the widget otherwise."""
    try:
        return int(value)
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"Widget {name!r} must be an integer, got {value!r}"
        ) from err


data_set = _widget_or_default("data_set", "train")
preparation = _widget_or_default("preparation", "unmodified")

# Fail fast with a readable message instead of a bare KeyError (or a SQL
# error) further down when a widget carries an unknown value.
if data_set not in table_names or data_set not in test_set_query:
    raise ValueError(
        f"Unknown data_set {data_set!r}; expected one of {sorted(table_names)}"
    )
if preparation not in query_adjustments:
    raise ValueError(
        f"Unknown preparation {preparation!r}; expected one of {sorted(query_adjustments)}"
    )

# job_id and run_id are later interpolated unquoted into a SQL statement, so
# coerce them to integers here: widget values arrive as strings and anything
# non-numeric would otherwise end up as raw SQL text.
job_id = _require_int("job_id", _widget_or_default("job_id", -1))
run_id = _require_int("run_id", _widget_or_default("run_id", -1))

In [0]:
# Resolve the pieces that vary by data set / preparation up front so the SQL
# template below only interpolates plain names.
#   price_alias - table alias whose Price column is selected
#   join_clause - extra JOIN appended after the FROM clause (may be empty)
#   row_filter  - WHERE clause for the chosen preparation (may be empty)
price_alias, join_clause = test_set_query[data_set]
row_filter = query_adjustments[preparation]

# Select the listing columns (bathrooms rounded to the nearest 0.5, bedrooms
# to the nearest whole number) and tag every row with the preparation name,
# a snapshot timestamp, and the job/run ids.
base_query = f"""
    select 
    t.ID,
    t.District,
    t.Neighborhood,
    t.PropertyType,
    t.CancellationPolicy,
    t.Accommodates,
    t.RoomType,
    round(t.Bathrooms/0.5)*0.5 as Bathrooms_rounded,
    round(t.Bedrooms) as Bedrooms_rounded,
    t.CleaningFee,
    t.Latitude,
    t.Longitude,
    t.ReviewRating,
    {price_alias}.Price,
    '{preparation}' as data_set_preparation,
    current_timestamp() as snapshot_timestamp,
    {job_id} as job_id,
    {run_id} as run_id
    from workspace.rental_predictions.{data_set} as t
    {join_clause}
    {row_filter}
"""
data = spark.sql(base_query)
display(data)

In [0]:
# Fully qualified destination table for the selected data set.
full_table_name = f"workspace.rental_predictions.{table_names[data_set]}"
max_attempts = 10
attempts = 0  # number of failed write attempts so far

# Append the prepared rows, retrying on any exception: wait 2 seconds between
# tries and re-raise the last error once max_attempts writes have failed.
for _ in range(max_attempts):
    try:
        data.write.mode("append").saveAsTable(full_table_name)
    except Exception:
        attempts += 1
        if attempts == max_attempts:
            raise
        time.sleep(2)
    else:
        break