# Imports

In [53]:
from pathlib import Path

import mlflow
import mlflow.sklearn
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, StructField
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

  and should_run_async(code)


In [5]:
spark = (SparkSession
         .builder
         .appName('Distributed-Training')
         .getOrCreate())
spark

  and should_run_async(code)


# Data

In [6]:
DATA_PATH = Path('../data')
%ls {DATA_PATH}

2015-summary.csv         [1m[36msf-airbnb-clean.parquet[m[m/ sf-airbnb.csv


In [20]:
df = pd.read_parquet(DATA_PATH / 'sf-airbnb-clean.parquet')
df.head()

  and should_run_async(code)


Unnamed: 0,host_is_superhost,cancellation_policy,instant_bookable,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,...,bedrooms_na,bathrooms_na,beds_na,review_scores_rating_na,review_scores_accuracy_na,review_scores_cleanliness_na,review_scores_checkin_na,review_scores_communication_na,review_scores_location_na,review_scores_value_na
0,t,moderate,t,1.0,Western Addition,37.76931,-122.43386,Apartment,Entire home/apt,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,f,strict_14_with_grace_period,f,2.0,Bernal Heights,37.74511,-122.42102,Apartment,Entire home/apt,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,f,strict_14_with_grace_period,f,10.0,Haight Ashbury,37.76669,-122.4525,Apartment,Private room,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,f,strict_14_with_grace_period,f,10.0,Haight Ashbury,37.76487,-122.45183,Apartment,Private room,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,f,strict_14_with_grace_period,f,2.0,Western Addition,37.77525,-122.43637,House,Entire home/apt,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
df[[col for col in df.columns if df[col].dtype != 'object']].head(2).T

  and should_run_async(code)


Unnamed: 0,0,1
host_total_listings_count,1.0,2.0
latitude,37.76931,37.74511
longitude,-122.43386,-122.42102
accommodates,3.0,5.0
bathrooms,1.0,1.0
bedrooms,1.0,2.0
beds,2.0,3.0
minimum_nights,1.0,30.0
number_of_reviews,180.0,111.0
review_scores_rating,97.0,98.0


In [23]:
num_cols = [col for col in df.columns if df[col].dtype != 'object']
num_cols

  and should_run_async(code)


['host_total_listings_count',
 'latitude',
 'longitude',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'minimum_nights',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'price',
 'bedrooms_na',
 'bathrooms_na',
 'beds_na',
 'review_scores_rating_na',
 'review_scores_accuracy_na',
 'review_scores_cleanliness_na',
 'review_scores_checkin_na',
 'review_scores_communication_na',
 'review_scores_location_na',
 'review_scores_value_na']

In [24]:
train_x, test_x, train_y, test_y = train_test_split(df[num_cols], df['price'], random_state=42)
train_x.shape, train_y.shape, test_x.shape, test_y.shape

  and should_run_async(code)


((5359, 27), (5359,), (1787, 27), (1787,))

# MLflow

In [25]:
with mlflow.start_run(run_name='sklearn-rf-model') as run:
    rf = RandomForestRegressor(n_estimators=100, max_depth=10)
    rf.fit(train_x, train_y)
    preds = rf.predict(test_x)
    
    # Log hyperparameters
    mlflow.log_param('num_trees', 100)
    mlflow.log_param('max_depth', 10)
    
    # Log model
    mlflow.sklearn.log_model(rf, 'random-forest-model')
    
    # Log metrics
    mlflow.log_metric('mse', mean_squared_error(test_y, preds))
    mlflow.log_metric('mae', mean_absolute_error(test_y, preds))
    mlflow.log_metric('r2', r2_score(test_y, preds))

  and should_run_async(code)


In [26]:
!mlflow ui

  and should_run_async(code)


[2020-10-06 11:23:26 -0500] [46123] [INFO] Starting gunicorn 20.0.4
[2020-10-06 11:23:26 -0500] [46123] [INFO] Listening at: http://127.0.0.1:5000 (46123)
[2020-10-06 11:23:26 -0500] [46123] [INFO] Using worker: sync
[2020-10-06 11:23:26 -0500] [46125] [INFO] Booting worker with pid: 46125
^C
[2020-10-06 11:23:56 -0500] [46123] [INFO] Handling signal: int
[2020-10-06 11:23:56 -0500] [46125] [INFO] Worker exiting (pid: 46125)


In [27]:
spark_df = spark.createDataFrame(train_x)

  and should_run_async(code)


In [34]:
run.info.run_id

'8fd5c699328c4698abef73eb795f8b85'

In [67]:
def predict(iterator):
    model_path = f'runs:/{run.info.run_id}/random-forest-model'
    model = mlflow.sklearn.load_model(model_path)
    for features in iterator:
        yield pd.concat([features.reset_index(), pd.Series(model.predict(features), name='prediction')], axis=1)


In [68]:
schema = """
`host_total_listings_count` DOUBLE,`neighbourhood_cleansed` BIGINT,`latitude` DOUBLE,
`longitude` DOUBLE,`property_type` BIGINT,`room_type` BIGINT,`accommodates` DOUBLE,
`bathrooms` DOUBLE,`bedrooms` DOUBLE,`beds` DOUBLE,`bed_type` BIGINT,`minimum_nights` DOUBLE,
`number_of_reviews` DOUBLE,`review_scores_rating` DOUBLE,`review_scores_accuracy` DOUBLE,
`review_scores_cleanliness` DOUBLE,`review_scores_checkin` DOUBLE,`review_scores_communication` DOUBLE,
`review_scores_location` DOUBLE,`review_scores_value` DOUBLE, prediction DOUBLE
"""

In [69]:
pred_df = spark_df.mapInPandas(predict, schema=schema)
pred_df

DataFrame[prediction: double]