In [1]:
import pyspark.sql.functions as F
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import (
    OneHotEncoder,
    SQLTransformer,
    StringIndexer,
    VectorAssembler,
    Word2Vec,
)
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import (
    col,
    cos,
    radians,
    sin,
    sqrt,
    to_date,
)

In [2]:
warehouse = "project/hive/warehouse"

spark = (
    SparkSession.builder.appName("ML_team1")
    .master("yarn")
    .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("spark.sql.avro.compression.codec", "snappy")
    .config("spark.hadoop.hive.metastore.client.socket.timeout", "300")
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/18 21:00:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/18 21:00:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/18 21:00:23 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
25/05/18 21:00:23 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [3]:
spark.sql("USE team1_projectdb")
spark.sql("SHOW TABLES").show()

+---------------+------------+-----------+
|      namespace|   tableName|isTemporary|
+---------------+------------+-----------+
|team1_projectdb|  q1_results|      false|
|team1_projectdb|  q2_results|      false|
|team1_projectdb|  q3_results|      false|
|team1_projectdb|  q4_results|      false|
|team1_projectdb|  q5_results|      false|
|team1_projectdb|records_part|      false|
+---------------+------------+-----------+



In [4]:
df = spark.read.format("parquet").table("team1_projectdb.records_part")
df = df.dropna(subset=["review_scores_rating"])
df = df.sample(0.001, seed=15)

df = df.withColumn("host_since", to_date(col("host_since").cast("string"), "yyyyMMdd"))
df = df.filter(col("review_scores_rating").isNotNull())

In [5]:
split_amenities = SQLTransformer(
    statement="""
    SELECT *, split(coalesce(amenities, ''), ',\\s*') AS amenities_tokens FROM __THIS__
"""
)

word2vec = Word2Vec(
    inputCol="amenities_tokens",
    outputCol="amenities_vec",
    vectorSize=100,  # размерность вектора
    minCount=1,
)

In [6]:
class GeoToECEFTransformer(Transformer):
    def __init__(self, lat_col="latitude", lon_col="longitude", alt_col=None) -> None:
        super().__init__()
        self.lat_col = lat_col
        self.lon_col = lon_col
        self.alt_col = alt_col

    def _transform(self, df: DataFrame) -> DataFrame:
        # Constants
        a = 6378137.0
        e_sq = 6.69437999014e-3

        # Use altitude if available, else 0
        if self.alt_col and self.alt_col in df.columns:
            alt = col(self.alt_col)
        else:
            alt = F.lit(0.0)

        lat_rad = radians(col(self.lat_col))
        lon_rad = radians(col(self.lon_col))

        N = a / sqrt(1 - e_sq * sin(lat_rad) ** 2)

        x = (N + alt) * cos(lat_rad) * cos(lon_rad)
        y = (N + alt) * cos(lat_rad) * sin(lon_rad)
        z = (N * (1 - e_sq) + alt) * sin(lat_rad)

        return df.withColumn("x", x).withColumn("y", y).withColumn("z", z)

In [7]:
geo_transformer = GeoToECEFTransformer()

df = geo_transformer.transform(df)

In [8]:
categorical_cols = [
    "host_response_time",
    "neighbourhood",
    "property_type",
    "room_type",
    "bed_type",
    "cancellation_policy",
    "month",
]

boolean_cols = [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "instant_bookable",
    "require_guest_profile_picture",
    "kitchen",
    "wifi",
    "essentials",
    "tv",
    "air_conditioning",
    "elevator",
    "washer",
    "hangers",
    "iron",
    "laptop_friendly_workspace",
    "family_kid_friendly",
    "hot_water",
    "cable_tv",
    "free_parking_on_premises",
    "hair_dryer",
    "smoking_allowed",
    "doorman",
    "dishes_and_silverware",
    "buzzer_wireless_intercom",
    "refrigerator",
]

numerical_cols = [
    "x",
    "y",
    "z",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "price",
    "security_deposit",
    "cleaning_fee",
    "guests_included",
    "extra_people",
    "minimum_nights",
    "maximum_nights",
]

In [9]:
indexers = [
    StringIndexer(inputCol=col, outputCol=col + "_index", handleInvalid="keep")
    for col in categorical_cols
]
encoders = [
    OneHotEncoder(inputCol=col + "_index", outputCol=col + "_vec")
    for col in categorical_cols
]

# Список фич
encoded_cols = [encoder.getOutputCol() for encoder in encoders]
assembler_inputs = encoded_cols + boolean_cols + numerical_cols
assembler_inputs = encoded_cols + boolean_cols + numerical_cols + ["amenities_vec"]

assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

In [10]:
pipeline = Pipeline(
    stages=[
        split_amenities,
        word2vec,
        geo_transformer,
        *indexers,
        *encoders,
        assembler,
    ]
)

pipeline_model = pipeline.fit(df)
df_prepared = pipeline_model.transform(df)

                                                                                

In [11]:
train_df, test_df = df_prepared.randomSplit([0.8, 0.2], seed=42)

train_df.select("features", "review_scores_rating").write.mode("overwrite").json(
    "project/data/train",
)
test_df.select("features", "review_scores_rating").write.mode("overwrite").json(
    "project/data/test",
)

                                                                                

## MODEL 1

In [12]:
lr = LinearRegression(featuresCol="features", labelCol="review_scores_rating")

In [13]:
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1, 1.0])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

evaluator = RegressionEvaluator(
    labelCol="review_scores_rating",
    predictionCol="prediction",
    metricName="rmse",
)
r2_evaluator = RegressionEvaluator(
    labelCol="target",  # замени на свою колонку таргета
    predictionCol="prediction",
    metricName="r2",
)

mae_evaluator = RegressionEvaluator(
    labelCol="target",  # замени на свою колонку таргета
    predictionCol="prediction",
    metricName="mae",
)
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
)

In [14]:
cv_model = cv.fit(train_df)
best_model = cv_model.bestModel
best_model.save("project/models/model1")

                                                                                

In [15]:
predictions = best_model.transform(test_df)

predictions.select("review_scores_rating", "prediction").coalesce(1).write.mode(
    "overwrite",
).csv("project/output/model1_predictions", header=True)

rmse = evaluator.evaluate(predictions)
r2 = r2_evaluator.evaluate(predictions)
mae = mae_evaluator.evaluate(predictions)

                                                                                

## MODEL 2

In [16]:
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="review_scores_rating",
    seed=42,
)

In [17]:
paramGrid_rf = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 10])
    .addGrid(rf.maxDepth, [5, 10])
    .build()
)

In [18]:
evaluator_rf = RegressionEvaluator(
    labelCol="review_scores_rating",
    predictionCol="prediction",
    metricName="rmse",
)

cv_rf = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid_rf,
    evaluator=evaluator_rf,
    numFolds=3,
)

In [19]:
cv_model_rf = cv_rf.fit(train_df)
best_model_rf = cv_model_rf.bestModel
best_model_rf.save("project/models/model2")

                                                                                

In [20]:
predictions_rf = best_model_rf.transform(test_df)

predictions_rf.select("review_scores_rating", "prediction").coalesce(1).write.mode(
    "overwrite",
).csv("project/output/model2_predictions", header=True)

                                                                                

In [21]:
rmse_rf = evaluator_rf.evaluate(predictions_rf)
r2_rf = r2_evaluator.evaluate(predictions_rf)
mae_rf = mae_evaluator.evaluate(predictions_rf)

                                                                                

In [22]:
models = [
    [str(best_model), rmse, r2, mae],
    [str(best_model_rf), rmse_rf, r2_rf, mae_rf],
]

# temp = list(map(list, models.items()))
df = spark.createDataFrame(models, ["model", "RMSE", "R2", "MAE"])
df.show(truncate=False)

# Save it to HDFS
df.coalesce(1).write.mode("overwrite").format("csv").option("sep", ",").option(
    "header",
    "true",
).save("project/output/evaluation.csv")

                                                                                

+-------------------------------------------------------------------------------------------------+-----------------+
|model                                                                                            |RMSE             |
+-------------------------------------------------------------------------------------------------+-----------------+
|LinearRegressionModel: uid=LinearRegression_3fbe6bd617c4, numFeatures=217                        |7.343552085207993|
|RandomForestRegressionModel: uid=RandomForestRegressor_63f978ac4f4f, numTrees=10, numFeatures=217|8.082929035332674|
+-------------------------------------------------------------------------------------------------+-----------------+

