In [21]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString, OneHotEncoder
from pyspark.ml.classification import LogisticRegression, GBTClassifier, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# from pyspark.sql import functions as F

In [2]:
# Session-level settings: team tag used in the application name and the
# Hive warehouse directory passed to Spark SQL.
team = "team3"
warehouse = "project/hive/warehouse"

# Build (or reuse) a Hive-enabled Spark session running on YARN.
spark = (
    SparkSession.builder
    .appName(f"{team} - Spark ML")
    .master("yarn")
    .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")
    .config("spark.sql.warehouse.dir", warehouse)
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/18 15:51:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/18 15:51:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/18 15:51:34 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
25/05/18 15:51:34 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [3]:
# Load the three project tables from the team database, in the same order
# as the names on the left-hand side.
transactions, cash_withdrawals, locations = (
    spark.read.format("avro").table(table_name)
    for table_name in (
        "team3_projectdb.transactions",
        "team3_projectdb.cash_withdrawals",
        "team3_projectdb.locations",
    )
)

In [4]:
# Inner-join transactions with cash withdrawals on (h3_09, customer_id),
# then attach locations by h3_09 and discard the raw coordinates.
data = (
    transactions
    .join(cash_withdrawals, on=["h3_09", "customer_id"], how="inner")
    .join(locations, on=["h3_09"], how="inner")
    .drop("lat", "lon")
)

In [5]:
# Numeric input columns taken directly from the joined tables; used both for
# null filtering and as the base of the model feature vector.
original_features = [
    "datetime_id",
    "count",
    "sum",
    "avg",
    "min",
    "max",
    "std",
    "count_distinct",
]

In [6]:
# Discard every row that has a null in any of the base feature columns.
data = data.na.drop(how="any", subset=original_features)

In [20]:
# String-typed columns that get a "<column>_encoded" one-hot output.
string_columns = [
    "h3_09",
]

In [7]:
# Fit an indexer that maps each h3_09 string to a numeric `label` column,
# then add that column to the working DataFrame. The fitted model is kept
# because its `labels` are needed to translate predictions back to h3_09.
label_indexer = StringIndexer(
    inputCol="h3_09",
    outputCol="label",
).fit(data)

data = label_indexer.transform(data)

                                                                                

In [29]:
# One OneHotEncoder per entry of `string_columns`. Every encoder reads the
# already-indexed "label" column and writes "<column>_encoded".
# NOTE(review): the input column does not depend on `column`, so this only
# makes sense while `string_columns` holds the single label source column.
encoders = [
    OneHotEncoder(
        inputCol="label",
        outputCol=f"{column}_encoded",
    )
    for column in string_columns
]

In [30]:
# Chain the encoders into one pipeline, fit it on the full DataFrame and
# apply it to the same data to obtain the one-hot encoded copy.
encoding_pipeline = Pipeline(stages=encoders)
encoding_model = encoding_pipeline.fit(data)
encoded_data = encoding_model.transform(data)

In [31]:
# Display the first joined row to check the columns and the new `label` index.
data.head()

Row(h3_09='8911aa7a6d3ffff', customer_id=107, transaction_pk=61, count=4, sum=3630.75, avg=907.6875, min=423.0, max=1825.9200439453125, std=640.25927734375, count_distinct=2, datetime_id=3, mcc_code=13, label=122.0)

In [32]:
# NOTE(review): the result of drop() is not assigned, so `data` itself still
# contains "h3_09" after this cell; the line only displays the schema without
# that column. The last prediction cell selects "h3_09", so the column must
# stay in `data` - confirm whether this cell is still needed at all.
data.drop("h3_09")

DataFrame[customer_id: bigint, transaction_pk: bigint, count: smallint, sum: float, avg: float, min: float, max: float, std: float, count_distinct: smallint, datetime_id: smallint, mcc_code: smallint, label: double]

In [33]:
# Index the merchant category code so it can be fed to the VectorAssembler.
# The indexer is fitted inside each pipeline on the training split / CV fold
# only, so the held-out rows can contain an mcc_code it has never seen. With
# the default handleInvalid="error" that makes transform() raise; "keep"
# routes unseen codes to one extra index instead of failing.
mcc_indexer = StringIndexer(
    inputCol="mcc_code",
    outputCol="mcc_code_index",
    handleInvalid="keep",
)

In [34]:
# Model inputs: the base numeric columns plus the indexed merchant code.
feature_cols = [*original_features, "mcc_code_index"]

# Pack the input columns into a single "features" vector; rows the assembler
# considers invalid are skipped rather than raising.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=feature_cols,
    handleInvalid="skip",
)

In [35]:
# Baseline classifier: logistic regression on the assembled feature vector,
# limited to 10 optimisation iterations.
lr = LogisticRegression(
    maxIter=10,
    labelCol="label",
    featuresCol="features",
    predictionCol="prediction",
)

In [36]:
# Gradient-boosted trees on the same feature vector; maxDepth and maxIter
# set here are the starting values that the parameter grid later overrides.
# NOTE(review): Spark documents GBTClassifier as supporting binary labels
# only, while `label` here is an index over many h3_09 cells - confirm the
# estimator is wrapped in a multiclass reduction before it is fitted.
gbt = GBTClassifier(
    maxDepth=5,
    maxIter=50,
    labelCol="label",
    featuresCol="features",
    predictionCol="prediction",
)

In [37]:
# Logistic regression handles the multi-class `label` directly.
pipeline_lr = Pipeline(stages=[mcc_indexer, assembler, lr])

# GBTClassifier only accepts binary labels, but `label` is an index over many
# h3_09 cells (values such as 122.0 and 832.0 appear in the data), so fitting
# it directly would be rejected. OneVsRest reduces the task to one binary GBT
# per class. `gbt` stays the wrapped estimator, so grids built from
# gbt.maxDepth / gbt.maxIter still address its parameters.
# NOTE(review): this trains one GBT per class and is expensive with many
# classes - consider raising OneVsRest's parallelism or shrinking the grid.
gbt_multiclass = OneVsRest(
    classifier=gbt,
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
)
pipeline_gbt = Pipeline(stages=[mcc_indexer, assembler, gbt_multiclass])

In [38]:
# Seeded 80/20 random split into training and held-out test rows.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

In [39]:
# Display the first row of the training split as a quick sanity check.
train_data.head()

                                                                                

Row(h3_09='8911818610bffff', customer_id=1924, transaction_pk=1143088, count=2, sum=6395.0, avg=3197.5, min=1198.0, max=5197.0, std=2827.719970703125, count_distinct=2, datetime_id=2, mcc_code=10, label=832.0)

In [40]:
# Search grid for the GBT estimator: 2 tree depths x 2 iteration counts,
# i.e. four parameter combinations.
paramGrid_gbt = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [3, 5])
    .addGrid(gbt.maxIter, [50, 100])
    .build()
)

In [41]:
# Two evaluators over the same label/prediction columns, differing only in
# the metric they report (accuracy first, then F1).
evaluator_accuracy, evaluator_f1 = (
    MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric,
    )
    for metric in ("accuracy", "f1")
)

In [42]:
# 3-fold cross-validation of the GBT pipeline over `paramGrid_gbt`, selecting
# the best parameter combination by F1 and fitting up to 4 models in parallel.
# The seed pins the fold assignment so that model selection is reproducible
# across runs (same value as the train/test split seed).
cv_gbt = CrossValidator(
    estimator=pipeline_gbt,
    estimatorParamMaps=paramGrid_gbt,
    evaluator=evaluator_f1,
    numFolds=3,
    parallelism=4,
    seed=42,
)

In [43]:
# Fit the logistic-regression pipeline on the training split.
lr_model = pipeline_lr.fit(dataset=train_data)

                                                                                

In [46]:
# Fit the cross-validated GBT pipeline. Later cells read `cv_gbt_model`, so
# leaving this commented out makes them fail with NameError on a fresh run.
# This is the expensive step: every grid combination is trained once per
# fold, followed by a final refit of the best combination.
cv_gbt_model = cv_gbt.fit(train_data)

In [44]:
# Score the held-out test split with the fitted logistic-regression pipeline.
lr_predictions = lr_model.transform(dataset=test_data)

In [None]:
# Score the held-out test split with the cross-validated GBT model.
gbt_predictions = cv_gbt_model.transform(dataset=test_data)

In [47]:
# Report accuracy and F1 on the test split, one model after the other. Each
# metric is evaluated right before it is printed, so earlier lines are still
# shown if a later evaluation fails.
print("Logistic Regression Results:")
lr_accuracy = evaluator_accuracy.evaluate(lr_predictions)
print(f"Accuracy: {lr_accuracy}")
lr_f1 = evaluator_f1.evaluate(lr_predictions)
print(f"F1-Score: {lr_f1}\n")

print("GBT Results:")
gbt_accuracy = evaluator_accuracy.evaluate(gbt_predictions)
print(f"Accuracy: {gbt_accuracy}")
gbt_f1 = evaluator_f1.evaluate(gbt_predictions)
print(f"F1-Score: {gbt_f1}")

Logistic Regression Results:


                                                                                

Accuracy: 0.16850529958901148




F1-Score: 0.055522637971441635



                                                                                

In [None]:
# Translate numeric predictions back to h3_09 strings using the label
# vocabulary learned by `label_indexer`.
h3_labels = label_indexer.labels
label_converter = IndexToString(
    labels=h3_labels,
    inputCol="prediction",
    outputCol="predicted_h3_09",
)

# Show the true and predicted cell side by side for the first five rows.
final_predictions = label_converter.transform(gbt_predictions)
h3_comparison = final_predictions.select("h3_09", "predicted_h3_09")
h3_comparison.show(5)

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()