In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, unix_timestamp, lag, when, count, year, month, dayofmonth, dayofweek, hour
from pyspark.ml.feature import QuantileDiscretizer, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline

In [2]:
spark = SparkSession.builder.appName("NequiProject").getOrCreate()

In [3]:
sample006 = spark.read.parquet('./sample_data_0006_part_00.parquet')
sample007 = spark.read.parquet('./sample_data_0007_part_00.parquet')
samples = sample006.union(sample007)

In [4]:
samples = samples.na.drop()
samples = samples.dropDuplicates()

In [5]:
window_spec = Window().partitionBy("user_id").orderBy("transaction_date")
samples = samples.withColumn("time_diff",unix_timestamp("transaction_date") - unix_timestamp(lag("transaction_date").over(window_spec)))
samples = samples.withColumn("Y",when((col("time_diff") < 86400) | (lag("time_diff").over(window_spec) < 86400), 1).otherwise(0)).cache()

In [6]:
quantile_discretizer = QuantileDiscretizer(
    inputCol="transaction_amount",
    outputCol="transaction_category",
    numBuckets=3)

In [7]:
pipeline = Pipeline(stages=[quantile_discretizer])
pipeline_model = pipeline.fit(samples)
samples = pipeline_model.transform(samples)

In [8]:
sub_fq = samples.groupBy("subsidiary").agg(count("*").alias("subsidiary_count"))
samples = samples.join(sub_fq, on="subsidiary", how="left").cache()

In [9]:
samples = samples.withColumn("year", year(col("transaction_date"))) \
                 .withColumn("month", month(col("transaction_date"))) \
                 .withColumn("day", dayofmonth(col("transaction_date"))) \
                 .withColumn("day_of_week", dayofweek(col("transaction_date"))) \
                 .withColumn("hour", hour(col("transaction_date")))

In [10]:
samples = samples.withColumn("merchant_index",
                          when(col("merchant_id") == "075d178871d8d48502bf1f54887e52fe", 1)
                          .when(col("merchant_id") == "817d18cd3c31e40e9bff0566baae7758", 2)
                          .when(col("merchant_id") == "838a8fa992a4aa2fb5a0cf8b15b63755", 3)
                          .otherwise(col("merchant_id")))

In [11]:
samples = samples.withColumn("type_index",
                          when(col("transaction_type") == "DEBITO", 1)
                          .when(col("transaction_type") == "CREDITO", 2)
                          .otherwise(col("transaction_type"))).cache()

In [12]:
samples = samples.withColumn("merchant_index", col("merchant_index").cast("int"))
samples = samples.withColumn("type_index", col("type_index").cast("int"))

In [13]:
one_hot_encoder = OneHotEncoder(inputCol="merchant_index", outputCol="merchant_onehot")

pipeline = Pipeline(stages=[one_hot_encoder])
model = pipeline.fit(samples)
samples = model.transform(samples).cache()

In [14]:
one_hot_encoder = OneHotEncoder(inputCol="type_index", outputCol="type_onehot")

pipeline = Pipeline(stages=[one_hot_encoder])
model = pipeline.fit(samples)
samples = model.transform(samples).cache()

In [15]:
samples = samples.drop(*["time_diff","_id","transaction_amount","account_number","user_id","transaction_date","subsidiary"]).cache()

In [17]:
samples = samples.drop(*["merchant_id","transaction_type","merchant_index","type_index"]).cache()

In [18]:
samples.show()

+---+--------------------+----------------+----+-----+---+-----------+----+---------------+-------------+
|  Y|transaction_category|subsidiary_count|year|month|day|day_of_week|hour|merchant_onehot|  type_onehot|
+---+--------------------+----------------+----+-----+---+-----------+----+---------------+-------------+
|  1|                 0.0|            1247|2021|    7| 26|          2|  18|  (3,[2],[1.0])|(2,[1],[1.0])|
|  0|                 1.0|            1247|2021|   11| 22|          2|  16|  (3,[2],[1.0])|(2,[1],[1.0])|
|  0|                 1.0|            1247|2021|   11| 30|          3|  10|  (3,[2],[1.0])|(2,[1],[1.0])|
|  0|                 2.0|            1247|2021|    9| 24|          6|  11|  (3,[2],[1.0])|(2,[1],[1.0])|
|  0|                 2.0|            1247|2021|    8| 22|          1|  11|  (3,[2],[1.0])|(2,[1],[1.0])|
|  0|                 0.0|            1247|2021|    7| 15|          5|  11|  (3,[2],[1.0])|(2,[1],[1.0])|
|  0|                 2.0|            1247|202

In [19]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [20]:
feature_columns = [
    "transaction_category",
    "subsidiary_count",
    "year",
    "month",
    "day",
    "day_of_week",
    "hour",
    "merchant_onehot",
    "type_onehot"
]
label_column = "Y"

# Create a vector assembler
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [21]:
train_data, validation_data = samples.randomSplit([0.8, 0.2], seed=123)

In [22]:
rf = RandomForestClassifier(labelCol=label_column, featuresCol="features", seed=123)
gbt = GBTClassifier(labelCol=label_column, featuresCol="features", seed=123)

In [23]:
pipeline_rf = Pipeline(stages=[assembler, rf])
pipeline_gbt = Pipeline(stages=[assembler, gbt])

In [24]:
model_rf = pipeline_rf.fit(train_data)
model_gbt = pipeline_gbt.fit(train_data) # <-- Not enough resources, operation cancelled.

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: ignored

In [25]:
predictions_rf = model_rf.transform(validation_data)
# predictions_gbt = model_gbt.transform(validation_data)

In [26]:
evaluator = BinaryClassificationEvaluator(labelCol=label_column)
auc_rf = evaluator.evaluate(predictions_rf)
# auc_gbt = evaluator.evaluate(predictions_gbt)

In [27]:
print(f"Random Forest AUC: {auc_rf}")
# print(f"Gradient Boosting AUC: {auc_gbt}")

Random Forest AUC: 0.6556511878548554


In [None]:
model_rf.save('./RFC')