In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import FMClassifier
from pyspark.ml.feature import MinMaxScaler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# $example off$
from pyspark.sql import SparkSession


In [3]:
# Obtain a Spark session for this example (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName("FMClassifierExample")
    .getOrCreate()
)

# $example on$
# Read the libsvm-formatted sample file into a DataFrame.
data = spark.read.format("libsvm").load("./sample_libsvm_data.txt")

In [4]:
# Fixed seed shared by the random split and the FM trainer so that a
# Restart & Run All produces the same split, model and accuracy each time.
SEED = 42

# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Scale features.
# NOTE(review): the scaler is fit on the full dataset *before* the split below,
# so held-out rows contribute to the min/max statistics. Kept as-is here so
# `featureScaler` remains a fitted model; consider fitting on training data only.
featureScaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures").fit(data)

# Split the data into training and test sets (30% held out for testing).
# The explicit seed makes the split repeatable for the same input data.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=SEED)

# Configure the factorization machine classifier; it reads the indexed label
# and the scaled feature column produced by the two stages above.
fm = FMClassifier(
    labelCol="indexedLabel",
    featuresCol="scaledFeatures",
    stepSize=0.001,
    seed=SEED,
)

# Chain the already-fitted indexer and scaler with the FM estimator.
pipeline = Pipeline(stages=[labelIndexer, featureScaler, fm])

# Train model on the training split only.
model = pipeline.fit(trainingData)

# Score the held-out rows with the fitted pipeline.
predictions = model.transform(testData)

# Preview a handful of scored rows.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Compare predictions against the indexed labels to obtain test accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = %g" % accuracy)

# The trained FM model is the final stage of the fitted pipeline;
# print its learned factor matrix and linear coefficients.
fmModel = model.stages[-1]
print("Factors: " + str(fmModel.factors))  # type: ignore
print("Linear: " + str(fmModel.linear))  # type: ignore

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         1.0|(692,[100,101,102...|
|       1.0|         1.0|(692,[122,123,124...|
|       1.0|         1.0|(692,[122,123,148...|
|       1.0|         1.0|(692,[124,125,126...|
|       1.0|         1.0|(692,[125,126,127...|
+----------+------------+--------------------+
only showing top 5 rows

Test set accuracy = 1
Factors: DenseMatrix([[ 0.00361783,  0.00370736, -0.00895524, ...,  0.01505403,
              -0.00986316, -0.01014237],
             [-0.00473607, -0.00099352, -0.00425292, ..., -0.00298974,
              -0.01663548,  0.02146898],
             [ 0.00921285, -0.00486104,  0.00923078, ...,  0.02295305,
               0.00903289,  0.01458809],
             ...,
             [-0.00767841, -0.00852137, -0.0248619 , ...,  0.02163463,
              -0.02875074,  0.03058925],
             [-0.01823533, -0.00201777, -0.01428901, ..