In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import FMClassifier
from pyspark.ml.feature import MinMaxScaler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# $example off$
from pyspark.sql import SparkSession


In [16]:
spark = SparkSession \
    .builder \
    .appName("FMClassifierExample") \
    .getOrCreate()

# $example on$
# Load and parse the data file, converting it to a DataFrame.


data = spark.read.option("delimiter", "\t").csv("./movie_sample.txt",inferSchema="true")

In [17]:
data.show()

+---+---+---+--------------------+---+---+---+---+
|_c0|_c1|_c2|                 _c3|_c4|_c5|_c6|_c7|
+---+---+---+--------------------+---+---+---+---+
|  1|  1|  1|186,0,0,0,0,0,0,0...|  1|112|  2|  1|
|  1|  1|  1|186,0,0,0,0,0,0,0...|  1| 38|  5|  0|
|  1|  1|  1|186,0,0,0,0,0,0,0...|  1|151|  7|  0|
|  1|  1|  1|186,0,0,0,0,0,0,0...|  1| 77|  6|  0|
|  1|  1|  1|186,0,0,0,0,0,0,0...|  1|188|  9|  0|
|  1|  1|  1|186,0,0,0,0,0,0,0...|  1|164|  9|  0|
|  1|  1|  1|112,186,0,0,0,0,0...|  2| 84|  7|  1|
|  1|  1|  1|112,186,0,0,0,0,0...|  2| 76|  8|  0|
|  1|  1|  1|112,186,0,0,0,0,0...|  2|162|  6|  0|
|  1|  1|  1|112,186,0,0,0,0,0...|  2| 94|  1|  0|
|  1|  1|  1|112,186,0,0,0,0,0...|  2| 73|  5|  0|
|  1|  1|  1|112,186,0,0,0,0,0...|  2| 49|  7|  0|
|  1|  1|  1|84,112,186,0,0,0,...|  3| 52|  4|  1|
|  1|  1|  1|84,112,186,0,0,0,...|  3|205|  3|  0|
|  1|  1|  1|84,112,186,0,0,0,...|  3|100|  2|  0|
|  1|  1|  1|84,112,186,0,0,0,...|  3|141|  9|  0|
|  1|  1|  1|84,112,186,0,0,0,.

In [18]:
data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: integer (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: integer (nullable = true)
 |-- _c7: integer (nullable = true)



In [10]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [19]:
features_cols =['_c0', '_c1', '_c2', '_c4', '_c5', '_c6']


print(features_cols)

['_c0', '_c1', '_c2', '_c4', '_c5', '_c6']


In [20]:
# 组合特征到向量
assembler = VectorAssembler(
    inputCols=features_cols,
    outputCol="raw_features")

In [21]:
data_df2 = assembler.transform(data)

In [22]:
data_df2.printSchema()
data_df2.show(5, False)

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: integer (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: integer (nullable = true)
 |-- _c7: integer (nullable = true)
 |-- raw_features: vector (nullable = true)

+---+---+---+-----------------------------------------------------------------------------------------------------+---+---+---+---+---------------------------+
|_c0|_c1|_c2|_c3                                                                                                  |_c4|_c5|_c6|_c7|raw_features               |
+---+---+---+-----------------------------------------------------------------------------------------------------+---+---+---+---+---------------------------+
|1  |1  |1  |186,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0|1  |112|2  |1  |[1.0,1.0,1.0,1.0,112.0,2.0]|
|1  |1  |1  |186,0,0,0,

In [25]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="_c7", outputCol="indexedLabel").fit(data)
# Scale features.
featureScaler = MinMaxScaler(inputCol="raw_features", outputCol="scaledFeatures").fit(data_df2)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data_df2.randomSplit([0.7, 0.3])

# Train a FM model.
fm = FMClassifier(labelCol="indexedLabel", featuresCol="scaledFeatures", stepSize=0.001)

# Create a Pipeline.
pipeline = Pipeline(stages=[labelIndexer, featureScaler, fm])

# Train model.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel").show(5)

+----------+------------+
|prediction|indexedLabel|
+----------+------------+
|       0.0|         1.0|
|       0.0|         0.0|
|       0.0|         1.0|
|       0.0|         0.0|
|       0.0|         0.0|
+----------+------------+
only showing top 5 rows



In [26]:
# Select (prediction, true label) and compute test accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = %g" % accuracy)

fmModel = model.stages[2]
print("Factors: " + str(fmModel.factors))  # type: ignore
print("Linear: " + str(fmModel.linear))  # type: ignore
print("Intercept: " + str(fmModel.intercept))  # type: ignore


Test set accuracy = 0.819071
Factors: DenseMatrix([[ 0.1330383 , -0.11864123,  0.11330017, -0.13629388,  0.11957772,
              -0.11329133, -0.14346172, -0.07030867],
             [ 0.12839952, -0.12809056,  0.12460184,  0.13212282, -0.13688827,
               0.12732801,  0.12477254, -0.12811037],
             [-0.13290206,  0.12658505, -0.12257014, -0.13740224,  0.11728812,
              -0.13022489,  0.11531875,  0.14038232],
             [-0.09194098,  0.13996452, -0.13189047,  0.13261386,  0.10930607,
              -0.13070978, -0.11144282,  0.12484725],
             [-0.00859958, -0.11940727, -0.1232649 ,  0.1375973 ,  0.1111191 ,
               0.10445189,  0.12537354,  0.12652078],
             [-0.12505186,  0.13820904,  0.07334647, -0.13296544, -0.14616981,
               0.10634416, -0.13609166, -0.13200748]])
Linear: [-0.09134636110643793,-0.09175728099843432,-0.09134636110643793,-0.09241335370239907,-0.09269074096386501,-0.09251396751381845]
Intercept: -0.0930581296416