### 一、最大最小法

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook: master "local[4]",
# application name "mlib".
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("mlib")
    .getOrCreate()
)

In [11]:
from pyspark.ml.linalg import Vectors

# Three-row demo DataFrame: an integer id plus a dense 3-element feature vector.
dataFrame = spark.createDataFrame(
    [
        (0, Vectors.dense([1.0, 0.1, -1.0])),
        (1, Vectors.dense([2.0, 1.1, 1.0])),
        (2, Vectors.dense([3.0, 10.1, 3.0])),
    ],
    schema=["id", "features"],
)

In [12]:
# Print the rows of dataFrame to the cell output.
dataFrame.show()

+---+--------------+
| id|      features|
+---+--------------+
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  2|[3.0,10.1,3.0]|
+---+--------------+



In [13]:
from pyspark.ml.feature import MinMaxScaler

# Min-max scaler that reads the "features" column and writes "scaledFeatures".
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

In [15]:
# Compute the summary statistics on dataFrame and produce the fitted MinMaxScalerModel.
scalerModel = scaler.fit(dataFrame)
# Rescale each feature to the range [min, max]; the result lands in the scaler's output column.
scaledData = scalerModel.transform(dataFrame)

In [16]:
# Print the original and the rescaled feature vectors side by side.
scaledData.select("features", "scaledFeatures").show()

+--------------+--------------+
|      features|scaledFeatures|
+--------------+--------------+
|[1.0,0.1,-1.0]|     (3,[],[])|
| [2.0,1.1,1.0]| [0.5,0.1,0.5]|
|[3.0,10.1,3.0]| [1.0,1.0,1.0]|
+--------------+--------------+



### 二、分类算法

In [33]:
from pyspark.ml.feature import VectorIndexer

# Load the libsvm-formatted sample file into a DataFrame.
data = spark.read.load("../data/mllib/sample_libsvm_data.txt", format="libsvm")

In [7]:
# Print the first rows of the loaded libsvm data.
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(215,[127,128,129...|
|  1.0|(215,[158,159,160...|
|  1.0|(215,[124,125,126...|
|  1.0|(215,[152,153,154...|
|  1.0|(215,[151,152,153...|
+-----+--------------------+



In [34]:
# Fit a VectorIndexer on the full data set: it reads "features" and writes
# "indexedFeatures", with maxCategories set to 4.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

In [35]:
# Split into training and test sets with weights 0.7 / 0.3.
# The second argument fixes the random seed for the split.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=1)
trainingData.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[95,96,97,12...|
|  0.0|(692,[98,99,100,1...|
|  0.0|(692,[100,101,102...|
|  0.0|(692,[121,122,123...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[127,128,129...|
|  0.0|(692,[127,128,129...|
|  0.0|(692,[127,128,129...|
+-----+--------------------+
only showing top 20 rows



In [36]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.regression import DecisionTreeRegressor

# Decision-tree *classification* estimator with default hyper-parameters.
# This section is about classification, so a DecisionTreeClassifier is used
# (a DecisionTreeRegressor would fit a regression tree on the 0.0/1.0 labels).
# It reads the label from "label" and the features from "indexedFeatures",
# the column written by featureIndexer.
dt = DecisionTreeClassifier()\
      .setLabelCol("label")\
      .setFeaturesCol("indexedFeatures")

In [39]:
from pyspark.ml import Pipeline

# Chain the feature indexer and the tree estimator into one pipeline.
pipeline = Pipeline(stages=[featureIndexer, dt])

# Fit the pipeline on the training split, then score the test split.
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

# Show the first 20 rows (label and raw features only).
# NOTE(review): the "prediction" column is not selected here.
predictions.select("label", "features").show(n=20)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[122,123,124...|
|  0.0|(692,[122,123,148...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[125,126,127...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[150,151,152...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[234,235,237...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[99,100,101,...|
|  1.0|(692,[123,124,125...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[125,126,127...|
|  1.0|(692,[127,128,129...|
|  1.0|(692,[127,128,155...|
+-----+--------------------+
only showing top 20 rows



In [38]:
# Print the first rows of the test split.
testData.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[122,123,124...|
|  0.0|(692,[122,123,148...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[125,126,127...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[150,151,152...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[234,235,237...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[99,100,101,...|
|  1.0|(692,[123,124,125...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[125,126,127...|
|  1.0|(692,[127,128,129...|
|  1.0|(692,[127,128,155...|
+-----+--------------------+
only showing top 20 rows



### 三、随机森林

In [40]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, IndexToString

# Index the label column with a StringIndexer fitted on the full data set.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Index the feature vectors with a VectorIndexer (maxCategories set to 4).
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

# Split into training and test sets with weights 0.7 / 0.3 and seed 1.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=1)

# Random-forest classifier over the indexed label/feature columns, using 10 trees.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=10,
)

# Map predicted indices back to the original label values, using the
# labels collected by labelIndexer.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

In [41]:
# Assemble the four stages into one pipeline: label indexing, feature indexing,
# the random forest, and conversion of predicted indices back to labels.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Fit on the training split, then score the test split.
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

# Show the first 10 rows: predicted label, true label and raw features.
predictions.select("predictedLabel", "label", "features").show(n=10)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           0.0|  0.0|(692,[122,123,124...|
|           0.0|  0.0|(692,[122,123,148...|
|           0.0|  0.0|(692,[124,125,126...|
|           0.0|  0.0|(692,[124,125,126...|
|           0.0|  0.0|(692,[125,126,127...|
|           0.0|  0.0|(692,[126,127,128...|
|           0.0|  0.0|(692,[126,127,128...|
|           0.0|  0.0|(692,[150,151,152...|
|           0.0|  0.0|(692,[152,153,154...|
|           0.0|  0.0|(692,[152,153,154...|
+--------------+-----+--------------------+
only showing top 10 rows



In [42]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Measure accuracy by comparing "indexedLabel" with "prediction",
# then report the error as 1 - accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("输出误差 {}".format(1.0 - accuracy))

输出误差 0.03703703703703709


In [43]:
from pyspark.ml.classification import RandomForestClassificationModel

# Pull the fitted random-forest stage out of the PipelineModel; index 2 is the
# position of `rf` in the stage tuple the pipeline was built with.
rfModel = model.stages[2]
# Print the model's debug string (the decision rules of every tree).
print("输出rfModel的决策过程:\n {}".format(rfModel.toDebugString))

输出rfModel的决策过程:
 RandomForestClassificationModel: uid=RandomForestClassifier_036d84300332, numTrees=10, numClasses=2, numFeatures=692
  Tree 0 (weight 1.0):
    If (feature 434 <= 70.5)
     Predict: 1.0
    Else (feature 434 > 70.5)
     Predict: 0.0
  Tree 1 (weight 1.0):
    If (feature 245 <= 16.0)
     If (feature 484 <= 5.0)
      Predict: 0.0
     Else (feature 484 > 5.0)
      Predict: 1.0
    Else (feature 245 > 16.0)
     Predict: 1.0
  Tree 2 (weight 1.0):
    If (feature 413 <= 4.0)
     If (feature 555 <= 58.5)
      If (feature 511 <= 15.0)
       Predict: 0.0
      Else (feature 511 > 15.0)
       Predict: 1.0
     Else (feature 555 > 58.5)
      Predict: 1.0
    Else (feature 413 > 4.0)
     Predict: 1.0
  Tree 3 (weight 1.0):
    If (feature 578 <= 6.5)
     If (feature 608 <= 5.0)
      Predict: 0.0
     Else (feature 608 > 5.0)
      Predict: 1.0
    Else (feature 578 > 6.5)
     If (feature 385 <= 4.0)
      If (feature 576 <= 70.5)
       Predict: 0.0
      Else (f

### 四、聚类

In [44]:
# Load the libsvm-formatted k-means sample file into a DataFrame.
dataset = spark.read.load("../data/mllib/sample_kmeans_data.txt", format="libsvm")

In [45]:
from pyspark.ml.clustering import KMeans

# K-means estimator with k=2 clusters and a fixed random seed of 1.
kmeans = KMeans(k=2, seed=1)

# Fit the model, then assign every row of the same data set to a cluster.
model = kmeans.fit(dataset)
predictions = model.transform(dataset)

In [51]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Evaluate the clustering by computing the Silhouette score
# (silhouette-coefficient based clustering evaluation).
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(r"轮廓系数 {}".format(silhouette))
# Show the result: print a heading, then let the cell display the cluster centers
# as its last expression.
print("输出聚类中心 ")
model.clusterCenters()

轮廓系数 0.9997530305375207
输出聚类中心 


[array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]

### 五、协同过滤实操

In [2]:
from pyspark.sql.types import StringType, StructField, StructType, IntegerType, FloatType, LongType

# Directory holding the ml-25m CSV files.
# NOTE(review): this is a machine-specific absolute path; adjust before running elsewhere.
root_path = "D:/大三下/Big_Data_Application_Spark/datasets/ml-25m/"

# Explicit schema for ratings.csv (all columns nullable).
schema = StructType([
    StructField("userId", IntegerType(), nullable=True),
    StructField("movieId", IntegerType(), nullable=True),
    StructField("rating", FloatType(), nullable=True),
    StructField("timestamp", LongType(), nullable=True),
])

# Read the ratings file, treating the first line as a header.
ratings = (
    spark.read
    .option("header", True)
    .csv(root_path + "ratings.csv", schema=schema)
    .toDF("userId", "movieId", "rating", "timestamp")
)

In [3]:
# Print the first rows, then let the cell display the DataFrame's repr (its schema summary).
ratings.show()
ratings

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



DataFrame[userId: int, movieId: int, rating: float, timestamp: bigint]

In [4]:
# Explicit schema for movies.csv (all columns nullable).
# Note: this rebinds `schema`, which previously held the ratings schema.
schema = StructType([
    StructField("movieId", IntegerType(), nullable=True),
    StructField("title", StringType(), nullable=True),
    StructField("genres", StringType(), nullable=True),
])

# Read the movies file, treating the first line as a header.
movies = (
    spark.read
    .option("header", True)
    .csv(root_path + "movies.csv", schema=schema)
    .toDF("movieId", "title", "genres")
)

In [6]:
# Count distinct movies, distinct users, and total rating rows.
movie_count = ratings.select("movieId").distinct().count()
user_count = ratings.select("userId").distinct().count()
total_count = ratings.count()

summary_template = "一共有 {} 条数据，其中电影条数为 {} ，用户数为 {}"
print(summary_template.format(total_count, movie_count, user_count))

一共有 25000095 条数据，其中电影条数为 59047 ，用户数为 162541
一共有 25000095 条数据，其中电影条数为 59047 ，用户数为 162541


In [7]:
# Split the ratings into training and test sets with weights 0.8 / 0.2 and seed 1.
training, test = ratings.randomSplit([0.8, 0.2], seed=1)

In [8]:
from pyspark.ml.recommendation import ALS

# Configure the ALS (alternating least squares) estimator.
# The parameter notes are real comments here rather than a bare string literal.
als = ALS(
    maxIter=5,                  # maximum number of iterations
    regParam=0.01,              # regularization parameter, to limit overfitting
    userCol="userId",           # column holding the user id
    itemCol="movieId",          # column holding the item (movie) id
    ratingCol="rating",         # column holding the rating
    coldStartStrategy="drop",   # cold-start strategy "drop": NaN predictions are not evaluated
)

# Fit on the training split, then predict ratings for the test split.
model = als.fit(training)
predictions = model.transform(test)

In [9]:
from pyspark.ml.evaluation import RegressionEvaluator

# Regression evaluator using the "rmse" metric, comparing the "rating" column
# with the "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

In [10]:
# Evaluate the predictions with the configured evaluator (metric name "rmse").
rmse = evaluator.evaluate(predictions)
# The value is a *root* mean squared error, so the label says 均方根误差
# (RMSE) rather than 均方误差 (MSE).
print("均方根误差为：{}".format(rmse))

均方误差为：0.8164297056639642


In [11]:
# Top 10 recommended movies for every user.
userRecs = model.recommendForAllUsers(10)
# Top 10 recommended users for every movie.
movieRecs = model.recommendForAllItems(10)
# Top 10 movies for a chosen subset of users: 3 distinct user ids
# (no ordering is applied, so which 3 are picked is not determined here).
users = ratings.select("userId").distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Top 10 users for a chosen subset of movies: 3 distinct movie ids, picked the same way.
ms = ratings.select("movieId").distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(ms, 10)



In [None]:
# Show the first 10 rows of the per-user and per-movie recommendations,
# then the recommendations for the selected user subset (default row limit).
userRecs.show(10)
movieRecs.show(10)
userSubsetRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{153066, 18.7691...|
|     6|[{118542, 13.8124...|
|    12|[{198537, 12.1441...|
|    13|[{153002, 13.0355...|
|    16|[{153002, 13.4138...|
|    22|[{160238, 21.5196...|
|    26|[{153002, 13.2202...|
|    27|[{167534, 16.3352...|
|    28|[{154634, 16.3481...|
|    31|[{170207, 8.12074...|
+------+--------------------+
only showing top 10 rows



In [None]:
# Print every recommendation row for the user subset and the movie subset.
# The rows are collected to the driver first so the output appears in the
# notebook; both subsets were built from 3 ids, so the results are small.
for row in userSubsetRecs.collect():
    print(row)
for row in movieSubSetRecs.collect():
    print(row)