# Chapter11.SparkML

### 示例：垃圾邮件分类

In [1]:
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD
import os
sc = SparkContext('local[*]', 'SparkML')
parent_file = os.path.abspath('../')
spam = sc.textFile('file://{}/files/spam.txt'.format(parent_file))
normal = sc.textFile('file://{}/files/ham.txt'.format(parent_file))

In [2]:
# 创建一个HashingTF实例来把文本映射为包含10000个特征的向量
tf = HashingTF(numFeatures=10000)
# 各邮件都被切分为单词，每个单词都被映射为一个特征
spamFeatures = spam.map(lambda email: tf.transform(email.split(' ')))
print(spamFeatures.collect())
normalFeatures = normal.map(lambda email: tf.transform(email.split(' ')))
print(normalFeatures.collect())

[SparseVector(10000, {0: 1.0, 365: 1.0, 455: 1.0, 509: 1.0, 1320: 1.0, 1363: 2.0, 1583: 1.0, 2321: 2.0, 2403: 1.0, 3289: 2.0, 3342: 1.0, 4995: 1.0, 5336: 1.0, 5706: 1.0, 5831: 1.0, 6052: 1.0, 6300: 1.0, 6582: 1.0, 6744: 1.0, 8971: 1.0, 8977: 1.0, 9232: 1.0, 9604: 1.0, 9646: 1.0, 9878: 1.0}), SparseVector(10000, {0: 1.0, 365: 1.0, 940: 1.0, 2220: 1.0, 3122: 1.0, 4460: 1.0, 4671: 1.0, 5336: 1.0, 5849: 1.0, 8479: 1.0, 9604: 1.0}), SparseVector(10000, {82: 1.0, 103: 1.0, 1091: 1.0, 1451: 1.0, 1819: 1.0, 2220: 1.0, 2321: 1.0, 2824: 1.0, 3317: 1.0, 3574: 1.0, 4373: 1.0, 4460: 1.0, 5763: 1.0, 6043: 1.0, 6052: 1.0, 6408: 1.0, 7119: 1.0, 7296: 1.0, 7656: 1.0, 9163: 1.0, 9399: 1.0, 9604: 1.0}), SparseVector(10000, {0: 2.0, 365: 1.0, 1683: 1.0, 2410: 1.0, 2634: 1.0, 4673: 1.0, 4837: 1.0, 5146: 1.0, 5172: 2.0, 5430: 1.0, 5880: 1.0, 6147: 1.0, 7094: 1.0, 7119: 1.0, 7566: 1.0, 7785: 1.0, 8242: 1.0, 9101: 1.0, 9241: 1.0, 9604: 1.0}), SparseVector(10000, {0: 1.0, 365: 2.0, 1395: 1.0, 1451: 1.0, 1458: 

In [3]:
# 创建LabeledPoint数据集分别存放 垃圾邮件 和 正常邮件 的例子
postiveExamlles = spamFeatures.map(lambda features: LabeledPoint(0, features))
negativeExamples = normalFeatures.map(lambda features: LabeledPoint(1, features))
trainingData = postiveExamlles.union(negativeExamples)
trainingData.cache()  # 因为逻辑回归是迭代算法，所以缓存训练数据RDD

UnionRDD[8] at union at NativeMethodAccessorImpl.java:0

In [4]:
# 使用SGD算法运行逻辑回归
model = LogisticRegressionWithSGD.train(trainingData)

In [11]:
# 以 垃圾邮件 和 正常邮件 的例子分别进行测试
# 首先使用一样的HashingTF特征来得到特征向量，然后对该向量应用得到的模型
postTest = tf.transform('O M G GET cheap stuff by sending money to'.split(' '))
negTest = tf.transform('Hi Dad, I started studying Spark the other'.split(' '))
print('Prediction for postive test example: %g' % model.predict(postTest))
print('Prediction for negative test example: %g' % model.predict(negTest))

Prediction for postive test example: 0
Prediction for negative test example: 0


### 4.数据类型
- 向量

In [13]:
from numpy import array
from pyspark.mllib.linalg import Vectors

In [14]:
# 创建稠密向量 <1.0,2.0,3.0>
denseVec1 = array([1.0, 2.0, 3.0, 0.0])  # Numpy 数组可以直接传给Mllib
denseVec2 = Vectors.dense([1.0, 2.0, 3.0, 0.0])  # 或者使用Vectors类来创建
print(denseVec1)
print(denseVec2)

[1. 2. 3. 0.]
[1.0,2.0,3.0,0.0]


In [15]:
# 创建稀疏向量 该方法直接收向量的维度以及非0位的位置和对应的值
# 这些数据可以用一个dict来传递，或者使用两个分别代表位置和值的list
sparseVec1 = Vectors.sparse(4, {0: 1.0, 1: 2, 3: 0})
sparseVec2 = Vectors.sparse(4, [0, 2, 5], [1.0, 2.0, 0.0])
print(sparseVec1)
print(sparseVec2)

(4,[0,1,3],[1.0,2.0,0.0])
(4,[0,2,5],[1.0,2.0,0.0])


### 5.算法

In [16]:
# TF-IDF
from pyspark.mllib.feature import HashingTF, IDF

In [17]:
sentence = 'hello hello world'
rdd = sc.wholeTextFiles('file://{}/files/spam.txt'.format(parent_file))
words = sentence.split()
tf = HashingTF(10000)
tfVectors = tf.transform(rdd).cache()

In [18]:
# 计算IDF,然后计算TF-IDF向量
idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
tfIdfVectors.collect()

[SparseVector(10000, {3861: 0.0, 7048: 0.0})]

#### 缩放
控制平均值 和 标准差

In [30]:
from pyspark.mllib.feature import StandardScaler,Vectors,Normalizer,Word2Vec
vectors = [Vectors.dense([1,2,3]),Vectors.dense([5,6,7])]
dataset = sc.parallelize(vectors)
scaler = StandardScaler(withMean=True,withStd=True)
model = scaler.fit(dataset)
result = model.transform(dataset)
result.collect()

[DenseVector([-0.7071, -0.7071, -0.7071]),
 DenseVector([0.7071, 0.7071, 0.7071])]

In [19]:
from pyspark.mllib.feature import StandardScaler, Vectors, Normalizer, Word2Vec

In [20]:
vectors = [Vectors.dense([1, 2, 3]), Vectors.dense([5, 6, 7])]
dataset = sc.parallelize(vectors)
scaler = StandardScaler(withMean=True, withStd=True)
model = scaler.fit(dataset)
result = model.transform(dataset)
result.collect()

[DenseVector([-0.7071, -0.7071, -0.7071]),
 DenseVector([0.7071, 0.7071, 0.7071])]

#### 正规化

In [27]:
Normalizer().transform(dataset).collect()

[DenseVector([0.2673, 0.5345, 0.8018]), DenseVector([0.4767, 0.5721, 0.6674])]

#### Word2Vec

In [30]:
sentence = "a b " * 100 + "a c " * 10
localDoc = [sentence, sentence]
doc = sc.parallelize(localDoc).map(lambda line: line.split(" "))
model = Word2Vec().setVectorSize(10).setSeed(42).fit(doc)

In [33]:
vec = model.transform("a")
vec

DenseVector([0.267, -0.2691, 0.058, -0.0801, 0.1821, 0.4162, 0.0259, -0.2163, 0.1787, 0.0764])

### 分类与回归
- 线性回归
- 逻辑回归
- 支持向量机
- 朴素贝叶斯
- 决策树与随机森林

In [35]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD