## 使用Spark Mlib使用分类算法

In [40]:
"""
    1、Kaggle 网站上一个竞赛：StumbleUpon Evergreen Classification Challenge
        https://www.kaggle.com/c/stumbleupon
    2、StumbleUpon是个性化的搜索引擎，会按用户的兴趣和网页评分等记录推荐给你感兴趣的网页，例如新文章、季节菜单、
    新闻、教学等。超过数千万人使用StumbleUpon查找新网页、图片、影片.....
    3、业务说明：
        -a. 有些网页内容是暂时性的（ephemeral），例如季节菜单、当日股市涨跌新闻等。
            这些文章可能只是某一段时间会对读者有意义，过了这段时间对读者就没有意义了。
        -b. 有些网页内容是长青的（evergreen），例如理财观念、育儿知识等。
            读者会长久对这些文章感兴趣。
        -c. 分辨网页是暂时性（ephemeral）还是长青的（evergreen），对于StumbleUpon推荐网页给用户会有很大帮助。
            例如 A 买卖股票，Ta可能对当日股市涨跌新闻感兴趣，可是过了一周就对这则新闻就没有兴趣了；如果是理财
            观念的文章，读者A可能会对会长久感兴趣。
        -d. 因此公司找来大数据分析师，负责“网页分类”大数据项目。
            网页内容我们人类看过了，就可以大致分为暂时性的或是长青的，这就是历史数据。
        -e. 目标就是利用机器学习（Machine Learning），通过大量网页数据进行训练来建立一个模型，并使用这个模型去
            预测网页是属于 暂时性的 还是 长青的内容，此属于二元分类问题。
        -f. 分类常见算法：
            - 决策树分类
            - 逻辑回归分类
            - 支持向量机分类
            - 朴素贝叶斯分类
        
"""
print()




In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SparkSessionMoive").master("local[*]").getOrCreate()


In [2]:
# 获取SparkContext 实例对象
sc = spark.sparkContext
sc

## 读取数据

In [3]:
raw_rdd =sc.textFile("./StumbleUpon/train.tsv")

In [4]:
raw_rdd.count()

7396

In [5]:
raw_rdd.first()

'"url"\t"urlid"\t"boilerplate"\t"alchemy_category"\t"alchemy_category_score"\t"avglinksize"\t"commonlinkratio_1"\t"commonlinkratio_2"\t"commonlinkratio_3"\t"commonlinkratio_4"\t"compression_ratio"\t"embed_ratio"\t"framebased"\t"frameTagRatio"\t"hasDomainLink"\t"html_ratio"\t"image_ratio"\t"is_news"\t"lengthyLinkDomain"\t"linkwordscore"\t"news_front_page"\t"non_markup_alphanum_characters"\t"numberOfLinks"\t"numwords_in_url"\t"parametrizedLinkRatio"\t"spelling_errors_ratio"\t"label"'

In [6]:
"""
    数据集中字段的说明：
        -1. 字段： 0 - 2
            表示的是url网址、urlid网址ID、boilerplate样本文字(JSON格式数据)，忽略
        -2. 3 - 25
            23字段属于特征字段，基本上都是数值类型，内容是有关网页相关的信息，例如：网页的分类
            链接的数目，图像的比例等
        -3. 字段26
            属于标签label，具有两个值
            - 0：表示的是长青型（evergreen）- 此网页会持续让用户感兴趣
            - 1: 代表non-evergreen -此网页具有暂时性
"""
print()




In [12]:
print(raw_rdd.first().split("\t"))

['"url"', '"urlid"', '"boilerplate"', '"alchemy_category"', '"alchemy_category_score"', '"avglinksize"', '"commonlinkratio_1"', '"commonlinkratio_2"', '"commonlinkratio_3"', '"commonlinkratio_4"', '"compression_ratio"', '"embed_ratio"', '"framebased"', '"frameTagRatio"', '"hasDomainLink"', '"html_ratio"', '"image_ratio"', '"is_news"', '"lengthyLinkDomain"', '"linkwordscore"', '"news_front_page"', '"non_markup_alphanum_characters"', '"numberOfLinks"', '"numwords_in_url"', '"parametrizedLinkRatio"', '"spelling_errors_ratio"', '"label"']


In [13]:
# 获取第一条数据
header_data = raw_rdd.first()

In [14]:
# 采用过滤的方式删除第一条数据
 filter_rdd = raw_rdd.filter(lambda line: line != header_data)

In [15]:
 filter_rdd.first()

'"http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html"\t"4042"\t"{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five years IBM the world s largest 

In [16]:
# 每天数据各个字段值，使用双引号引起来，进行替换

datas = filter_rdd.map(lambda  line: line.replace("\"", "")).map(lambda line: line.split("\t"))

In [17]:
datas.take(2)

[['http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html',
  '4042',
  '{title:IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries,body:A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five years IBM the world s largest provider 

In [18]:
# 查看数据条目数
print(datas.count())

7395


In [19]:
print(datas.first()[3:])

['business', '0.789131', '2.055555556', '0.676470588', '0.205882353', '0.047058824', '0.023529412', '0.443783175', '0', '0', '0.09077381', '0', '0.245831182', '0.003883495', '1', '1', '24', '0', '5424', '170', '8', '0.152941176', '0.079129575', '0']


### 提取特征features特征字段

In [20]:
# 提取类别特征字段值，使用1-of-K 编码方式转换（OneHotEncoder编码）

"""
1. 创建字典category_dic,一个类别对应一个数字(可以使用索引表示)
"""

# 构建网页分类的索引
category_dic = datas.map(lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()


In [21]:
category_dic

{'business': 0,
 'sports': 1,
 '?': 2,
 'arts_entertainment': 3,
 'gaming': 4,
 'culture_politics': 5,
 'computer_internet': 6,
 'law_crime': 7,
 'religion': 8,
 'weather': 9,
 'unknown': 10,
 'recreation': 11,
 'health': 12,
 'science_technology': 13}

In [22]:
# 定义第一个函数：将 ? 值进行转换为 0
def convert_float(x):
    return (0 if x == "?" else float(x))

In [23]:
convert_float("34")

34.0

In [27]:
import numpy as np
# 定义第二个函数：提取特征值
def extract_features(fields, cate_dic, end_index):
    # 分类字段
    category_index = cate_dic[fields[3]]
    category_features = np.zeros(len(cate_dic))
    category_features[category_index] = 1.0 
    
    # 数值字段
    numerical_features = [ convert_float(column) for column in fields[4: end_index] ]
    
    # 返回 “分类特征” +  “数值特征”
    return np.concatenate((category_features, numerical_features))

In [28]:
# 测试
extract_features(datas.first(), category_dic, 24)

array([1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 7.89131000e-01, 2.05555556e+00,
       6.76470588e-01, 2.05882353e-01, 4.70588240e-02, 2.35294120e-02,
       4.43783175e-01, 0.00000000e+00, 0.00000000e+00, 9.07738100e-02,
       0.00000000e+00, 2.45831182e-01, 3.88349500e-03, 1.00000000e+00,
       1.00000000e+00, 2.40000000e+01, 0.00000000e+00, 5.42400000e+03,
       1.70000000e+02, 8.00000000e+00])

In [29]:
# 定义第三个函数: 提取标签label
def extract_label(fields):
    return float(fields[-1])

In [30]:
extract_label(datas.first())

0.0

### 特征工程：构建分类算法特征数据RDD

In [31]:
from pyspark.mllib.regression import LabeledPoint

In [32]:
labelpoint_rdd = datas.map(lambda r: 
           LabeledPoint(extract_label(r), extract_features(r, category_dic, len(r) -1)))

In [33]:
labelpoint_rdd.take(5)

[LabeledPoint(0.0, [1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575]),
 LabeledPoint(1.0, [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.574147,3.677966102,0.50802139,0.288770053,0.213903743,0.144385027,0.468648998,0.0,0.0,0.098707403,0.0,0.203489628,0.088652482,1.0,1.0,40.0,0.0,4973.0,187.0,9.0,0.181818182,0.125448029]),
 LabeledPoint(1.0, [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.996526,2.382882883,0.562015504,0.321705426,0.120155039,0.042635659,0.525448029,0.0,0.0,0.072447859,0.0,0.22640177,0.120535714,1.0,1.0,55.0,0.0,2240.0,258.0,11.0,0.166666667,0.057613169]),
 LabeledPoint(1.0, [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.801248,1.543103448,0.4,0.1,0.016666667,0.0,0.480724749,0.0,0.0,0.095860566,0.0,0.265655744,0.035343035,1.0,0.0,24.0,0.0,2737.0

### 数据集划分

In [34]:
# 划分数据集为：训练数据集、测试数据集、验证数据集，划分比例: 8-1-1
(train_rdd, validation_rdd, test_rdd) = labelpoint_rdd.randomSplit([8, 1, 1])

In [36]:
# 各个数据集的条目数
print("训练数据集: ", train_rdd.cache().count())
print("验证数据集: ", validation_rdd.cache().count())
print("测试数据集: ", test_rdd.cache().count())

训练数据集:  5887
验证数据集:  760
测试数据集:  748


### 如何训练模型

In [42]:
# 使用决策分类算法训练数据，获取模型
# 导入模块
from pyspark.mllib.tree import DecisionTree
"""
trainClassifier(cls, data, numClasses, categoricalFeaturesInfo,
        impurity='gini', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0)
"""
print()




In [43]:
dtc_model = DecisionTree.trainClassifier(train_rdd, 2, {}, impurity='gini', maxDepth=5, maxBins=16)

In [39]:
"""
        当我们使用历史数据执行训练时会建立决策树。可是决策树不可能无限成长，因此我们必须限制它的最大分支与深度，
    所以必须设置下列参数：
        -a. maxBins 参数：
            决策时每一个节点最大分支数
        -b. maxDepth 参数：
            决策树最大深度
        -c. Impurity 参数：
            决策树分裂节点时的方法，为什么选择特征进行分支
        当树的父节点在分裂子节点时，以什么方法作为依据？例如，湿度以60为分割点，分为大于60或小于60；或者湿度
    以50为分割点，分为大于50或小于50.到底哪种方式比较好呢？此时有Gini 与 Entropy 两种判断方式：
        -i. 基于系数（Gini）：
            由意大利统计学家Corrado Gini 发明，用于计算数值散步程度（Statistical Dispersion，统计离差）的指标。
        决策树算法对每种特征字段分割点计算估值，选择分裂后最小的基尼指数（Gini）方式。
        -ii. 熵（Entropy）:
            熵（Entropy）也被用于计算机系统混乱的程度。决策树算法对每种特征字段分割点计算估值，选择分裂后最小
        的熵（Entropy）方式。
"""
print()




In [44]:
dtc_model

DecisionTreeModel classifier of depth 5 with 49 nodes

### 评估模型


In [45]:
"""
    二分类模型评估：
        http://spark.apache.org/docs/2.2.0/mllib-evaluation-metrics.html#binary-classification
    重要的两个指标为：
        PR 面积和ROC面积，越接近1表示模型越好
"""
print()




In [46]:
from pyspark.mllib.tree import DecisionTreeModel
# 使用验证数据集 评估模型
score = dtc_model.predict(validation_rdd.map(lambda lp: lp.features))

In [47]:
score.take(10)

[0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0]

In [48]:
# 将预测与真实值进行join关联
score_and_label = score.zip(validation_rdd.map(lambda lp: lp.label))

In [49]:
score_and_label.take(10)

[(0.0, 0.0),
 (1.0, 1.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (1.0, 1.0),
 (1.0, 0.0),
 (0.0, 0.0)]

In [50]:
# 使用BinaryClassificationMetrics计算PR面积和RUC面积
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [51]:
# Instantiate metrics object
metrics = BinaryClassificationMetrics(score_and_label)

In [52]:
print("Area under PR = %s" % metrics.areaUnderPR)

Area under PR = 0.6826370008274378


In [53]:
print("Area under ROC = %s" % metrics.areaUnderROC)

Area under ROC = 0.6667859222313417


In [59]:
# 定义一个函数，用于评估二分类模型
def model_evaluate(model, validation_datas):
    # 导入模块
    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    # 使用模型预测
    predict_and_actual = model\
        .predict(validation_datas.map(lambda lp: lp.features))\
        .map(lambda pv: float(pv))\
        .zip(validation_datas.map(lambda lp: lp.label))
    # Instantiate metrics object
    binary_metrics = BinaryClassificationMetrics(predict_and_actual)
    # 以AUC面积为准
    return binary_metrics.areaUnderROC

### 使用集成学习算法训练模型及预测

#### 使用随机森林算法

In [54]:
# 导入模块
from pyspark.mllib.tree import RandomForest

In [55]:
"""
对于随机森林算法来说：适合于特征数目非常多数据，每次构建树的时候，选取一部分特征数据进行训练构建，所以得到的树是不一样的
def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees,
                        featureSubsetStrategy="auto", impurity="gini",
                        maxDepth=4, maxBins=32,
                        seed=None)
"""
print()




In [70]:
rfc_model = RandomForest.trainClassifier(train_rdd, 2, {}, 10, maxDepth=8, maxBins=32)

In [71]:
rfc_model

TreeEnsembleModel classifier with 10 trees

In [72]:
# 评估使用随机森林算法训练的模型
model_evaluate(rfc_model, validation_rdd)

0.6787851365476216

### 使用GBT梯度提升算法训练模型并评估

In [73]:
# 导入模块
from pyspark.mllib.tree import GradientBoostedTrees

In [75]:
"""
GBT 分类算法仅仅只能做二分类，多分类不行。
     Labels should take value {0, 1}.
trainClassifier(cls, data, categoricalFeaturesInfo, 
        loss='logLoss', numIterations=100, learningRate=0.1, maxDepth=5, maxBins=16) 
"""
print()




In [76]:
gbtc_model = GradientBoostedTrees.trainClassifier(train_rdd, {})

In [77]:
# 模型评估
model_evaluate(gbtc_model, validation_rdd)

0.6867016015320833

In [78]:
"""
布置作业：
    针对GBT算法来说，仿照 多层循环方式选取不同超参数的，训练模型，进行评估，获取最佳模型
    将最佳模型进行保存，并加载模型进行预测。
"""
print()




## 使用NaiveBayes训练模型

In [79]:
# 导入模块
from pyspark.mllib.classification import NaiveBayes

In [None]:
"""
针对NaiveBayes算来说，计算各个类别的概率，所以要求特征都是非负数
    Naive Bayes requires nonnegative feature values
"""
# 使用训练数据集训练模型
nb_model = NaiveBayes.train(train_rdd, lambda_=1.0)

## 使用SVM训练模型

In [82]:
from pyspark.mllib.classification import SVMWithSGD

In [83]:
svm_model = SVMWithSGD.train(train_rdd, iterations=100, step=1.0)

In [84]:
svm_model

(weights=[0.26353319934979674,-0.34762777140252743,-0.4669475256859845,-0.43742898211069237,-0.04297241718129342,-0.10600345765578302,-0.23917057777229325,-0.010578215716053618,-0.041369198354673085,-0.004628078590210272,-0.005518180959691204,0.3093129465275087,0.00038787400917408843,-0.09041627961412063,-0.5258171185708885,-2.7740019969230776,-0.42269384095922535,-0.14206956394524747,-0.017435700301149167,-0.0039555891937137645,-4.728929021814427,0.18733938272690526,0.0,-0.1329654552619478,-0.03247882037013842,-0.30432916365943974,-0.5883637607814826,-0.700432380943573,-0.6925673984816735,-63.63386046203523,-0.0629933424710449,-92.82066257838301,-91.4290043288529,-6.833812163649269,-0.18651057488581588,-0.16375868932303303], intercept=0.0)

In [87]:
# 评估模型
# 模型评估
model_evaluate(svm_model, validation_rdd)

0.5

## 使用LogisticRegression训练模型

In [88]:
from pyspark.mllib.classification import LogisticRegressionWithSGD

# 使用数据训练模型
lr_model = LogisticRegressionWithSGD.train(train_rdd, iterations=100, step=1.0)

In [93]:
test_rdd.map(lambda lp: lr_model.predict(lp.features)).take(10)
model_evaluate(lr_model, validation_rdd)

0.5

In [94]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
## 使用LR变形算法训练模型
lr2_model = LogisticRegressionWithLBFGS.train(train_rdd, iterations=100)

In [95]:
test_rdd.map(lambda lp: lr_model.predict(lp.features)).take(10)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [96]:
model_evaluate(lr_model, validation_rdd)

0.5