In [1]:
"""
共享单车数据集：
    http://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset
"""
print()




In [2]:
"""
	1- instant: record index
	2- dteday : date
	3- season : season (1:springer, 2:summer, 3:fall, 4:winter)
	4- yr : year (0: 2011, 1:2012)
	5- mnth : month ( 1 to 12)
	6- hr : hour (0 to 23)
	7- holiday : weather day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
	8- weekday : day of the week
	9- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
	10+ weathersit : 
		- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
		- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
	11- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
	12- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
	13- hum: Normalized humidity. The values are divided to 100 (max)
	14- windspeed: Normalized wind speed. The values are divided to 67 (max)
	15- casual: count of casual users
	16- registered: count of registered users
	17- cnt: count of total rental bikes including both casual and registered
"""
print()




In [3]:
# 导入模块 pyspark
from pyspark import SparkConf, SparkContext
# 导入系统模块
import os
import time

In [4]:

# Create SparkConf
sparkConf = SparkConf()\
    .setAppName('Python_Spark_Regression')\
    .setMaster('local[*]')
# Create SparkContext
sc = SparkContext(conf=sparkConf)

In [5]:
sc

### 定义函数提取特征和标签

In [14]:
# 提取标签字段
def extract_label(record):
    label=(record[-1])
    return float(label)

In [15]:
# 定义 一个函数，转换数值类型
def convert_float(x):
    return 0 if x == "?" else float(x)

In [19]:
import numpy as np

# 定义函数提取特征features
def extract_features(record, end_index):
    # 获取季节字段特征值
    feature_serson = [convert_float(record[2])]
    
    # 从 5-month月份开始 字段特征到 风速14-windspeed 特征
    features = [convert_float(field) for field in record[4: end_index]]
    
    # 合并特征值
    return np.concatenate((feature_serson, features))

In [16]:
record = [u'1', u'2011-01-01', u'1', u'0', u'1', u'0', u'0', u'6', u'0', u'1', u'0.24', u'0.2879', u'0.81', u'0', u'3', u'13', u'16']

In [17]:
extract_label(record)

16.0

In [20]:
extract_features(record, 14)

array([1.    , 1.    , 0.    , 0.    , 6.    , 0.    , 1.    , 0.24  ,
       0.2879, 0.81  , 0.    ])

### 数据准备阶段


In [21]:
from pyspark.mllib.regression import LabeledPoint

# 预处理数据
def prepare_data(spark_contex):
    # ------------------------ 1. 导入数据并转换数据  ------------------------
    raw_data_with_header = spark_contex.textFile("./hour.csv")
    
    # 获取第一条数据
    header_data = raw_data_with_header.first()
    
    print(header_data)
    # 过滤掉第一条数据
    raw_data = raw_data_with_header.filter(lambda line: line != header_data)
    
    # 每行数据使用逗号分隔
    datas_rdd = raw_data.map(lambda line: line.split(','))
    # print(datas_rdd.first())
    # print("count = "+ str(datas_rdd.count()))
    
    # ------------------------ 2.导入数据并转换数建立模型训练所需数据RDD[LabeledPoint]  ------------------------
    lp_rdd = datas_rdd.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, -3)))
    print(lp_rdd.first())
    
    # ------------------------ 3.以随机方式将数据集分为3个部分并且返回  ------------------------
    (train_data, validation_data, test_data) = lp_rdd.randomSplit([8, 1, 1])
    
    # 返回
    return train_data, validation_data, test_data

In [22]:
"""
数据预处理
"""
train_rdd, validation_rdd, test_rdd = prepare_data(sc)

instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
(16.0,[1.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0])


In [23]:
# 缓存数据
print("训练数据：" + str(train_rdd.persist().count()))
print("验证数据：" + str(validation_rdd.persist().count()))
print("测试数据：" + str(test_rdd.persist().count()))

训练数据：13920
验证数据：1712
测试数据：1747


In [24]:
train_rdd.first()

LabeledPoint(16.0, [1.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0])

## 训练评估阶段

### 定义模型评估函数

In [25]:
# 导入回归模型评估指标
from pyspark.mllib.evaluation import RegressionMetrics

# 评估模型函数
def evaluate_model(model, validation_datas):
    # 使用模型对验证数据集进行预测
    score = model.predict(validation_datas.map(lambda lp: lp.features))
    
    # 合并预测值与真实值
    score_and_labels = score.zip(validation_datas.map(lambda lp: lp.label))
    
    # Instantiate metrics object
    metrics = RegressionMetrics(score_and_labels)
    
    # 获取RMSE并返回
    return metrics.rootMeanSquaredError

### 训练模型及评估

In [26]:
# 使用训练数据集训练模型，使用算法：决策树算法
from pyspark.mllib.tree import DecisionTree

In [31]:
"""
def trainRegressor(cls, data, categoricalFeaturesInfo,
        impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1,
        minInfoGain=0.0)
"""
dtr_model = DecisionTree.trainRegressor(train_rdd, {}, maxDepth=10, maxBins=128)

In [32]:
# depth=10, bins=64
evaluate_model(dtr_model, validation_rdd)

81.39091853079493

### 指定决策树中类别特征

In [34]:
"""
类别特征：
	3- season : season (1:springer, 2:summer, 3:fall, 4:winter)
    1、2、3、4  ->  0、1、2、3
	5- mnth : month ( 1 to 12)
    
	6- hr : hour (0 to 23)
        不需要
	7- holiday : weather day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
        不需要
	8- weekday : day of the week
        一周的第几天
	9- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
        不需要
	10+ weathersit : 
		- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
		- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
    
数值特征：归一化
	11- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
	12- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
	13- hum: Normalized humidity. The values are divided to 100 (max)
	14- windspeed: Normalized wind speed. The values are divided to 67 (max)
标签
	17- cnt: count of total rental bikes including both casual and registered
"""
print()




In [35]:
import numpy as np

# 定义函数提取特征features
def extract_features_catogery(record, end_index):
    # 获取季节字段特征值
    feature_serson = [convert_float(record[2]) - 1]
    # 获取月份字段特征 值
    feature_month = [convert_float(record[4]) - 1]
    # 获取weekday字段特征值
    feature_weekday = [convert_float(record[7])]
    # 获取weathersit字段特征值
    feature_weathersit = [convert_float(record[9]) - 1]
    
    # 其他类别特征
    feature_hr = [convert_float(record[5])]
    feature_holiday = [convert_float(record[6])]
    feature_workingday = [convert_float(record[8])]
    
    # 从 12-temp 字段特征到 风速14-windspeed 特征
    features = [convert_float(field) for field in record[10: end_index]]
    
    # 合并特征值   
    return np.concatenate((feature_serson, feature_month, feature_weekday, 
       feature_weathersit, feature_hr, feature_holiday, feature_workingday, features))

In [39]:
from pyspark.mllib.regression import LabeledPoint

# 预处理数据
def prepare_data_catogery(spark_contex):
    # ------------------------ 1. 导入数据并转换数据  ------------------------
    raw_data_with_header = spark_contex.textFile("./hour.csv")
    
    # 获取第一条数据
    header_data = raw_data_with_header.first()
    # print(header_data)
    # 过滤掉第一条数据
    raw_data = raw_data_with_header.filter(lambda line: line != header_data)
    
    # 每行数据使用逗号分隔
    datas_rdd = raw_data.map(lambda line: line.split(','))
    # print(datas_rdd.first())
    # print("count = "+ str(datas_rdd.count()))
    
    # ------------------------ 2.导入数据并转换数建立模型训练所需数据RDD[LabeledPoint]  ------------------------
    lp_rdd = datas_rdd.map(lambda r: LabeledPoint(extract_label(r), 
                                                  extract_features_catogery(r, -3)))
    # print(lp_rdd.first())
    
    # ------------------------ 3.以随机方式将数据集分为3个部分并且返回  ------------------------
    (train_data, validation_data, test_data) = lp_rdd.randomSplit([8, 1, 1])
    
    # 返回
    return train_data, validation_data, test_data

In [40]:
train_rdd2, validation_rdd2, test_rdd2 = prepare_data_catogery(sc)

In [42]:
# season
print(train_rdd2.map(lambda r: r.features[0]).distinct().collect())
# month
print(train_rdd2.map(lambda r: r.features[1]).distinct().collect()) 
# weekday
print(train_rdd2.map(lambda r: r.features[2]).distinct().collect())
# weathersit
print(train_rdd2.map(lambda r: r.features[3]).distinct().collect())
# hr
print(train_rdd2.map(lambda r: r.features[4]).distinct().collect())
# weathersit
print(train_rdd2.map(lambda r: r.features[5]).distinct().collect()) 
# workingday
print(train_rdd2.map(lambda r: r.features[6]).distinct().collect()) 

[0.0, 2.0, 1.0, 3.0]
[0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0]
[6.0, 0.0, 2.0, 4.0, 1.0, 3.0, 5.0]
[0.0, 2.0, 1.0, 3.0]
[0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 1.0, 7.0, 11.0, 13.0, 15.0, 17.0, 21.0, 3.0, 9.0, 19.0, 5.0, 23.0]
[0.0, 1.0]
[0.0, 1.0]


In [43]:
"""
def trainRegressor(cls, data, categoricalFeaturesInfo,
        impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1,
        minInfoGain=0.0)

:param categoricalFeaturesInfo:
          Map storing arity of categorical features. An entry (n -> k)
          indicates that feature n is categorical with k categories
          indexed from 0: {0, 1, ..., k-1}.
"""
dtr_model2 = DecisionTree.trainRegressor(train_rdd2, 
                    {0: 4, 1: 12, 2: 7, 3: 4, 4: 24, 5: 2, 6: 2}, 
                     maxDepth=10, maxBins=64)

In [44]:
# 评估决策树回归模型
evaluate_model(dtr_model2, validation_rdd2)

77.3708003907519

### 随机森林回归算法

In [45]:
from pyspark.mllib.tree import RandomForest

rfr_model = RandomForest.trainRegressor(train_rdd2, 
                    {0: 4, 1: 12, 2: 7, 3: 4, 4: 24, 5: 2, 6: 2}, 20,
                     maxDepth=10, maxBins=64)

In [46]:
evaluate_model(rfr_model, validation_rdd2)

69.94814234623163

### GBT回归算法

In [47]:
from pyspark.mllib.tree import GradientBoostedTrees

In [48]:
gbtr_model = GradientBoostedTrees.trainRegressor(train_rdd2, 
                    {0: 4, 1: 12, 2: 7, 3: 4, 4: 24, 5: 2, 6: 2},
                     maxDepth=10, maxBins=64)

In [49]:
evaluate_model(gbtr_model, validation_rdd2)

73.59026688915507

## 线性回归算法训练模型

In [50]:
from pyspark.mllib.regression import LinearRegressionWithSGD

In [51]:
lr_model = LinearRegressionWithSGD.train(train_rdd2, iterations=100, step=1.0)

In [52]:
evaluate_model(lr_model, validation_rdd2)

Py4JJavaError: An error occurred while calling o843.rootMeanSquaredError.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2663.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2663.0 (TID 5206, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/hui/Desktop/bigdata/spark-2.4.4-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/Users/hui/Desktop/bigdata/spark-2.4.4-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/hui/Desktop/bigdata/spark-2.4.4-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 393, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/hui/Desktop/bigdata/spark-2.4.4-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "/Users/hui/anaconda3/lib/python3.7/site-packages/pyspark/sql/session.py", line 730, in prepare
    verify_func(obj)
  File "/Users/hui/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py", line 1389, in verify
    verify_value(obj)
  File "/Users/hui/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py", line 1370, in verify_struct
    verifier(v)
  File "/Users/hui/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py", line 1389, in verify
    verify_value(obj)
  File "/Users/hui/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py", line 1383, in verify_default
    verify_acceptable_types(obj)
  File "/Users/hui/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py", line 1278, in verify_acceptable_types
    % (dataType, obj, type(obj))))
TypeError: field prediction: DoubleType can not accept object -1.8816326927189753e+155 in type <class 'numpy.float64'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:214)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1098)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1092)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1161)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1137)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.summary$lzycompute(RegressionMetrics.scala:57)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.summary(RegressionMetrics.scala:54)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.SSerr$lzycompute(RegressionMetrics.scala:65)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.SSerr(RegressionMetrics.scala:65)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.meanSquaredError(RegressionMetrics.scala:100)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.rootMeanSquaredError(RegressionMetrics.scala:109)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/hui/Desktop/bigdata/spark-2.4.4-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/Users/hui/Desktop/bigdata/spark-2.4.4-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/hui/Desktop/bigdata/spark-2.4.4-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 393, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/hui/Desktop/bigdata/spark-2.4.4-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "/Users/hui/anaconda3/lib/python3.7/site-packages/pyspark/sql/session.py", line 730, in prepare
    verify_func(obj)
  File "/Users/hui/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py", line 1389, in verify
    verify_value(obj)
  File "/Users/hui/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py", line 1370, in verify_struct
    verifier(v)
  File "/Users/hui/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py", line 1389, in verify
    verify_value(obj)
  File "/Users/hui/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py", line 1383, in verify_default
    verify_acceptable_types(obj)
  File "/Users/hui/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py", line 1278, in verify_acceptable_types
    % (dataType, obj, type(obj))))
TypeError: field prediction: DoubleType can not accept object -1.8816326927189753e+155 in type <class 'numpy.float64'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:214)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
