In [1]:
"""
共享单车数据集：
    http://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset
"""
print




In [2]:
"""
	1- instant: record index
	2- dteday : date
	3- season : season (1:spring, 2:summer, 3:fall, 4:winter)
	4- yr : year (0: 2011, 1:2012)
	5- mnth : month ( 1 to 12)
	6- hr : hour (0 to 23)
	7- holiday : whether day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
	8- weekday : day of the week
	9- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
	10+ weathersit : 
		- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
		- 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
	11- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
	12- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
	13- hum: Normalized humidity. The values are divided to 100 (max)
	14- windspeed: Normalized wind speed. The values are divided to 67 (max)
	15- casual: count of casual users
	16- registered: count of registered users
	17- cnt: count of total rental bikes including both casual and registered
"""
print




In [3]:
# 导入模块 pyspark
from pyspark import SparkConf, SparkContext
# 导入系统模块
import os
import time

In [4]:
# Point PySpark at the local Java / Hadoop / Spark installations.
# Raw strings keep the Windows backslashes literal instead of relying on
# sequences such as "\J" or "\s" happening not to be valid escapes.
os.environ['JAVA_HOME'] = r'C:\Java\jdk1.8.0_91'
# Hadoop compatibility on Windows: winutils.exe and related files are
# expected under $HADOOP_HOME/lib.
os.environ['HADOOP_HOME'] = r'C:\Java\hadoop-2.6.0-cdh5.7.6'
# SPARK_HOME is essential: without it the Spark application cannot run.
os.environ['SPARK_HOME'] = r'C:\Java\spark-2.2.0-bin-2.6.0-cdh5.7.6'

# Build the Spark configuration: application name and a local master with 4 threads.
sparkConf = (
    SparkConf()
    .setAppName('Python_Spark_Regression')
    .setMaster('local[4]')
)
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second SparkContext in the same process raises an error.
sc = SparkContext.getOrCreate(conf=sparkConf)

In [5]:
print(sc)

<SparkContext master=local[4] appName=Python_Spark_Regression>


### 定义函数提取特征和标签

In [23]:
# Extract the label field
def extract_label(record):
    """Return the last field of ``record`` converted to ``float``.

    :param record: indexable sequence of field values (e.g. one split CSV row).
    :return: ``float(record[-1])``.
    """
    return float(record[-1])

In [24]:
# Helper that converts a raw field into a numeric value
def convert_float(x):
    """Convert one raw field to a number.

    :param x: raw field value; the string ``"?"`` marks a missing value.
    :return: the integer ``0`` when ``x == "?"``, otherwise ``float(x)``.
    """
    if x == "?":
        return 0
    return float(x)

In [133]:
import numpy as np

# Build the feature vector for one record
def extract_features(record, end_index):
    """Assemble the feature array for a single record.

    The result is ``record[2]`` followed by ``record[4:end_index]``, each
    passed through ``convert_float``. In the field list documented at the
    top of this notebook, index 2 is ``season`` and index 4 is ``mnth``.

    :param record: sequence of raw field values for one row.
    :param end_index: exclusive end of the slice that starts at index 4.
    :return: 1-D numpy array holding the concatenated feature values.
    """
    # The season value always comes first in the vector.
    season_part = [convert_float(record[2])]

    # Everything from index 4 up to (but excluding) end_index.
    remaining_part = list(map(convert_float, record[4:end_index]))

    # Join both pieces into one array.
    return np.concatenate((season_part, remaining_part))

In [27]:
record = [u'1', u'2011-01-01', u'1', u'0', u'1', u'0', u'0', u'6', u'0', u'1', u'0.24', u'0.2879', u'0.81', u'0', u'3', u'13', u'16']

In [28]:
extract_label(record)

16.0

In [29]:
extract_features(record, 14)

array([ 1.    ,  1.    ,  0.    ,  0.    ,  6.    ,  0.    ,  1.    ,
        0.24  ,  0.2879,  0.81  ,  0.    ])

### 数据准备阶段

In [134]:
from pyspark.mllib.regression import LabeledPoint

# Pre-process the data
def prepare_data(spark_contex, data_path="./datas/hour.csv", seed=None):
    """Load the bike-sharing CSV and split it into train/validation/test RDDs.

    Every data row becomes a ``LabeledPoint`` whose label comes from
    ``extract_label`` and whose features come from ``extract_features(r, -3)``
    (the last three fields are excluded from the feature slice).

    :param spark_contex: SparkContext used to read the text file.
    :param data_path: path of the CSV file; defaults to the path this notebook
        has always used.
    :param seed: optional seed forwarded to ``randomSplit``. With the default
        ``None`` every call yields a different split (as before); pass an
        integer to make the three partitions reproducible across runs.
    :return: tuple ``(train_data, validation_data, test_data)`` of
        RDD[LabeledPoint], split with weights 8 : 1 : 1.
    """
    # ------------------------ 1. Load and parse the raw data ------------------------
    lines_with_header = spark_contex.textFile(data_path)

    # The first line is the header; drop every line equal to it.
    header_line = lines_with_header.first()
    data_lines = lines_with_header.filter(lambda line: line != header_line)

    # Split each row on commas.
    fields_rdd = data_lines.map(lambda line: line.split(','))

    # ------------------------ 2. Build the RDD[LabeledPoint] used for training ------------------------
    lp_rdd = fields_rdd.map(
        lambda r: LabeledPoint(extract_label(r), extract_features(r, -3)))

    # ------------------------ 3. Randomly split into three parts ------------------------
    train_data, validation_data, test_data = lp_rdd.randomSplit([8, 1, 1], seed=seed)

    return train_data, validation_data, test_data

In [69]:
"""
数据预处理
"""
train_rdd, validation_rdd, test_rdd = prepare_data(sc)

In [39]:
# Persist each split and print its record count
for split_label, split_rdd in (("训练数据：", train_rdd),
                               ("验证数据：", validation_rdd),
                               ("测试数据：", test_rdd)):
    print(split_label + str(split_rdd.persist().count()))

训练数据：13896
验证数据：1698
测试数据：1785


## 训练评估阶段

### 定义模型评估函数

In [42]:
# 导入回归模型评估指标
from pyspark.mllib.evaluation import RegressionMetrics

# Model evaluation helper
def evaluate_model(model, validation_datas):
    """Return the root-mean-squared error of ``model`` on ``validation_datas``.

    :param model: trained model whose ``predict`` accepts an RDD of feature vectors.
    :param validation_datas: RDD of labeled points (``.features`` / ``.label``).
    :return: ``rootMeanSquaredError`` reported by ``RegressionMetrics``.
    """
    # Separate the feature vectors from the true labels.
    features_rdd = validation_datas.map(lambda point: point.features)
    labels_rdd = validation_datas.map(lambda point: point.label)

    # Predict on the features, then pair each prediction with its true label.
    predictions = model.predict(features_rdd)
    prediction_label_pairs = predictions.zip(labels_rdd)

    # Compute the regression metrics and report the RMSE.
    return RegressionMetrics(prediction_label_pairs).rootMeanSquaredError

### 训练模型及评估

In [43]:
# 使用训练数据集训练模型，使用算法：决策树算法
from pyspark.mllib.tree import DecisionTree

In [51]:
"""
def trainRegressor(cls, data, categoricalFeaturesInfo,
        impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1,
        minInfoGain=0.0)
"""
# Train a decision-tree regressor on the training split. The empty
# categoricalFeaturesInfo map declares no feature as categorical.
# NOTE(review): the evaluation cells that follow are labelled depth=5/bins=32,
# depth=10/bins=32, depth=10/bins=16 and depth=10/bins=64, yet only this one
# training call exists - presumably its arguments were edited and the cell
# re-run between evaluations.
dtr_model = DecisionTree.trainRegressor(
    train_rdd,
    categoricalFeaturesInfo={},
    maxDepth=10,
    maxBins=64,
)

In [46]:
# depth=5, bins=32
evaluate_model(dtr_model, validation_rdd)

114.851108311838

In [48]:
# depth=10, bins=32
evaluate_model(dtr_model, validation_rdd)

78.49755588963191

In [50]:
# depth=10, bins=16
evaluate_model(dtr_model, validation_rdd)

87.54789040981262

In [52]:
# depth=10, bins=64
evaluate_model(dtr_model, validation_rdd)

78.9234729929176

### 指定决策树中类别特征

In [None]:
"""
类别特征：
	3- season : season (1:spring, 2:summer, 3:fall, 4:winter)
    1、2、3、4  ->  0、1、2、3
	5- mnth : month ( 1 to 12)
    
	6- hr : hour (0 to 23)
        不需要
	7- holiday : whether day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
        不需要
	8- weekday : day of the week
        一周的第几天
	9- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
        不需要
	10+ weathersit : 
		- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
		- 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
    
数值特征：归一化
	11- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
	12- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
	13- hum: Normalized humidity. The values are divided to 100 (max)
	14- windspeed: Normalized wind speed. The values are divided to 67 (max)
标签
	17- cnt: count of total rental bikes including both casual and registered
"""
print

In [114]:
import numpy as np

# Build the feature vector with the categorical fields placed first
def extract_features_catogery(record, end_index):
    """Assemble a feature array whose first seven entries are categorical.

    Field names below follow the field list documented in this notebook.
    Output order: season, mnth, weekday, weathersit, hr, holiday, workingday,
    followed by ``record[10:end_index]``. Season, mnth and weathersit have 1
    subtracted from them; the other fields are used as converted.

    :param record: sequence of raw field values for one row.
    :param end_index: exclusive end of the slice that starts at index 10.
    :return: 1-D numpy array holding the concatenated feature values.
    """
    categorical_part = [
        convert_float(record[2]) - 1,  # season
        convert_float(record[4]) - 1,  # mnth
        convert_float(record[7]),      # weekday
        convert_float(record[9]) - 1,  # weathersit
        convert_float(record[5]),      # hr
        convert_float(record[6]),      # holiday
        convert_float(record[8]),      # workingday
    ]

    # Index 10 is field 11 (temp) in the documented list; take it and
    # everything after it up to (but excluding) end_index.
    numeric_part = [convert_float(value) for value in record[10:end_index]]

    # Join both pieces into one array.
    return np.concatenate((categorical_part, numeric_part))

In [115]:
from pyspark.mllib.regression import LabeledPoint

# Pre-process the data (categorical-feature layout)
def prepare_data_catogery(spark_contex, data_path="./datas/hour.csv", seed=None):
    """Load the bike-sharing CSV and split it into train/validation/test RDDs.

    Every data row becomes a ``LabeledPoint`` whose label comes from
    ``extract_label`` and whose features come from
    ``extract_features_catogery(r, -3)`` (the last three fields are excluded
    from the feature slice).

    :param spark_contex: SparkContext used to read the text file.
    :param data_path: path of the CSV file; defaults to the path this notebook
        has always used.
    :param seed: optional seed forwarded to ``randomSplit``. With the default
        ``None`` every call yields a different split (as before); pass an
        integer to make the three partitions reproducible across runs.
    :return: tuple ``(train_data, validation_data, test_data)`` of
        RDD[LabeledPoint], split with weights 8 : 1 : 1.
    """
    # ------------------------ 1. Load and parse the raw data ------------------------
    lines_with_header = spark_contex.textFile(data_path)

    # The first line is the header; drop every line equal to it.
    header_line = lines_with_header.first()
    data_lines = lines_with_header.filter(lambda line: line != header_line)

    # Split each row on commas.
    fields_rdd = data_lines.map(lambda line: line.split(','))

    # ------------------------ 2. Build the RDD[LabeledPoint] used for training ------------------------
    lp_rdd = fields_rdd.map(
        lambda r: LabeledPoint(extract_label(r), extract_features_catogery(r, -3)))

    # ------------------------ 3. Randomly split into three parts ------------------------
    train_data, validation_data, test_data = lp_rdd.randomSplit([8, 1, 1], seed=seed)

    return train_data, validation_data, test_data

In [116]:
train_rdd2, validation_rdd2, test_rdd2 = prepare_data_catogery(sc)

In [117]:
# Print the distinct values of each categorical feature in the training split,
# in the order the features are laid out by extract_features_catogery.
# print(...) with a single argument behaves the same under Python 2 and 3 and
# matches the print calls used elsewhere in this notebook.
# season
print(train_rdd2.map(lambda r: r.features[0]).distinct().collect())
# month
print(train_rdd2.map(lambda r: r.features[1]).distinct().collect())
# weekday
print(train_rdd2.map(lambda r: r.features[2]).distinct().collect())
# weathersit
print(train_rdd2.map(lambda r: r.features[3]).distinct().collect())
# hr
print(train_rdd2.map(lambda r: r.features[4]).distinct().collect())
# holiday
print(train_rdd2.map(lambda r: r.features[5]).distinct().collect())
# workingday
print(train_rdd2.map(lambda r: r.features[6]).distinct().collect())

[0.0, 2.0, 1.0, 3.0]
[0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0]
[0.0, 2.0, 4.0, 6.0, 1.0, 3.0, 5.0]
[0.0, 2.0, 1.0, 3.0]
[0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0]
[0.0, 1.0]
[0.0, 1.0]


In [118]:
train_rdd2.take(1)

[LabeledPoint(16.0, [0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.24,0.2879,0.81,0.0])]

In [121]:
"""
def trainRegressor(cls, data, categoricalFeaturesInfo,
        impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1,
        minInfoGain=0.0)

:param categoricalFeaturesInfo:
          Map storing arity of categorical features. An entry (n -> k)
          indicates that feature n is categorical with k categories
          indexed from 0: {0, 1, ..., k-1}.
"""
# Train a decision tree with features 0-6 declared categorical.
# Map is {feature index: number of categories}, in the feature order built by
# extract_features_catogery: 0 season (4), 1 mnth (12), 2 weekday (7),
# 3 weathersit (4), 4 hr (24), 5 holiday (2), 6 workingday (2).
dtr_model2 = DecisionTree.trainRegressor(
    train_rdd2,
    categoricalFeaturesInfo={0: 4, 1: 12, 2: 7, 3: 4, 4: 24, 5: 2, 6: 2},
    maxDepth=10,
    maxBins=64,
)

In [122]:
dtr_model2

DecisionTreeModel regressor of depth 10 with 1807 nodes

In [123]:
# 评估决策树回归模型
evaluate_model(dtr_model2, validation_rdd2)

81.35539570062727

### 随机森林回归算法

In [127]:
from pyspark.mllib.tree import RandomForest

# Train a random-forest regressor of 20 trees on the categorical-layout data.
# Map is {feature index: number of categories}: 0 season, 1 mnth, 2 weekday,
# 3 weathersit, 4 hr, 5 holiday, 6 workingday.
rfr_model = RandomForest.trainRegressor(
    train_rdd2,
    categoricalFeaturesInfo={0: 4, 1: 12, 2: 7, 3: 4, 4: 24, 5: 2, 6: 2},
    numTrees=20,
    maxDepth=10,
    maxBins=64,
)

In [128]:
evaluate_model(rfr_model, validation_rdd2)

76.79614178638674

### GBT回归算法

In [129]:
from pyspark.mllib.tree import GradientBoostedTrees

In [130]:
# Train a gradient-boosted-trees regressor on the categorical-layout data.
# Map is {feature index: number of categories}: 0 season, 1 mnth, 2 weekday,
# 3 weathersit, 4 hr, 5 holiday, 6 workingday.
gbtr_model = GradientBoostedTrees.trainRegressor(
    train_rdd2,
    categoricalFeaturesInfo={0: 4, 1: 12, 2: 7, 3: 4, 4: 24, 5: 2, 6: 2},
    maxDepth=10,
    maxBins=64,
)

In [132]:
evaluate_model(gbtr_model, validation_rdd2)

76.9638339143844

## 线性回归算法训练模型

In [135]:
"""
数据预处理
"""
train_rdd, validation_rdd, test_rdd = prepare_data(sc)

In [136]:
# Persist each freshly prepared split and print its record count
for split_label, split_rdd in (("训练数据：", train_rdd),
                               ("验证数据：", validation_rdd),
                               ("测试数据：", test_rdd)):
    print(split_label + str(split_rdd.persist().count()))

训练数据：13944
验证数据：1692
测试数据：1743


### LinearRegressionWithSGD 算法

In [137]:
from pyspark.mllib.regression import LinearRegressionWithSGD

In [139]:
# Fit a linear regression with SGD on the training split (100 iterations, step 1.0).
# NOTE(review): the predictions displayed in the next cell are on the order of
# -1e157 to -1e158 while the true labels are below 100, i.e. this fit diverged.
# The notebook's closing note attributes this to the categorical features being
# fed in without 1-of-K encoding.
lr_model = LinearRegressionWithSGD.train(train_rdd, iterations=100, step=1.0)

In [141]:
lr_model\
    .predict(validation_rdd.map(lambda lp: lp.features))\
    .zip(validation_rdd.map(lambda lp: lp.label))\
    .take(20)

[(-7.9378113396752435e+157, 40.0),
 (-2.0894303193384901e+158, 2.0),
 (-2.6074634455872108e+158, 8.0),
 (-2.3953373439593952e+157, 17.0),
 (-2.0508341123816214e+158, 1.0),
 (-2.5688829063424619e+158, 20.0),
 (-5.305656546015105e+157, 2.0),
 (-3.8984762626847825e+158, 77.0),
 (-4.4166316355918556e+158, 76.0),
 (-3.2565671349183053e+157, 5.0),
 (-1.3628082329851984e+158, 2.0),
 (-3.9551400651402161e+158, 63.0),
 (-4.2146881639332074e+158, 65.0),
 (-5.5088865772992951e+158, 54.0),
 (-5.7693344873912617e+158, 48.0),
 (-2.1931161333599686e+158, 88.0),
 (-3.2847618347502878e+158, 59.0),
 (-3.3637744658643496e+158, 67.0),
 (-4.6590220497750529e+158, 87.0),
 (-5.1935357154315489e+158, 55.0)]

In [143]:
"""
出现上述结果（预测值发散到 1e157 量级）的主要原因在于：特征数据中存在大量（7 个）类别特征，未进行转换就直接作为数值输入线性模型；应该先使用 1-of-K（one-hot）编码进行转换操作
"""
print


