In [96]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import GBTRegressor

from pyspark.sql import SparkSession

import numpy as np

spark = SparkSession \
    .builder \
    .appName("Python") \
    .getOrCreate()

In [9]:
df = spark.read.format('com.databricks.spark.csv').\
                       options(header='true', \
                       inferschema='true').\
            load("data/Advertising.csv",header=True)\
            .select('TV', 'Radio', 'Newspaper', 'Sales');

In [13]:
df.registerTempTable("w")

In [17]:
spark.sql("select * from w").show(5)


+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows



In [6]:
df.show(5,True)
df.printSchema()

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows

root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



In [7]:
df.describe().show()

+-------+-----------------+------------------+------------------+------------------+
|summary|               TV|             Radio|         Newspaper|             Sales|
+-------+-----------------+------------------+------------------+------------------+
|  count|              200|               200|               200|               200|
|   mean|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|              0.7|               0.0|               0.3|               1.6|
|    max|            296.4|              49.6|             114.0|              27.0|
+-------+-----------------+------------------+------------------+------------------+



# Подготовим фичи

In [19]:
# данные в вектор
def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])

In [20]:
transformed= transData(df)
transformed.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows



In [23]:
# автоматическое определение категориальных переменных и их индексирование
# а как правильно обрабатывать категории для линейной регрессии?

featureIndexer = VectorIndexer(inputCol="features", \
                               outputCol="indexedFeatures",\
                               maxCategories=4).fit(transformed)

data = featureIndexer.transform(transformed)

In [24]:
data.show(5,True)

+-----------------+-----+-----------------+
|         features|label|  indexedFeatures|
+-----------------+-----+-----------------+
|[230.1,37.8,69.2]| 22.1|[230.1,37.8,69.2]|
| [44.5,39.3,45.1]| 10.4| [44.5,39.3,45.1]|
| [17.2,45.9,69.3]|  9.3| [17.2,45.9,69.3]|
|[151.5,41.3,58.5]| 18.5|[151.5,41.3,58.5]|
|[180.8,10.8,58.4]| 12.9|[180.8,10.8,58.4]|
+-----------------+-----+-----------------+
only showing top 5 rows



In [25]:
# train / test split
(trainingData, testData) = transformed.randomSplit([0.6, 0.4])

In [26]:
trainingData.show(5)
testData.show(5)

+---------------+-----+
|       features|label|
+---------------+-----+
| [4.1,11.6,5.7]|  3.2|
| [5.4,29.9,9.4]|  5.3|
|[7.3,28.1,41.4]|  5.5|
|[7.8,38.9,50.6]|  6.6|
| [8.4,27.2,2.1]|  5.7|
+---------------+-----+
only showing top 5 rows

+----------------+-----+
|        features|label|
+----------------+-----+
|  [0.7,39.6,8.7]|  1.6|
| [13.1,0.4,25.6]|  5.3|
|[16.9,43.7,89.4]|  8.7|
|[19.4,16.0,22.3]|  6.6|
|[19.6,20.1,17.0]|  7.6|
+----------------+-----+
only showing top 5 rows



# Простая модель

In [29]:
from pyspark.ml.regression import LinearRegression

# обычная регрессия (учится по методу наименьших квадратов)
# напомните, как она обучается?
lr = LinearRegression()

In [53]:
# можно использовать разные модели
# generalized linear models (GLM) - как она обучается

# from pyspark.ml.regression import GeneralizedLinearRegression

# glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)

In [78]:
# решающие деревья

# dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

In [97]:
# Градиентный спуск

# rf = GBTRegressor() #numTrees=2, maxDepth=2, seed=42

In [106]:
# создание пайплайна
pipeline = Pipeline(stages=[featureIndexer, lr])  # glr, dt, rf

# обучим
model = pipeline.fit(trainingData)

In [107]:
def modelsummary(model):
    #print ("Note: последняя строчка - Intercept")
    print ("##","-------------------------------------------------")
    print ("##","  Estimate   |   Std.Error | t Values  |  P-value")
    coef = np.append(list(model.coefficients),model.intercept)
    Summary=model.summary

    for i in range(len(Summary.pValues)):
        print ("##",'{:10.6f}'.format(coef[i]),\
        '{:10.6f}'.format(Summary.coefficientStandardErrors[i]),\
        '{:8.3f}'.format(Summary.tValues[i]),\
        '{:10.6f}'.format(Summary.pValues[i]))

    print ("##",'---')
    print ("##","MSE: % .6f" \
          % Summary.meanSquaredError, ", RMSE: % .6f" \
          % Summary.rootMeanSquaredError )
    print ("##","R2: %f" % Summary.r2, ", \
           Total iterations: %i"% Summary.totalIterations)

In [108]:
modelsummary(model.stages[-1])

## -------------------------------------------------
##   Estimate   |   Std.Error | t Values  |  P-value
##   0.047177   0.001701   27.732   0.000000
##   0.189879   0.010926   17.378   0.000000
##   0.000496   0.007644    0.065   0.948328
##   2.680069   0.384556    6.969   0.000000
## ---
## MSE:  2.661049 , RMSE:  1.631272
## R2: 0.913149 ,            Total iterations: 1


In [109]:
# предикт
predictions = model.transform(testData)

In [110]:
#predictions

In [111]:
predictions.select("features","label","prediction").show(5)

+----------------+-----+------------------+
|        features|label|        prediction|
+----------------+-----+------------------+
|  [0.7,39.6,8.7]|  1.6|10.236619371205393|
| [13.1,0.4,25.6]|  5.3|3.3867459675520846|
|[16.9,43.7,89.4]|  8.7| 11.81944965295213|
|[19.4,16.0,22.3]|  6.6| 6.644434366594297|
|[19.6,20.1,17.0]|  7.6| 7.429742576698763|
+----------------+-----+------------------+
only showing top 5 rows



In [112]:
# оценка качества
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("RMSE на тесте = %g" % rmse)

RMSE на тесте = 1.73866


In [113]:
# оценка через sklearn
y_true = predictions.select("label").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))

r2_score: 0.8577740123384001
