In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--conf spark.sql.catalogImplementation=in-memory pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
spark

In [3]:
import pandas as pd

In [4]:
pdf = pd.read_csv("diamonds.csv", header=0).iloc[:,1:]

In [5]:
pdf.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


## Feature engineering

In [6]:
pdf['cut'] = pdf['cut'].replace({'Fair': 0, 'Good': 1, 'Very Good': 2, 'Premium': 3, 'Ideal': 4})
pdf['color'] = pdf['color'].replace({'J': 0, 'I': 1, 'H': 2, 'G': 3, 'F': 4, 'E': 5, 'D': 6})
pdf['clarity'] = pdf['clarity'].replace({'I1': 0, 'SI1': 1, 'SI2': 2, 'VS1': 3, 'VS2': 4, 'VVS1': 5, 'VVS2': 6, 'IF': 7})
pdf.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,4,5,2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,5,1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,5,3,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,1,4,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,0,2,63.3,58.0,335,4.34,4.35,2.75


In [7]:
# Split data into a labels dataframe and a features dataframe
labels = pdf['price'].values
featureNames = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']
features = pdf[featureNames].values

In [8]:
# Normalize features (columns) to have unit variance
from sklearn.preprocessing import normalize
features = normalize(features, axis=0)
features

array([[0.00106702, 0.00553547, 0.005655  , ..., 0.0029123 , 0.00293078,
        0.00289958],
       [0.00097424, 0.0041516 , 0.005655  , ..., 0.00286806, 0.00282769,
        0.00275639],
       [0.00106702, 0.00138387, 0.005655  , ..., 0.00298603, 0.00299705,
        0.00275639],
       ...,
       [0.00324745, 0.00276773, 0.006786  , ..., 0.00417307, 0.00418262,
        0.00424794],
       [0.00398973, 0.0041516 , 0.002262  , ..., 0.00453434, 0.00450662,
        0.00446272],
       [0.00347941, 0.00553547, 0.006786  , ..., 0.0042984 , 0.00432253,
        0.0043434 ]])

## Parameter tuning using Spark
Parameter tuning is the task of tuning (hyper)parameters of a learning or prediction system in order to improve the results. It is commonly done by training multiple models (each using different parameters) on one set of data and then testing those models on another held-out set of data (and maybe repeating). By testing on a held-out set not seen during training, we can tune the parameters in a data-driven way while limiting the risk of overfitting.

In this section, we will use k-fold cross validation, which works as follows:

Randomly split the data into k equal-sized subsets ("folds").
For i = 1, 2, ..., k,
Hold out fold i as a validation set.
Create a training set by combining all folds except for i.
For each set of parameters,
Train a model with that set of parameters.
Test the model on the validation set to compute a validation error.
For each set of parameters,
Compute the average validation error (averaging over the k models for this set of parameters).
Choose the best set of parameters, based on the average validation error.
Re-train on the entire dataset, using this best set of parameters.
Note that for each (fold, parameter set) pair, the task of training a model can be done independently of other folds and parameter sets. We will parallelize these tasks: scikit-learn will be used on each worker to do the training. This parallelization is especially helpful since training is the most computationally costly part of this workflow. If you use k folds of cross validation to test P different parameter settings, then distributing the task to train 1 model per worker can make it run close to k*P times faster!

We will also hold out some additional data for testing. We will use it to demonstrate the worth of careful parameter tuning by comparing:

Our initial model (with poorly chosen parameters)
The final model (with carefully tuned parameters)

### Hold out a random test set
We hold out a random sample of the data for testing. Note that this randomness can cause this notebook to produce different results each time it is run.

In [9]:
import sklearn

In [10]:
# Hold out 30% of the data for testing.  We will use the rest for training.
from sklearn.model_selection import train_test_split

trainingLabels, testLabels, trainingFeatures, testFeatures = train_test_split(labels, features, test_size=0.3)
ntrain, ntest = len(trainingLabels), len(testLabels)
print('Split data randomly into 2 sets: %d training and %d test instances.' % (ntrain, ntest))

Split data randomly into 2 sets: 37758 training and 16182 test instances.


### Split data and define tasks to distribute
Each distributed task will be a (fold, parameter set) pair. It will correspond to 1 model we train.

In [11]:
# We use scikit-learn's cross_validation module, which helps split our data randomly into k equal-size parts ("folds").
from sklearn.model_selection import KFold
numFolds = 3 # You may want to use more (10 or so) in practice
kf = KFold(n_splits=numFolds)

In [12]:
# "alphas" is a list of hyperparameter values to test
alphas = [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
# Create a list of tasks to distribute
tasks = []
for alpha in alphas:
    for fold in range(numFolds):
        tasks = tasks + [(alpha, fold)]

In [13]:
tasks

[(0.0, 0),
 (0.0, 1),
 (0.0, 2),
 (0.0001, 0),
 (0.0001, 1),
 (0.0001, 2),
 (0.001, 0),
 (0.001, 1),
 (0.001, 2),
 (0.01, 0),
 (0.01, 1),
 (0.01, 2),
 (0.1, 0),
 (0.1, 1),
 (0.1, 2),
 (1.0, 0),
 (1.0, 1),
 (1.0, 2),
 (10.0, 0),
 (10.0, 1),
 (10.0, 2),
 (100.0, 0),
 (100.0, 1),
 (100.0, 2),
 (1000.0, 0),
 (1000.0, 1),
 (1000.0, 2)]

In [15]:
len(tasks)

27

In [16]:
# Create an RDD of tasks.  We set the number of partitions equal to the number of tasks to ensure maximum parallelism.
tasksRDD = sc.parallelize(tasks, numSlices = len(tasks))

### Broadcast dataset

In [17]:
trainingFeaturesBroadcast = sc.broadcast(trainingFeatures)
trainingLabelsBroadcast = sc.broadcast(trainingLabels)

### Run cross-validation in parallel
We define a function which will run on each worker. This function takes 1 task (1 hyperparameter alpha value + 1 fold index) and trains the corresponding model. We then use RDD.map to run these tasks in parallel.

In [18]:
from sklearn import linear_model

def trainOneModel(alpha, fold):
    """
    Given 1 task (1 hyperparameter alpha value + 1 fold index), train the corresponding model.
    Return: model, score on the fold's test data, task info.
    """
    localTrainingFeatures = trainingFeaturesBroadcast.value
    localTrainingLabels = trainingLabelsBroadcast.value
    # Extract indices for this fold
    trainIndex, valIndex = [], []
    fold_ = 0 # index into folds 'kf'
    for trainIndex_, valIndex_ in kf.split(localTrainingFeatures):
        if fold_ == fold:
            trainIndex, valIndex = trainIndex_, valIndex_
            break
        fold_ += 1
    # Get training data from the broadcast variables
    X_train, X_val = localTrainingFeatures[trainIndex], localTrainingFeatures[valIndex]
    Y_train, Y_val = localTrainingLabels[trainIndex], localTrainingLabels[valIndex]
    # Train the model, and score it
    clf = linear_model.Ridge(alpha=alpha)
    clf.fit(X_train, Y_train)
    score = clf.score(X_val, Y_val)
    return clf, score, alpha, fold

In [19]:
# LEARN!  We now map our tasks RDD and apply the training function to each task.
# After we call an action ("count") on the results, the actual training is executed.
trainedModelAndScores = tasksRDD.map(lambda alpha_fold: trainOneModel(alpha_fold[0], alpha_fold[1]))
trainedModelAndScores.cache()
trainedModelAndScores.count()

27

In [20]:
# Since we are done with our broadcast variables, we can clean them up.
# (This will happen automatically, but we can make it happen earlier by explicitly unpersisting the broadcast variables.
trainingFeaturesBroadcast.unpersist()
trainingLabelsBroadcast.unpersist()

### Collect results to get the best hyperparameter alpha

In [21]:
# Collect the results.
allScores = trainedModelAndScores.map(lambda x: (x[1], x[2], x[3])).collect()
# Average scores over folds
avgScores = dict(map(lambda alpha: (alpha, 0.0), alphas))

for score, alpha, fold in allScores:
    avgScores[alpha] += score
for alpha in alphas:
    avgScores[alpha] /= numFolds
avgScores

{0.0: 0.8800442053490981,
 0.0001: 0.8933448739675794,
 0.001: 0.8914187128811696,
 0.01: 0.877910881670763,
 0.1: 0.7294722202592405,
 1.0: 0.2278682262797775,
 10.0: 0.028369188141010304,
 100.0: 0.002825148486747938,
 1000.0: 0.00020004767570277102}

We now have a list of alpha values paired with the corresponding average scores (averaged over the k folds). Let's identify the best score to discover the best value for alpha.

In [22]:
# Find best score
bestAlpha = -1
bestScore = -1
for alpha in alphas:
    if avgScores[alpha] > bestScore:
        bestAlpha = alpha
        bestScore = avgScores[alpha]
print('Found best alpha: %g, which gives score: %g' % (bestAlpha, bestScore))

Found best alpha: 0.0001, which gives score: 0.893345


### Train a final model using the best hyperparameter
We use our chosen value of alpha to train a model on the entire training dataset. Since this is a single training task, we execute it on the driver.

### Train a final model using the best hyperparameter
We use our chosen value of alpha to train a model on the entire training dataset. Since this is a single training task, we execute it on the driver.

In [23]:
# Use bestAlpha, and train a final model.
tunedClf = linear_model.Ridge(alpha=bestAlpha)
tunedClf.fit(trainingFeatures, trainingLabels)

Ridge(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

### Spark-sklearn
https://github.com/databricks/spark-sklearn

In [43]:
import sys

In [44]:
sys.path.insert(0, "/data/home/pavel.klemenkov/.local/lib/python3.5/site-packages/")

In [45]:
from spark_sklearn import GridSearchCV

In [46]:
parameters = {"alpha": alphas}
parameters

{'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]}

In [47]:
est = linear_model.Ridge()

In [49]:
clf = GridSearchCV(sc, est, parameters, n_jobs=4)

TypeError: __init__() got an unexpected keyword argument 'fit_params'

In [32]:
clf.fit(trainingFeatures, trainingLabels)

GridSearchCV(cv=3, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       sc=<SparkContext master=yarn appName=pyspark-shell>, scoring=None,
       verbose=0)

In [33]:
clf.best_estimator_

Ridge(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [34]:
clf.cv_results_

{'split0_test_score': array([8.95047858e-01, 8.94854285e-01, 8.92274790e-01, 8.78672516e-01,
        7.27236876e-01, 2.26493145e-01, 2.81950110e-02, 2.81360204e-03,
        2.05283090e-04]),
 'split1_test_score': array([8.94108059e-01, 8.92793011e-01, 8.91421276e-01, 8.79374574e-01,
        7.35250992e-01, 2.31639931e-01, 2.89400959e-02, 2.93051613e-03,
        2.56946120e-04]),
 'split2_test_score': array([8.33555515e-01, 8.94988347e-01, 8.92123425e-01, 8.77191472e-01,
        7.30388912e-01, 2.29404205e-01, 2.86796868e-02, 2.93253422e-03,
        2.85999026e-04]),
 'mean_test_score': array([8.74237144e-01, 8.94211881e-01, 8.91939830e-01, 8.78412854e-01,
        7.30958927e-01, 2.29179094e-01, 2.86049312e-02, 2.89221747e-03,
        2.49409412e-04]),
 'std_test_score': array([2.87688141e-02, 1.00478441e-03, 3.71843867e-04, 9.09964217e-04,
        3.29648330e-03, 2.10718743e-03, 3.08738484e-04, 5.55956028e-05,
        3.33803044e-05]),
 'rank_test_score': array([4, 1, 2, 3, 5, 6, 7, 8,

## Model conversion

In [35]:
from spark_sklearn import Converter

In [36]:
converter = Converter(sc)

In [37]:
est = linear_model.LinearRegression()

In [38]:
est.fit(trainingFeatures, trainingLabels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [39]:
spark_est = converter.toSpark(est)

In [41]:
type(spark_est)

pyspark.ml.regression.LinearRegressionModel

In [42]:
spark_est.coefficients

DenseVector([2350317.6733, 95198.8782, 257096.4591, 300959.7313, -1377160.0658, -467172.5487, -1454856.1685, 59933.2479, -674.9436])

In [43]:
spark_est.intercept

6572.206514350702

In [44]:
from pyspark.sql.types import *
from pyspark.ml.linalg import DenseVector, VectorUDT

In [45]:
schema = StructType(fields=[
    StructField("features", VectorUDT()),
    StructField("labels", IntegerType())
])

In [46]:
test_df = spark.createDataFrame(zip(map(DenseVector, testFeatures), map(int, testLabels)), schema=schema)

In [47]:
test_df.show()

+--------------------+------+
|            features|labels|
+--------------------+------+
|[0.00723717592604...|  9788|
|[0.00143815675453...|   544|
|[0.00250517628209...|  1619|
|[0.00468560749057...|  6670|
|[0.00468560749057...|  4989|
|[0.00324745073604...|  1715|
|[0.00236599982197...|  1656|
|[0.00593819563162...| 12061|
|[0.00380415657651...|  2741|
|[0.00468560749057...|  4416|
|[0.00115980383430...|   445|
|[0.01029905804860...| 13703|
|[0.00259796058883...|  1286|
|[0.00143815675453...|   757|
|[0.00491756825744...|  3936|
|[0.00324745073604...|  2676|
|[0.00422168595685...|  4770|
|[0.00932482282778...| 17220|
|[0.00194847044162...|  1179|
|[0.00282992135569...|  2575|
+--------------------+------+
only showing top 20 rows



In [48]:
spark_est.transform(test_df)

Py4JJavaError: An error occurred while calling o156.transform.
: java.util.NoSuchElementException: Failed to find a default value for loss
	at org.apache.spark.ml.param.Params$$anonfun$getOrDefault$2.apply(params.scala:780)
	at org.apache.spark.ml.param.Params$$anonfun$getOrDefault$2.apply(params.scala:780)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.ml.param.Params$class.getOrDefault(params.scala:779)
	at org.apache.spark.ml.PipelineStage.getOrDefault(Pipeline.scala:42)
	at org.apache.spark.ml.param.Params$class.$(params.scala:786)
	at org.apache.spark.ml.PipelineStage.$(Pipeline.scala:42)
	at org.apache.spark.ml.regression.LinearRegressionParams$class.validateAndTransformSchema(LinearRegression.scala:110)
	at org.apache.spark.ml.regression.LinearRegressionModel.validateAndTransformSchema(LinearRegression.scala:640)
	at org.apache.spark.ml.PredictionModel.transformSchema(Predictor.scala:192)
	at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:74)
	at org.apache.spark.ml.PredictionModel.transform(Predictor.scala:203)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [24]:
spark.stop()