In [4]:
%load_ext sparkmagic.magics

The sparkmagic.magics extension is already loaded. To reload it, use:
  %reload_ext sparkmagic.magics


Run the following cell to create the Spark session `cpu_session` with `%spark add`. (To manage sessions through the interactive widget instead, use `%manage_spark`.)

In [5]:
# Register a remote PySpark session named `cpu_session`. The `%%spark -s cpu_session`
# cell below runs its code in this session, and `%spark delete -s cpu_session` removes it.
%spark add -s cpu_session -l python -u http://node03.conductor.iccmop:8993 -a u -k config
# NOTE(review): the session settings below (parallelism 30, max cores 30, GPU app disabled)
# are written as comments in this cell; confirm how (or whether) they are applied to the session.
#{"conf": {"spark.default.parallelism":30,"spark.cores.max":30,
    #      "spark.ego.gpu.app": "false"}}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
5,,pyspark,idle,,,✔


SparkSession available as 'spark'.


In [None]:
%%spark -s cpu_session

# Spark ML baseline: fit a logistic regression on the criteo.kaggle2014 training split,
# score it on the test split, and print the accuracy together with the fit time.
# Relies on the `spark` SparkSession that the remote session provides.
from __future__ import print_function
import os
import time
import argparse
import sys

## SPARK

from pyspark import SparkConf, SparkContext
from pyspark.sql.session import SparkSession

# data paths
# Please use the absolute path of the file if you wish to run the example on a distributed mode
data_path = '/shared/kelvin/snapml'
filename = data_path + '/criteo.kaggle2014'
train_filename = filename + '-train.libsvm'
test_filename = filename + '-test.libsvm'

## snapML
# Same environment setup as the snapML cell, so both runs see identical paths.
os.environ["PYTHONPATH"] = '/opt/DL/snap-ml-spark/lib/'
os.environ["SPARK_PYTHON_DIR"] = '/var/conductor/livy-integration/spark-2.3.1-hadoop-2.7/python'
sys.path.append('/opt/DL/snap-ml-spark/lib/')
sys.path.append('/usr/lib64/python2.7/site-packages')

n_features_ = 1000000
snapml_regularizer = 10.0

# Kept after the sys.path additions above, as in the original cell order
# (presumably pyspark.ml needs packages from those directories -- TODO confirm).
from pyspark.ml.classification import LogisticRegression as sparkml_LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Spark reads these through explicit file:// URIs.
# The training URI must point at the *train* split: fitting on the test file would make
# the accuracy below a training-set accuracy and would scale regParam by the wrong count.
train_uri = "file://" + train_filename
test_uri = "file://" + test_filename

# Load training and test data
train_data = spark.read.format("libsvm").option("numFeatures", str(n_features_)).load(train_uri)
test_data = spark.read.format("libsvm").option("numFeatures", str(n_features_)).load(test_uri)
n_examples = train_data.count()

# Create sparkML lib Logistic Regression.
# regParam is the snapML regularizer divided by the number of training examples.
sparkml_lr = sparkml_LogisticRegression(
    fitIntercept=False,
    regParam=snapml_regularizer/n_examples,
    standardization=False,
)

# Fit the model and time it (only the fit is timed, not loading or inference)
sparkml_t0 = time.time()
sparkml_lr_model = sparkml_lr.fit(train_data)
sparkml_time = time.time() - sparkml_t0

# Perform inference on test data
predictions = sparkml_lr_model.transform(test_data)

# Show predictions against test labels
predictions.select("rawPrediction", "prediction", "label", "features").show(10)

# Compute accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
sparkml_accuracy = evaluator.evaluate(predictions)

# Print off Spark result

print('Spark ML', evaluator.getMetricName(),'=', sparkml_accuracy,", time: %.2f" % sparkml_time, 's')


In [None]:
%spark delete -s cpu_session

In [2]:
# Register a remote PySpark session named `gpu_session`. The `%%spark -s gpu_session`
# cell below runs its code in this session, and `%spark delete -s gpu_session` removes it.
%spark add -s gpu_session -l python -u http://node02.conductor.iccmop:8995 -a u -k config
# NOTE(review): the GPU settings below (GPU app enabled, default GPU mode, parallelism 8)
# are written as a comment in this cell; confirm how (or whether) they are applied to the session.
#{"conf": {"spark.ego.gpu.app":"true","spark.ego.gpu.mode":"default","spark.default.parallelism":8}}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
5,,pyspark,idle,,,✔


SparkSession available as 'spark'.


{'conf': {'spark.default.parallelism': 8,
  'spark.ego.gpu.app': 'true',
  'spark.ego.gpu.mode': 'default'}}

In [3]:
%%spark -s gpu_session
# snapML run: fit a snap_ml_spark logistic regression on the criteo.kaggle2014 training
# split, predict on the test split, and print the accuracy together with the fit time.
from __future__ import print_function
import os
import time
import argparse
import sys

# data paths
# Please use the absolute path of the file if you wish to run the example on a distributed mode
data_path = '/shared/kelvin/snapml'
filename = data_path + '/criteo.kaggle2014'
train_filename = filename + '-train.libsvm'
test_filename = filename + '-test.libsvm'

## snapML
# Make the snap-ml-spark library importable; each directory is appended exactly once.
os.environ["PYTHONPATH"] = '/opt/DL/snap-ml-spark/lib/'
os.environ["SPARK_PYTHON_DIR"] = '/var/conductor/livy-integration/spark-2.3.1-hadoop-2.7/python'
sys.path.append('/opt/DL/snap-ml-spark/lib/')
sys.path.append('/usr/lib64/python2.7/site-packages')

## SPARK
from pyspark import SparkConf, SparkContext
from pyspark.sql.session import SparkSession

# These imports must stay below the sys.path additions that expose snap_ml_spark.
from snap_ml_spark import DatasetReader

from snap_ml_spark import LogisticRegression as snapml_LogisticRegression
from snap_ml_spark.Metrics import accuracy, logisticLoss

n_features_ = 1000000
print('n_features: %.d' %n_features_)

# Load training data
train_data = DatasetReader().setFormat("libsvm").setNumFt(n_features_).load(train_filename)

count1 = train_data.count()
print('count1: %.d' %count1)

# Load test data
test_data = DatasetReader().setFormat("libsvm").setNumFt(n_features_).load(test_filename)

# Count the *test* data here (the original counted train_data a second time).
count2 = test_data.count()
##print('count2: %.d' %count2)

# Create snapML Logistic Regression
snapml_regularizer = 10.0

snapml_lr = snapml_LogisticRegression(
    max_iter=20,
    regularizer=snapml_regularizer,
    verbose=False,
    dual=True,
    use_gpu=True,
    n_threads=-1,
    class_weights=None,
)

# Fit the model and time it (only the fit is timed, not loading or inference)
snapml_t0 = time.time()
snapml_lr.fit(train_data)
snapml_time = time.time() - snapml_t0

# Perform inference on test data
pred = snapml_lr.predict(test_data)

# Compute accuracy
snapml_accuracy  = accuracy(pred)

# Print off SnapML  result
print('snapML  accuracy: %.4f' %snapml_accuracy, ", time: %.2f" % snapml_time, 's')



n_features: 1000000
count1: 8
snapML  accuracy: 0.7850 , time: 18.55 s

In [None]:
%spark delete -s gpu_session

In [None]:
%spark cleanup