In [1]:
import os
import sys

os.chdir("C:/dataanalytics/python")
os.curdir

#Configure the environment . Set this up to the directory where spark is installed
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = 'C:\\spark'
    
#create a variable for our root path
SPARK_HOME = os.environ['SPARK_HOME']

#Add the following paths to the system path. Please check your installation
#to make sure that these zip files actually exists. The names might change as
#versions change
sys.path.insert(0,os.path.join(SPARK_HOME,"python"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","py4j-0.10.6-src.zip"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","pyspark.zip"))
 
#Initialize a spark context
from pyspark import SparkContext
from pyspark import SparkConf

#optionally configure spark
conf = (SparkConf().setAppName("ALS").setMaster("local[2]").set("spark.executor.memory", "1g"))

#Initalize spark context onl runs once
sc = SparkContext(conf=conf)

In [2]:
#cross validation
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
from pyspark.sql import DataFrame, SparkSession

In [3]:
spark = SparkSession.builder.appName("CV").master("local").config(conf=conf).getOrCreate()

In [4]:
#prepare the  train dataframe
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
    (4, "b spark who", 1.0),
    (5, "g d a y", 0.0),
    (6, "spark fly", 1.0),
    (7, "was mapreduce", 0.0),
    (8, "e spark program", 1.0),
    (9, "a e c l", 0.0),
    (10, "spark compile", 1.0),
    (11, "hadoop software", 0.0)
], ["id", "text", "label"])

In [5]:
#configure the ML pipeline,  which consist of three stages, tokenier, hashingTf and lr
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer,hashingTF,lr])

In [13]:
#we now build anparamGrid parameter
paramGrid = ParamGridBuilder()\
            .addGrid(hashingTF.numFeatures,[10,100,1000])\
            .addGrid(lr.regParam, [0.1,0.01])\
            .build()
           

In [14]:
#creating crossvalidation
crossVal = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3   )

In [15]:
crossVal

CrossValidator_42728c0ea3c6713ccef7

In [16]:
#run crossvalidation to fit our data and choose the best set of parameters
cvModel = crossVal.fit(training)

In [17]:
#prepare the test documents
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop")
], ["id", "text"])

In [18]:
#make predictions on the test document
prediction = cvModel.transform(test)

In [19]:
selected = prediction.select("id","text","probability","prediction")
for row in selected.collect():
    print(row)

Row(id=4, text='spark i j k', probability=DenseVector([0.2581, 0.7419]), prediction=1.0)
Row(id=5, text='l m n', probability=DenseVector([0.9186, 0.0814]), prediction=0.0)
Row(id=6, text='mapreduce spark', probability=DenseVector([0.432, 0.568]), prediction=1.0)
Row(id=7, text='apache hadoop', probability=DenseVector([0.6766, 0.3234]), prediction=0.0)


In [20]:
#trainvalidation
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import  TrainValidationSplit

In [21]:
#loading the data
data = spark.read.format('libsvm').load(r'C:\spark\data\mllib\sample_linear_regression_data.txt')
data.collect()

[Row(label=-9.490009878824548, features=SparseVector(10, {0: 0.4551, 1: 0.3664, 2: -0.3826, 3: -0.4458, 4: 0.3311, 5: 0.8067, 6: -0.2624, 7: -0.4485, 8: -0.0727, 9: 0.5658})),
 Row(label=0.2577820163584905, features=SparseVector(10, {0: 0.8387, 1: -0.127, 2: 0.4998, 3: -0.2269, 4: -0.6452, 5: 0.1887, 6: -0.5805, 7: 0.6519, 8: -0.6556, 9: 0.1749})),
 Row(label=-4.438869807456516, features=SparseVector(10, {0: 0.5026, 1: 0.1421, 2: 0.16, 3: 0.505, 4: -0.9372, 5: -0.2842, 6: 0.6356, 7: -0.1646, 8: 0.9481, 9: 0.4268})),
 Row(label=-19.782762789614537, features=SparseVector(10, {0: -0.0389, 1: -0.4167, 2: 0.8997, 3: 0.641, 4: 0.2733, 5: -0.2618, 6: -0.2795, 7: -0.1307, 8: -0.0854, 9: -0.0546})),
 Row(label=-7.966593841555266, features=SparseVector(10, {0: -0.062, 1: 0.6546, 2: -0.6979, 3: 0.6677, 4: -0.0794, 5: -0.4389, 6: -0.6081, 7: -0.6415, 8: 0.7314, 9: -0.0268})),
 Row(label=-7.896274316726144, features=SparseVector(10, {0: -0.1581, 1: 0.2657, 2: 0.3997, 3: -0.3693, 4: 0.1432, 5: -0.25

In [23]:
train, test = data.randomSplit([0.9,0.1], seed=1234)

In [24]:
#model
lr = LinearRegression(maxIter=10)

In [25]:
#paramgridbuilder
paramGrid = ParamGridBuilder()\
            .addGrid(lr.regParam,[0.1,0.01])\
            .addGrid(lr.fitIntercept, [False,True])\
            .addGrid(lr.elasticNetParam, [0.0,0.5,1.0])\
            .build()

In [28]:
#creating he trainValidationsplit
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(),
                           trainRatio=0.80)

In [29]:
#training the model
model = tvs.fit(train)

In [30]:
#predictions
model.transform(test).select("features","label","prediction").show()

+--------------------+--------------------+--------------------+
|            features|               label|          prediction|
+--------------------+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...| -28.571478869743427|-0.33469735687338814|
|(10,[0,1,2,3,4,5,...| -15.056482974542433|  1.4101966740275358|
|(10,[0,1,2,3,4,5,...| -14.328978509075442|  1.8916922148760926|
|(10,[0,1,2,3,4,5,...| -13.976130931152703|  0.2612650835303238|
|(10,[0,1,2,3,4,5,...|  -9.789294452221961|  1.4150794532758935|
|(10,[0,1,2,3,4,5,...|  -8.680225911784335|  0.8812376791467633|
|(10,[0,1,2,3,4,5,...|  -6.556192430758147|-0.18952551869167794|
|(10,[0,1,2,3,4,5,...| -6.3459370724834265|   2.802689397708973|
|(10,[0,1,2,3,4,5,...|  -5.615143641864686| -3.1863463103590863|
|(10,[0,1,2,3,4,5,...|  -4.706701061062994| -1.0520729378626856|
|(10,[0,1,2,3,4,5,...| -4.2775224863223915| 0.07911282895677055|
|(10,[0,1,2,3,4,5,...| -3.9916779937384743| -1.9744613262886295|
|(10,[0,1,2,3,4,5,...| -1