In [1]:
import glob
import os
import sys

# NOTE(review): hard-coded, machine-specific working directory. Nothing visible
# in this notebook reads a relative path, so confirm it is still required.
os.chdir("C:/dataanalytics/python")

# Configure the environment: point SPARK_HOME at the Spark installation
# directory unless the surrounding environment already defines it.
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = 'C:\\spark'

# Root path of the Spark installation.
SPARK_HOME = os.environ['SPARK_HOME']

# Make the Python sources bundled with Spark importable. Check your
# installation to make sure these directories and zip files actually exist.
SPARK_PYTHON_LIB = os.path.join(SPARK_HOME, "python", "lib")
sys.path.insert(0, os.path.join(SPARK_HOME, "python"))
sys.path.insert(0, SPARK_PYTHON_LIB)
# The py4j archive name embeds its version (e.g. py4j-0.10.6-src.zip) and
# changes between Spark releases, so discover it instead of hard-coding it.
for py4j_zip in sorted(glob.glob(os.path.join(SPARK_PYTHON_LIB, "py4j-*-src.zip"))):
    sys.path.insert(0, py4j_zip)
sys.path.insert(0, os.path.join(SPARK_PYTHON_LIB, "pyspark.zip"))

# Initialize a Spark context (importable only after the sys.path setup above).
from pyspark import SparkContext
from pyspark import SparkConf

# Optional Spark configuration: app name, two local worker threads, 1g executor memory.
conf = (
    SparkConf()
    .setAppName("V2Maestros")
    .setMaster("local[2]")
    .set("spark.executor.memory", "1g")
)

# getOrCreate() returns the already-running context when there is one, so
# re-running this cell no longer fails with "Cannot run multiple SparkContexts".
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import dataframe
from pyspark.sql import Row

In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import HashingTF,Tokenizer,IDF
from pyspark.ml.classification import LogisticRegression

In [4]:
# Obtain the SparkSession for this notebook. The master URL and application
# name are taken from `conf`: Builder.config(conf=conf) copies every entry of
# the SparkConf into the builder options, so setting a different master or
# app name on the builder before it would be overwritten and only mislead.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [6]:
# Training data as a DataFrame: four (label, feature vector) rows with
# columns "label" and "features".
labeled_vectors = [
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5])),
]
training = spark.createDataFrame(labeled_vectors, schema=["label", "features"])

In [7]:
# Logistic regression instance; this object is an estimator (not yet fitted).
lr = LogisticRegression(
    regParam=0.01,
    maxIter=10,
)

In [8]:
# Print the estimator's parameters with their documentation and any defaults.
lr_params_doc = lr.explainParams()
print("Logisticregression parameter :\n" + lr_params_doc + "\n")

Logisticregression parameter :
aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bou

In [9]:
# Learn a logistic regression model from the training DataFrame.
model1 = lr.fit(dataset=training)

In [10]:
# model1 is a transformer produced by the estimator; its param map holds the
# (Param: value) pairs, which are printed below.
model1_params = model1.extractParamMap()
print("model 1 was using parameters:")
print(model1_params)

model 1 was using parameters:
{Param(parent='LogisticRegression_43e7b61ebeb7f6ff7ebc', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name'): 'rawPrediction', Param(parent='LogisticRegression_43e7b61ebeb7f6ff7ebc', name='threshold', doc='threshold in binary classification prediction, in range [0, 1]'): 0.5, Param(parent='LogisticRegression_43e7b61ebeb7f6ff7ebc', name='featuresCol', doc='features column name'): 'features', Param(parent='LogisticRegression_43e7b61ebeb7f6ff7ebc', name='standardization', doc='whether to standardize the training features before fitting the model'): True, Param(parent='LogisticRegression_43e7b61ebeb7f6ff7ebc', name='regParam', doc='regularization parameter (>= 0)'): 0.01, Param(parent='LogisticRegression_43e7b61ebeb7f6ff7ebc', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.0, Param(parent='LogisticRegression_43e7b61ebe

In [12]:
# Alternatively, parameters can be specified as a dict keyed by Param objects.
paramMap = {}
paramMap[lr.maxIter] = 20


In [16]:
# Overwrite the maxIter value already stored in the map.
paramMap.update({lr.maxIter: 30})


In [17]:
# Add two more parameters to the map: regularization strength and threshold.
paramMap[lr.regParam] = 0.1
paramMap[lr.threshold] = 0.55

In [18]:
# Param maps are plain Python dictionaries, so they can be merged.
paramMap2 = {lr.probabilityCol: "myProbability"}  # rename the probability output column
paramMapCombined = dict(paramMap)
paramMapCombined.update(paramMap2)

In [19]:
# Learn a second model using paramMapCombined; its entries override the
# values previously set on lr.
model2 = lr.fit(training, params=paramMapCombined)
model2_params = model2.extractParamMap()
print("model2 was fit using parameters")
print(model2_params)

model2 was fit using parameters
{Param(parent='LogisticRegression_43e7b61ebeb7f6ff7ebc', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name'): 'rawPrediction', Param(parent='LogisticRegression_43e7b61ebeb7f6ff7ebc', name='threshold', doc='threshold in binary classification prediction, in range [0, 1]'): 0.55, Param(parent='LogisticRegression_43e7b61ebeb7f6ff7ebc', name='featuresCol', doc='features column name'): 'features', Param(parent='LogisticRegression_43e7b61ebeb7f6ff7ebc', name='standardization', doc='whether to standardize the training features before fitting the model'): True, Param(parent='LogisticRegression_43e7b61ebeb7f6ff7ebc', name='regParam', doc='regularization parameter (>= 0)'): 0.1, Param(parent='LogisticRegression_43e7b61ebeb7f6ff7ebc', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.0, Param(parent='LogisticRegression_43e7b61e

In [23]:
# Test data as a DataFrame: three (label, feature vector) rows with
# columns "label" and "features".
test_vectors = [
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
    (1.0, Vectors.dense([0.0, 2.2, -1.5])),
]
test = spark.createDataFrame(test_vectors, schema=["label", "features"])

In [24]:
# Make predictions on the test set with the fitted model's transform() method.
# NB: model2 writes its probabilities to "myProbability" because
# probabilityCol was renamed in the param map used to fit it.
prediction = model2.transform(test)
output_columns = ["features", "label", "myProbability", "prediction"]
result = prediction.select(*output_columns).collect()

In [25]:
# One line per collected row: features, true label, class probabilities and
# the predicted label.
row_template = "features=%s, label=%s -> prob=%s, prediction=%s"
for scored in result:
    fields = (scored.features, scored.label, scored.myProbability, scored.prediction)
    print(row_template % fields)

features=[-1.0,1.5,1.3], label=1.0 -> prob=[0.05707304171033977,0.9429269582896603], prediction=1.0
features=[3.0,2.0,-0.1], label=0.0 -> prob=[0.9238522311704088,0.07614776882959128], prediction=0.0
features=[0.0,2.2,-1.5], label=1.0 -> prob=[0.10972776114779119,0.8902722388522087], prediction=1.0


In [39]:
# Pipelines
from pyspark.ml import Pipeline

In [28]:
# Training documents for the text pipeline: (id, text, label) rows.
# NOTE(review): the leading space in the first text and the comma in "b,d"
# look unintentional. Confirm before changing them, since the recorded
# outputs were presumably produced from exactly these strings.
labeled_docs = [
    (0, " a b c d e spark", 1.0),
    (1, "b,d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
]
training = spark.createDataFrame(labeled_docs, schema=["id", "text", "label"])

In [30]:
# Configure an ML pipeline of three stages: Tokenizer, HashingTF and LogisticRegression
# Tokenizer stage: reads the "text" column and writes "words".
tokenizer = Tokenizer(
    outputCol="words",
    inputCol="text",
)

In [31]:
# HashingTF stage: reads the tokenizer's output column and writes "features".
token_column = tokenizer.getOutputCol()
hashingTF = HashingTF(inputCol=token_column, outputCol="features")

In [32]:
# Logistic regression estimator used as the final pipeline stage.
lr = LogisticRegression(
    regParam=0.001,
    maxIter=10,
)

In [40]:
# Assemble the pipeline; stages are listed in the order they are applied.
pipeline_stages = [tokenizer, hashingTF, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [41]:
# Fit the pipeline to the training documents.
model = pipeline.fit(dataset=training)

In [42]:
# Unlabeled test documents: (id, text) rows.
unlabeled_docs = [
    (4, "spark i j k"),
    (5, " L m n"),
    (6, "spark hadoop spark"),
    (7, "Apche hadoop"),
]
test = spark.createDataFrame(unlabeled_docs, schema=["id", "text"])

In [43]:
# Run the fitted pipeline over the test documents.
prediction = model.transform(dataset=test)

In [44]:
# Keep only the columns of interest for printing.
columns_of_interest = ["id", "text", "probability", "prediction"]
selected = prediction.select(*columns_of_interest)

In [45]:
# Print id, text, class probabilities and predicted label for each document.
# The unpacked names are deliberately distinct from the notebook-level
# `prediction` DataFrame: unpacking into `prediction` would rebind it to a
# float and break any later re-run of the cell that calls prediction.select().
for row in selected.collect():
    doc_id, doc_text, prob, predicted_label = row
    print("(%d,%s) --> prob=%s, prediction=%f" % (doc_id, doc_text, str(prob), predicted_label))
    

(4,spark i j k) --> prob=[0.3560065012749446,0.6439934987250555], prediction=1.000000
(5, L m n) --> prob=[0.6419790574283502,0.35802094257164985], prediction=0.000000
(6,spark hadoop spark) --> prob=[0.3821838216142933,0.6178161783857067], prediction=1.000000
(7,Apche hadoop) --> prob=[0.9824720959475101,0.017527904052489846], prediction=0.000000
