In [1]:
import os
import sys

os.chdir("C:/dataanalytics/python")
os.curdir

#Configure the environment . Set this up to the directory where spark is installed
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = 'C:\\spark'
    
#create a variable for our root path
SPARK_HOME = os.environ['SPARK_HOME']

#Add the following paths to the system path. Please check your installation
#to make sure that these zip files actually exists. The names might change as
#versions change
sys.path.insert(0,os.path.join(SPARK_HOME,"python"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","py4j-0.10.6-src.zip"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","pyspark.zip"))
 
#Initialize a spark context
from pyspark import SparkContext
from pyspark import SparkConf

#optionally configure spark
conf = (SparkConf().setAppName("V2Maestros").setMaster("local[2]").set("spark.executor.memory", "1g"))

#Initalize spark context onl runs once
sc = SparkContext(conf=conf)

In [2]:
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.sql import DataFrame, SparkSession

In [3]:
spark = SparkSession.builder.appName("Logreg").master("local").config(conf=conf).getOrCreate()

In [4]:
#loading the data
training = spark.read.format("libsvm").load("C:\spark\data\mllib\sample_libsvm_data.txt")

In [5]:
training.count()

100

In [6]:
#the model
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [7]:
#Fitting the model
lrmodel = lr.fit(training)

In [8]:
#print the coefficients and intercept for logistic regression
print("Coeffiients:" + str(lrmodel.coefficients))
print("Intercept:" + str(lrmodel.intercept))

Coeffiients:(692,[244,263,272,300,301,328,350,351,378,379,405,406,407,428,433,434,455,456,461,462,483,484,489,490,496,511,512,517,539,540,568],[-7.353983524188197e-05,-9.102738505589466e-05,-0.00019467430546904298,-0.00020300642473486668,-3.1476183314863995e-05,-6.842977602660743e-05,1.5883626898239883e-05,1.4023497091372047e-05,0.00035432047524968605,0.00011443272898171087,0.00010016712383666666,0.0006014109303795481,0.0002840248179122762,-0.00011541084736508837,0.000385996886312906,0.000635019557424107,-0.00011506412384575676,-0.00015271865864986808,0.0002804933808994214,0.0006070117471191634,-0.0002008459663247437,-0.0001421075579290126,0.0002739010341160883,0.00027730456244968115,-9.838027027269332e-05,-0.0003808522443517704,-0.00025315198008555033,0.00027747714770754307,-0.0002443619763919199,-0.0015394744687597765,-0.00023073328411331293])
Intercept:0.22456315961250325


In [10]:
#Logostic regression can be used to fit multinomial regression
mlr = LogisticRegression(maxIter=10, elasticNetParam=0.8, regParam=0.3, family="multinomial")
#fitting the multinomial regression model
mlrmodel = mlr.fit(training)

In [12]:
#printing the coefficents and intercepts
#print the coefficients and intercept for logistic regression
print("Coeffiients:" + str(mlrmodel.coefficientMatrix))
print("Intercept:" + str(mlrmodel.interceptVector))

Coeffiients:2 X 692 CSRMatrix
(0,244) 0.0
(0,263) 0.0001
(0,272) 0.0001
(0,300) 0.0001
(0,350) -0.0
(0,351) -0.0
(0,378) -0.0
(0,379) -0.0
(0,405) -0.0
(0,406) -0.0006
(0,407) -0.0001
(0,428) 0.0001
(0,433) -0.0
(0,434) -0.0007
(0,455) 0.0001
(0,456) 0.0001
..
..
Intercept:[-0.12065879445860686,0.12065879445860686]


In [13]:
#extractt the summary from the returned logistic regressionmodel instance
trainingsummary = lrmodel.summary

In [15]:
#obtaining the objetive per iteration
objectivehistory = trainingsummary.objectiveHistory
for objective in objectivehistory:
    print(objective)

0.6833149135741672
0.6662875751473734
0.6217068546034618
0.6127265245887887
0.6060347986802873
0.6031750687571562
0.5969621534836274
0.5940743031983118
0.5906089243339022
0.5894724576491042
0.5882187775729587


In [1]:
import pandas as pd

In [16]:
#obtain the receiver operating characteristic curve
trainingsummary.roc.show()

+---+--------------------+
|FPR|                 TPR|
+---+--------------------+
|0.0|                 0.0|
|0.0|0.017543859649122806|
|0.0| 0.03508771929824561|
|0.0| 0.05263157894736842|
|0.0| 0.07017543859649122|
|0.0| 0.08771929824561403|
|0.0| 0.10526315789473684|
|0.0| 0.12280701754385964|
|0.0| 0.14035087719298245|
|0.0| 0.15789473684210525|
|0.0| 0.17543859649122806|
|0.0| 0.19298245614035087|
|0.0| 0.21052631578947367|
|0.0| 0.22807017543859648|
|0.0| 0.24561403508771928|
|0.0|  0.2631578947368421|
|0.0|  0.2807017543859649|
|0.0|  0.2982456140350877|
|0.0|  0.3157894736842105|
|0.0|  0.3333333333333333|
+---+--------------------+
only showing top 20 rows



In [17]:
print("areaUnderRoc:" + str(trainingsummary.areaUnderROC))

areaUnderRoc:1.0


In [18]:
#set the model threshold to maximize F-Measure
fmeasure = trainingsummary.fMeasureByThreshold

In [19]:
maxfmeasure = fmeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()

In [20]:
bestthreshold = fmeasure.where(fmeasure['F-measure'] == maxfmeasure['max(F-Measure)'])\
.select('threshold').head()['threshold']


In [21]:
lr.setThreshold(bestthreshold)

LogisticRegression_437f901fbdf9d12a2643

In [23]:
#load training data
training1 = spark.read.format("libsvm").load("C:\spark\data\mllib\sample_multiclass_classification_data.txt")

In [26]:
#fit he model
lrmodel1 = lr.fit(training1)

In [27]:
trainingSummary = lrmodel1.summary

In [29]:
#obtaining he objective per iteration
objectivehistory = trainingSummary.objectiveHistory
print("ObjectiveHistory")
for objective in objectivehistory:
    print(objective)
    
#for multiclass we can inspect metrics on a per label basis
print("False positive rate by label")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" %(i, rate))
    
print("True positive rate by label")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" %(i, rate))
    
print("precision by label")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" %(i, prec))
    
print("Recall by label")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" %(i, rec))
    
print("F-Measure by label")
for i, rate in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" %(i, rate))    

ObjectiveHistory
1.098612288668108
1.087602085441699
1.0341156572156232
1.0289859520256006
1.0300389657358995
1.0239965158223991
1.0236097451839508
1.0231082121970012
1.023022220302788
1.0230018151780262
1.0229963739557606
False positive rate by label
label 0: 0.22
label 1: 0.05
label 2: 0.0
True positive rate by label
label 0: 1.0
label 1: 1.0
label 2: 0.46
precision by label
label 0: 0.6944444444444444
label 1: 0.9090909090909091
label 2: 1.0
Recall by label
label 0: 1.0
label 1: 1.0
label 2: 0.46
F-Measure by label
label 0: 0.819672131147541
label 1: 0.9523809523809523
label 2: 0.6301369863013699


In [30]:
accuracy = trainingSummary.accuracy
falsepositiverate =  trainingSummary.weightedFalsePositiveRate
truepositiverate = trainingSummary.weightedTruePositiveRate
fmeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
     %(accuracy, falsepositiverate, truepositiverate,fmeasure, precision,recall))

accuracy: 0.82
FPR: 0.09
TPR: 0.82
F-measure: 0.8007300232766211
Precision: 0.8678451178451179
Recall: 0.82


In [31]:
import tempfile

In [32]:
lrmodelsave = tempfile.mkdtemp(sc, lrmodel)

TypeError: unsupported operand type(s) for +: 'LogisticRegressionModel' and 'str'