In [1]:
!pip install pyspark



In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession \
        .builder \
        .appName("LogisticRegressionSummary") \
        .getOrCreate()

In [3]:
from pyspark.ml.classification import LogisticRegression

In [4]:
#file_location = "E:\BIG DATA WORKSPACE\Mini Project\Data.csv"
file_location = "Data\data.csv"
file_type = "csv"

# CSV options
infer_schema = True
first_row_is_header = False
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
rawstrokeDF = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

In [5]:
rawstrokeDF.show(5, False)

+---+---------------+---+---+---+---+--------+---------+-------+-------------+-----------------------------+--------------------+-----------------+------+-----+----+----+----+----+----+----+----+----+----+----+----------------------+----+----+----+----+----+----+----+----+----+----+----+
|_c0|_c1            |_c2|_c3|_c4|_c5|_c6     |_c7      |_c8    |_c9          |_c10                         |_c11                |_c12             |_c13  |_c14 |_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25                  |_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|
+---+---------------+---+---+---+---+--------+---------+-------+-------------+-----------------------------+--------------------+-----------------+------+-----+----+----+----+----+----+----+----+----+----+----+----------------------+----+----+----+----+----+----+----+----+----+----+----+
|1  |Cash loans     |M  |N  |Y  |0  |202500.0|406597.5 |24700.5|Working      |Secondary / secondary special|Single / not married|Hous

In [6]:
from pyspark.sql.functions import isnull, when, count, col
rawstrokeDF.select([count(when(isnull(c), c)).alias(c) for c in rawstrokeDF.columns]).show()

+---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|_c0|_c1|_c2|_c3|_c4|_c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|
+---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|
+---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+



In [7]:
rawstrokeDF.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: double (nullable = true)
 |-- _c7: double (nullable = true)
 |-- _c8: double (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: integer (nullable = true)
 |-- _c14: integer (nullable = true)
 |-- _c15: integer (nullable = true)
 |-- _c16: integer (nullable = true)
 |-- _c17: integer (nullable = true)
 |-- _c18: integer (nullable = true)
 |-- _c19: integer (nullable = true)
 |-- _c20: double (nullable = true)
 |-- _c21: integer (nullable = true)
 |-- _c22: integer (nullable = true)
 |-- _c23: integer (nullable = true)
 |-- _c24: integer (nullable = true)
 |-- _c25: string (nullable = true)
 |-- _c26: integer (nullable = true)
 |-- _c27: integer 

In [8]:
rawstrokeDF.count()

307497

In [9]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col

# df2 = rawstrokeDF.withColumn("_c0",rawstrokeDF._c0.cast('double'))

In [10]:
intcols = ['_c0','_c5','_c13','_c14','_c15','_c16','_c17','_c18','_c19','_c21','_c22','_c23','_c24','_c26','_c27','_c28','_c29','_c30','_c31','_c32','_c33','_c34','_c35','_c36']
for col_name in intcols:
    rawstrokeDF=rawstrokeDF.withColumn(col_name,col(col_name).cast('double'))

In [11]:
rawstrokeDF.printSchema()

root
 |-- _c0: double (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: double (nullable = true)
 |-- _c6: double (nullable = true)
 |-- _c7: double (nullable = true)
 |-- _c8: double (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: double (nullable = true)
 |-- _c14: double (nullable = true)
 |-- _c15: double (nullable = true)
 |-- _c16: double (nullable = true)
 |-- _c17: double (nullable = true)
 |-- _c18: double (nullable = true)
 |-- _c19: double (nullable = true)
 |-- _c20: double (nullable = true)
 |-- _c21: double (nullable = true)
 |-- _c22: double (nullable = true)
 |-- _c23: double (nullable = true)
 |-- _c24: double (nullable = true)
 |-- _c25: string (nullable = true)
 |-- _c26: double (nullable = true)
 |-- _c27: double (nullable = tru

In [12]:
type(rawstrokeDF)

pyspark.sql.dataframe.DataFrame

In [13]:
!pip install pyArrow




In [14]:
df1=rawstrokeDF.to_pandas_on_spark()




In [15]:
df1.columns = ["Target","NAME_CONTRACT_TYPE","CODE_GENDER","FLAG_OWN_CAR","FLAG_OWN_REALTY","CNT_CHILDREN","AMT_INCOME_TOTAL","AMT_CREDIT","AMT_ANNUITY",
             "NAME_INCOME_TYPE","NAME_EDUCATION_TYPE","NAME_FAMILY_STATUS","NAME_HOUSING_TYPE","DAYS_BIRTH","DAYS_EMPLOYED","OWN_CAR_AGE","FLAG_MOBIL",
             "FLAG_EMP_PHONE","FLAG_WORK_PHONE","FLAG_CONT_MOBILE","FLAG_PHONE","OCCUPATION_TYPE","CNT_FAM_MEMBERS","REGION_RATING_CLIENT",
             "REGION_RATING_CLIENT_W_CITY","REG_REGION_NOT_LIVE_REGION","REG_REGION_NOT_WORK_REGION","ORGANIZATION_TYPE","FLAG_DOCUMENT_2",
             "FLAG_DOCUMENT_3","FLAG_DOCUMENT_4","FLAG_DOCUMENT_5","FLAG_DOCUMENT_6","FLAG_DOCUMENT_7","FLAG_DOCUMENT_8","FLAG_DOCUMENT_9",
             "FLAG_DOCUMENT_10"]


In [16]:
df1.head(5)

Unnamed: 0,Target,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,ORGANIZATION_TYPE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10
0,1.0,Cash loans,M,N,Y,0.0,202500.0,406597.5,24700.5,Working,Secondary / secondary special,Single / not married,House / apartment,-9461.0,-637.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,2.0,0.0,0.0,Business Entity Type 3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,Cash loans,F,N,N,0.0,270000.0,1293502.5,35698.5,State servant,Higher education,Married,House / apartment,-16765.0,-1188.0,1.0,1.0,0.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,School,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,Revolving loans,M,Y,Y,0.0,67500.0,135000.0,6750.0,Working,Secondary / secondary special,Single / not married,House / apartment,-19046.0,-225.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,0.0,0.0,Government,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,Cash loans,F,N,Y,0.0,135000.0,312682.5,29686.5,Working,Secondary / secondary special,Civil marriage,House / apartment,-19005.0,-3039.0,1.0,1.0,0.0,1.0,0.0,2.0,2.0,2.0,0.0,0.0,Business Entity Type 3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,Cash loans,M,N,Y,0.0,121500.0,513000.0,21865.5,Working,Secondary / secondary special,Single / not married,House / apartment,-19932.0,-3038.0,1.0,1.0,0.0,1.0,0.0,1.0,2.0,2.0,0.0,0.0,Religion,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [17]:
type(df1)

pyspark.pandas.frame.DataFrame

In [18]:
rawstrokeDF = df1.to_spark()


In [19]:
type(rawstrokeDF)

pyspark.sql.dataframe.DataFrame

In [20]:
rawstrokeDF.printSchema()

root
 |-- Target: double (nullable = true)
 |-- NAME_CONTRACT_TYPE: string (nullable = true)
 |-- CODE_GENDER: string (nullable = true)
 |-- FLAG_OWN_CAR: string (nullable = true)
 |-- FLAG_OWN_REALTY: string (nullable = true)
 |-- CNT_CHILDREN: double (nullable = true)
 |-- AMT_INCOME_TOTAL: double (nullable = true)
 |-- AMT_CREDIT: double (nullable = true)
 |-- AMT_ANNUITY: double (nullable = true)
 |-- NAME_INCOME_TYPE: string (nullable = true)
 |-- NAME_EDUCATION_TYPE: string (nullable = true)
 |-- NAME_FAMILY_STATUS: string (nullable = true)
 |-- NAME_HOUSING_TYPE: string (nullable = true)
 |-- DAYS_BIRTH: double (nullable = true)
 |-- DAYS_EMPLOYED: double (nullable = true)
 |-- OWN_CAR_AGE: double (nullable = true)
 |-- FLAG_MOBIL: double (nullable = true)
 |-- FLAG_EMP_PHONE: double (nullable = true)
 |-- FLAG_WORK_PHONE: double (nullable = true)
 |-- FLAG_CONT_MOBILE: double (nullable = true)
 |-- FLAG_PHONE: double (nullable = true)
 |-- OCCUPATION_TYPE: double (nullable = tr

In [21]:
#from pyspark.ml.feature import OneHotEncoder,StringIndexer

In [22]:
# #Create StringIndexer Object 
# SI_NAME_CONTRACT_TYPE = StringIndexer(inputCol="NAME_CONTRACT_TYPE",outputCol="NAME_CONTRACT_TYPE_Index")
# SI_CODE_GENDER = StringIndexer(inputCol="CODE_GENDER",outputCol="CODE_GENDER_Index")
# SI_FLAG_OWN_CAR= StringIndexer(inputCol="FLAG_OWN_CAR",outputCol="FLAG_OWN_CAR_Index")
# SI_FLAG_OWN_REALTY = StringIndexer(inputCol="FLAG_OWN_REALTY",outputCol="FLAG_OWN_REALTY_Index")
# SI_NAME_INCOME_TYPE= StringIndexer(inputCol="NAME_INCOME_TYPE",outputCol="NAME_INCOME_TYPE_Index")
# SI_NAME_EDUCATION_TYPE= StringIndexer(inputCol="NAME_EDUCATION_TYPE",outputCol="NAME_EDUCATION_TYPE_Index")
# SI_NAME_FAMILY_STATUS = StringIndexer(inputCol="NAME_FAMILY_STATUS",outputCol="NAME_FAMILY_STATUS_Index")
# SI_NAME_HOUSING_TYPE = StringIndexer(inputCol="NAME_HOUSING_TYPE",outputCol="NAME_HOUSING_TYPE_Index")
# SI_REG_REGION_NOT_LIVE_REGION = StringIndexer(inputCol="REG_REGION_NOT_LIVE_REGION",outputCol="REG_REGION_NOT_LIVE_REGION_Index")


In [23]:
# #transform the data 
# rawstrokeDF = SI_NAME_CONTRACT_TYPE.fit(rawstrokeDF).transform(rawstrokeDF)
# rawstrokeDF = SI_CODE_GENDER.fit(rawstrokeDF).transform(rawstrokeDF)
# rawstrokeDF = SI_FLAG_OWN_CAR.fit(rawstrokeDF).transform(rawstrokeDF)
# rawstrokeDF = SI_FLAG_OWN_REALTY.fit(rawstrokeDF).transform(rawstrokeDF)
# rawstrokeDF = SI_NAME_INCOME_TYPE.fit(rawstrokeDF).transform(rawstrokeDF)
# rawstrokeDF = SI_NAME_EDUCATION_TYPE.fit(rawstrokeDF).transform(rawstrokeDF)
# rawstrokeDF = SI_NAME_FAMILY_STATUS.fit(rawstrokeDF).transform(rawstrokeDF)
# rawstrokeDF = SI_NAME_HOUSING_TYPE.fit(rawstrokeDF).transform(rawstrokeDF)
# rawstrokeDF = SI_REG_REGION_NOT_LIVE_REGION.fit(rawstrokeDF).transform(rawstrokeDF)


In [24]:
#OHE = OneHotEncoder(inputCols =["NAME_CONTRACT_TYPE_Index","CODE_GENDER_Index","FLAG_OWN_CAR_Index","FLAG_OWN_REALTY_Index","NAME_INCOME_TYPE_Index","NAME_EDUCATION_TYPE_Index","NAME_FAMILY_STATUS_Index","NAME_HOUSING_TYPE_Index","REG_REGION_NOT_LIVE_REGION_Index"], outputCols=["NAME_CONTRACT_TYPE_OHE","CODE_GENDER_OHE","FLAG_OWN_CAR_OHE","FLAG_OWN_REALTY_OHE","NAME_INCOME_TYPE_OHE","NAME_EDUCATION_TYPE_OHE","NAME_FAMILY_STATUS_OHE","NAME_HOUSING_TYPE_OHE","REG_REGION_NOT_LIVE_REGION_OHE"])

In [25]:
#rawstrokeDF= OHE.fit(rawstrokeDF).transform(rawstrokeDF)


In [26]:
#rawstrokeDF.show(5,False)

In [27]:
#view and transform data 
#rawstrokeDF.select("NAME_CONTRACT_TYPE_Index","CODE_GENDER_Index","FLAG_OWN_CAR_Index","FLAG_OWN_REALTY_Index","NAME_INCOME_TYPE_Index","NAME_EDUCATION_TYPE_Index","NAME_FAMILY_STATUS_Index","NAME_HOUSING_TYPE_Index","REG_REGION_NOT_LIVE_REGION_Index","NAME_CONTRACT_TYPE_OHE","CODE_GENDER_OHE","FLAG_OWN_CAR_OHE","FLAG_OWN_REALTY_OHE","NAME_INCOME_TYPE_OHE","NAME_EDUCATION_TYPE_OHE","NAME_FAMILY_STATUS_OHE","NAME_HOUSING_TYPE_OHE","REG_REGION_NOT_LIVE_REGION_OHE").show(10)


In [28]:
#from pyspark.ml.feature import VectorIndexer

In [29]:
#features_col =["Target","CNT_CHILDREN","AMT_INCOME_TOTAL","AMT_CREDIT","AMT_ANNUITY","DAYS_BIRTH","DAYS_EMPLOYED","OWN_CAR_AGE","FLAG_MOBIL","FLAG_EMP_PHONE","FLAG_WORK_PHONE","FLAG_CONT_MOBILE","FLAG_PHONE","OCCUPATION_TYPE","CNT_FAM_MEMBERS","REGION_RATING_CLIENT","REGION_RATING_CLIENT_W_CITY","REG_REGION_NOT_WORK_REGION","ORGANIZATION_TYPE","FLAG_DOCUMENT_2","FLAG_DOCUMENT_3","FLAG_DOCUMENT_4","FLAG_DOCUMENT_5","FLAG_DOCUMENT_6","FLAG_DOCUMENT_7","FLAG_DOCUMENT_8","FLAG_DOCUMENT_9","FLAG_DOCUMENT_10", "NAME_CONTRACT_TYPE_OHE","CODE_GENDER_OHE","FLAG_OWN_CAR_OHE","FLAG_OWN_REALTY_OHE","NAME_INCOME_TYPE_OHE","NAME_EDUCATION_TYPE_OHE","NAME_FAMILY_STATUS_OHE","NAME_HOUSING_TYPE_OHE","REG_REGION_NOT_LIVE_REGION_OHE"]


In [30]:
cat_features=["NAME_CONTRACT_TYPE","CODE_GENDER", "FLAG_OWN_CAR","FLAG_OWN_REALTY", "NAME_INCOME_TYPE","NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS","NAME_HOUSING_TYPE", "REG_REGION_NOT_LIVE_REGION"]

In [31]:
len(cat_features)

9

In [32]:
cont_features=["Target","CNT_CHILDREN","AMT_INCOME_TOTAL","AMT_CREDIT","AMT_ANNUITY","DAYS_BIRTH","DAYS_EMPLOYED","OWN_CAR_AGE","FLAG_MOBIL","FLAG_EMP_PHONE","FLAG_WORK_PHONE","FLAG_CONT_MOBILE","FLAG_PHONE","OCCUPATION_TYPE","CNT_FAM_MEMBERS","REGION_RATING_CLIENT","REGION_RATING_CLIENT_W_CITY","REG_REGION_NOT_WORK_REGION","ORGANIZATION_TYPE","FLAG_DOCUMENT_2","FLAG_DOCUMENT_3","FLAG_DOCUMENT_4","FLAG_DOCUMENT_5","FLAG_DOCUMENT_6","FLAG_DOCUMENT_7","FLAG_DOCUMENT_8","FLAG_DOCUMENT_9","FLAG_DOCUMENT_10"]


In [33]:
len(cont_features)

28

In [34]:
from pyspark.sql.functions import col

In [35]:
from pyspark.ml.feature import OneHotEncoder,StringIndexer,VectorAssembler


In [36]:
from pyspark.ml.feature import OneHotEncoder,StringIndexer,VectorAssembler

# defining an empty list to hold transforming stages
# to prepare pipelines
stages=[]


# Encoding categorical features
for catcol in cat_features:
    indexer=StringIndexer(inputCol=catcol,outputCol=catcol+'_index').setHandleInvalid("keep")
    encoder=OneHotEncoder(inputCols=[indexer.getOutputCol()],outputCols=[catcol+"_enc"])
    stages+=[indexer,encoder]

In [37]:
assemblerInputs=[col+"_enc" for col in cat_features]+cont_features
assembler=VectorAssembler(inputCols=assemblerInputs,outputCol="features")
stages+=[assembler]

In [38]:
# Scaling the features vector
from pyspark.ml.feature import MinMaxScaler
scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled_features")
stages+=[scaler]

In [39]:
# Building a spark ml pipeline to transform the data
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)

# Fit the pipeline to training documents.
finalDF = pipeline.fit(rawstrokeDF).transform(rawstrokeDF)

In [40]:
finalDF.printSchema()

root
 |-- Target: double (nullable = true)
 |-- NAME_CONTRACT_TYPE: string (nullable = true)
 |-- CODE_GENDER: string (nullable = true)
 |-- FLAG_OWN_CAR: string (nullable = true)
 |-- FLAG_OWN_REALTY: string (nullable = true)
 |-- CNT_CHILDREN: double (nullable = true)
 |-- AMT_INCOME_TOTAL: double (nullable = true)
 |-- AMT_CREDIT: double (nullable = true)
 |-- AMT_ANNUITY: double (nullable = true)
 |-- NAME_INCOME_TYPE: string (nullable = true)
 |-- NAME_EDUCATION_TYPE: string (nullable = true)
 |-- NAME_FAMILY_STATUS: string (nullable = true)
 |-- NAME_HOUSING_TYPE: string (nullable = true)
 |-- DAYS_BIRTH: double (nullable = true)
 |-- DAYS_EMPLOYED: double (nullable = true)
 |-- OWN_CAR_AGE: double (nullable = true)
 |-- FLAG_MOBIL: double (nullable = true)
 |-- FLAG_EMP_PHONE: double (nullable = true)
 |-- FLAG_WORK_PHONE: double (nullable = true)
 |-- FLAG_CONT_MOBILE: double (nullable = true)
 |-- FLAG_PHONE: double (nullable = true)
 |-- OCCUPATION_TYPE: double (nullable = tr

In [41]:
trainDF, testDF =  finalDF.randomSplit([0.7,0.3], seed = 2020)

# print the count of observations in each set
print("Observations in training set = ", trainDF.count())
print("Observations in testing set = ", testDF.count())

Observations in training set =  215369
Observations in testing set =  92128


In [42]:
from pyspark.ml.classification import LogisticRegression

# Build the LogisticRegression object 'lr' by setting the required parameters
lr = LogisticRegression(featuresCol="features", labelCol="Target",maxIter= 10,regParam=0.3, elasticNetParam=0.8)

# fit the LogisticRegression object on the training data
lrmodel = lr.fit(trainDF)

In [43]:
predictonDF = lrmodel.transform(testDF)

In [44]:
predictonDF.select("Target","rawPrediction", "probability", "prediction").show(10,False)

+------+----------------------------------------+----------------------------------------+----------+
|Target|rawPrediction                           |probability                             |prediction|
+------+----------------------------------------+----------------------------------------+----------+
|0.0   |[2.5109679034452017,-2.5109679034452017]|[0.9249071428598314,0.07509285714016856]|0.0       |
|0.0   |[2.5109679034452017,-2.5109679034452017]|[0.9249071428598314,0.07509285714016856]|0.0       |
|0.0   |[2.5109679034452017,-2.5109679034452017]|[0.9249071428598314,0.07509285714016856]|0.0       |
|0.0   |[2.5109679034452017,-2.5109679034452017]|[0.9249071428598314,0.07509285714016856]|0.0       |
|0.0   |[2.5109679034452017,-2.5109679034452017]|[0.9249071428598314,0.07509285714016856]|0.0       |
|0.0   |[2.5109679034452017,-2.5109679034452017]|[0.9249071428598314,0.07509285714016856]|0.0       |
|0.0   |[2.5109679034452017,-2.5109679034452017]|[0.9249071428598314,0.07509285714

In [45]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Build the BinaryClassificationEvaluator object 'evaluator'
evaluator = BinaryClassificationEvaluator()

# Calculate the accracy and print its value
accuracy = predictonDF.filter(predictonDF.Target == predictonDF.prediction).count()/float(predictonDF.count())
print("Accuracy = ", accuracy)

# evaluate(predictiondataframe) gets area under the ROC curve
#print('Area under the ROC curve = ', evaluator.evaluate(predictonDF))

Accuracy =  0.9202414032650226


In [46]:
# Create model summary object
lrmodelSummary = lrmodel.summary

# Print the following metrics one by one: 
# 1. Accuracy
# Accuracy is a model summary parameter
print("Accuracy = ", lrmodelSummary.accuracy)
# 2. Area under the ROC curve
# Area under the ROC curve is a model summary parameter
print("Area under the ROC curve = ", lrmodelSummary.areaUnderROC)
# 3. Precision (Positive Predictive Value)
# Precision is a model summary parameter
print("Precision = ", lrmodelSummary.weightedPrecision)
# 4. Recall (True Positive Rate)
# Recall is a model summary parameter
print("Recall = ", lrmodelSummary.weightedRecall)
# 5. F1 Score (F-measure)
# F1 Score is a model summary method
print("F1 Score = ", lrmodelSummary.weightedFMeasure())


Accuracy =  0.91885090240471
Area under the ROC curve =  1.0
Precision =  0.8442869808499499
Recall =  0.91885090240471
F1 Score =  0.879992270156985


In [47]:
pip install kafka-python

Collecting kafka-python
  Downloading kafka_python-2.0.2-py2.py3-none-any.whl (246 kB)Note: you may need to restart the kernel to use updated packages.

Installing collected packages: kafka-python
Successfully installed kafka-python-2.0.2


In [49]:
!pip install zookeeper

Collecting zookeeper
  Downloading zookeeper-1.3.2-py3-none-any.whl (43 kB)
Collecting tensorflow-datasets<4.5.0,>=1.3.0
  Downloading tensorflow_datasets-4.4.0-py3-none-any.whl (4.0 MB)
Collecting typeguard<2.13.0,>=2.5.1
  Downloading typeguard-2.12.1-py3-none-any.whl (17 kB)
Collecting dill
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
Collecting protobuf>=3.12.2
  Downloading protobuf-3.19.4-cp38-cp38-win_amd64.whl (895 kB)
Collecting tensorflow-metadata
  Downloading tensorflow_metadata-1.6.0-py3-none-any.whl (48 kB)
Collecting importlib-resources
  Downloading importlib_resources-5.4.0-py3-none-any.whl (28 kB)
Collecting promise
  Downloading promise-2.3.tar.gz (19 kB)
Collecting termcolor
  Using cached termcolor-1.1.0-py3-none-any.whl
Collecting absl-py
  Using cached absl_py-1.0.0-py3-none-any.whl (126 kB)
Collecting googleapis-common-protos<2,>=1.52.0
  Downloading googleapis_common_protos-1.55.0-py2.py3-none-any.whl (212 kB)
Building wheels for collected packages: pr