In [1]:
# Must be included at the beginning of each new notebook.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Data_Mining_GlobalTerrorism').getOrCreate()

# Importing complete dataset

In [2]:
df = spark.read.csv('Complete_dataset.csv', header=True, inferSchema=True)

In [3]:
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Date: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Province: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Attack: string (nullable = true)
 |-- Target: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Group: string (nullable = true)
 |-- Individual: integer (nullable = true)
 |-- Weapon: string (nullable = true)
 |-- Ishostkid: double (nullable = true)



In [4]:
from pyspark.sql import *
spark = SparkSession.builder.appName('Final1').getOrCreate()
# Print data columns.
df.columns

['Year',
 'Month',
 'Date',
 'Country',
 'Region',
 'Province',
 'City',
 'Attack',
 'Target',
 'Nationality',
 'Group',
 'Individual',
 'Weapon',
 'Ishostkid']

# Data Transformation - encoding

In [5]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)

# Goal 2

In [6]:
#indexing

Country_indexer = StringIndexer(inputCol='Country',outputCol='CountryIndex')
Region_indexer = StringIndexer(inputCol='Region',outputCol='RegionIndex')
Province_indexer = StringIndexer(inputCol='Province',outputCol='ProvinceIndex')
City_indexer = StringIndexer(inputCol='City',outputCol='CityIndex')
Target_indexer = StringIndexer(inputCol='Target',outputCol='TargetIndex')
Nationality_indexer = StringIndexer(inputCol='Nationality',outputCol='NationalityIndex')
Group_indexer = StringIndexer(inputCol='Group',outputCol='GroupIndex')
Weapon_indexer = StringIndexer(inputCol='Weapon',outputCol='WeaponIndex')
Attack_indexer = StringIndexer(inputCol='Attack',outputCol='label')

#encoding

Country_encoder = OneHotEncoder(inputCol='CountryIndex',outputCol='CountryVec')
Region_encoder = OneHotEncoder(inputCol='RegionIndex',outputCol='RegionVec')
Province_encoder = OneHotEncoder(inputCol='ProvinceIndex',outputCol='ProvinceVec')
City_encoder = OneHotEncoder(inputCol='CityIndex',outputCol='CityVec')
Target_encoder = OneHotEncoder(inputCol='TargetIndex',outputCol='TargetVec')
Nationality_encoder = OneHotEncoder(inputCol='NationalityIndex',outputCol='NationalityVec')
Group_encoder = OneHotEncoder(inputCol='GroupIndex',outputCol='GroupVec')
Weapon_encoder = OneHotEncoder(inputCol='WeaponIndex',outputCol='WeaponVec')

label_encoder = OneHotEncoder(inputCol='label',outputCol='label')



In [7]:
# Now we can assemble all of this as one vector in the features column. 
assembler = VectorAssembler(inputCols=['Year','CountryVec',
 'RegionVec',
 'ProvinceVec',
 'CityVec',
 'TargetVec',
 'NationalityVec',
 'GroupVec',
 'WeaponVec'],outputCol='features')

In [8]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[Country_indexer,Region_indexer,Province_indexer,City_indexer,Target_indexer,
                            Nationality_indexer,Group_indexer,Weapon_indexer,Attack_indexer,Country_encoder,Region_encoder,
                            Province_encoder,City_encoder,Target_encoder,Nationality_encoder,Group_encoder,Weapon_encoder,assembler])

# Pipeline

In [9]:
pipeline_model = pipeline.fit(df)
pipe_df = pipeline_model.transform(df)
pipe_df = pipe_df.select('label','features')
pipe_df.describe()

DataFrame[summary: string, label: string]

# Splitting of data

In [10]:
train_data, test_data = pipe_df.randomSplit([0.7,0.3])
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Training Dataset Count: 78617
Test Dataset Count: 33239


# ML Algorithms

In [11]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier, LogisticRegression
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler, Normalizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
import pandas as pd
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Random Forest

In [12]:
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train_data)
predictions = rfModel.transform(test_data)

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [14]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.5264598814645447


# GBT & Decision Tree

In [None]:
dtc = DecisionTreeClassifier(labelCol='label',featuresCol='features')
gbt = GBTClassifier(labelCol='label',featuresCol='features')
# Train the models (it's two models, so it might take some time).
dtc_model = dtc.fit(train_data)
gbt_model = gbt.fit(train_data)
dtc_predictions = dtc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

# Let's start off with binary classification.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Note that the label column isn't named label, it's named PrivateIndex in this case.
my_binary_eval = BinaryClassificationEvaluator(labelCol = 'label')

# This is the area under the curve. This indicates that the data is highly seperable.
print("DTC Accuracy")
print(my_binary_eval.evaluate(dtc_predictions))

# We can't repeat these exact steps for GBT. If you print the schema of all three, you may be able to notice why.
# Instead, let's redefine the object:
my_binary_gbt_eval = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol='prediction')
print("GBT Accuracy")
print(my_binary_gbt_eval.evaluate(gbt_predictions))

# Naive Bayes

In [15]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=0.5, modelType="multinomial")

# train the model
model = nb.fit(train_data)

# select example rows to display.
predictions = model.transform(test_data)
#predictions.show()

# compute accuracy on the test set
#evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
#print("Test set accuracy = " + str(accuracy))

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.4533151068425548


In [16]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# train the model
model = nb.fit(train_data)

# select example rows to display.
predictions = model.transform(test_data)
#predictions.show()

# compute accuracy on the test set
#evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
#print("Test set accuracy = " + str(accuracy))

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.4555729468803198


# Logistic Regression

In [17]:
##Logistics Regression##
from pyspark.ml.classification import LogisticRegression

# Split our data. Note that the new DataFrame is being used.
train_data, test_data = pipe_df.randomSplit([0.8,0.2])
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

# Instantiate the model.
lr_model = LogisticRegression(featuresCol='features',labelCol='label')

# Fit the model.
lr_model = lr_model.fit(train_data)

# And evaluate the model using the test data.
predictions = lr_model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Training Dataset Count: 89687
Test Dataset Count: 22169
Test set accuracy = 0.8098245297487483


In [18]:
##Logistics Regression##
from pyspark.ml.classification import LogisticRegression

# Split our data. Note that the new DataFrame is being used.
train_data, test_data = pipe_df.randomSplit([0.7,0.3])
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

# Instantiate the model.
lr_model = LogisticRegression(featuresCol='features',labelCol='label')

# Fit the model.
lr_model = lr_model.fit(train_data)

# And evaluate the model using the test data.
predictions = lr_model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Training Dataset Count: 78472
Test Dataset Count: 33384
Test set accuracy = 0.810567936736161


# One Vs Rest

In [19]:
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# generate the train/test split.
(train_data, test_data) = pipe_df.randomSplit([0.7, 0.3])

# instantiate the base classifier.
lr = LogisticRegression(maxIter=5, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(train_data)

# score the model on test data.
predictions = ovrModel.transform(test_data)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.7414091914181558


# Parameter tuning

In [20]:
train_data, test_data = pipe_df.randomSplit([0.8,0.2])
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Training Dataset Count: 89438
Test Dataset Count: 22418


# Random Forest

In [21]:
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train_data)
predictions = rfModel.transform(test_data)

In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [23]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.5265411722722813


# GBT & Decision tree

In [None]:
dtc = DecisionTreeClassifier(labelCol='label',featuresCol='features')
gbt = GBTClassifier(labelCol='label',featuresCol='features')
# Train the models (it's two models, so it might take some time).
dtc_model = dtc.fit(train_data)
gbt_model = gbt.fit(train_data)
dtc_predictions = dtc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

# Let's start off with binary classification.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Note that the label column isn't named label, it's named PrivateIndex in this case.
my_binary_eval = BinaryClassificationEvaluator(labelCol = 'label')

# This is the area under the curve. This indicates that the data is highly seperable.
print("DTC Accuracy")
print(my_binary_eval.evaluate(dtc_predictions))

# We can't repeat these exact steps for GBT. If you print the schema of all three, you may be able to notice why.
# Instead, let's redefine the object:
my_binary_gbt_eval = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol='prediction')
print("GBT Accuracy")
print(my_binary_gbt_eval.evaluate(gbt_predictions))

# Naive Bayes

In [24]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=0.5, modelType="multinomial")

# train the model
model = nb.fit(train_data)

# select example rows to display.
predictions = model.transform(test_data)
#predictions.show()

# compute accuracy on the test set
#evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
#print("Test set accuracy = " + str(accuracy))

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.45337181501743506


In [25]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# train the model
model = nb.fit(train_data)

# select example rows to display.
predictions = model.transform(test_data)
#predictions.show()

# compute accuracy on the test set
#evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
#print("Test set accuracy = " + str(accuracy))

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.4553144764716485


# One Vs Rest

In [26]:
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# generate the train/test split.
(train_data, test_data) = pipe_df.randomSplit([0.8, 0.2])

# instantiate the base classifier.
lr = LogisticRegression(maxIter=5, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(train_data)

# score the model on test data.
predictions = ovrModel.transform(test_data)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.7413537881846319


# Goal 1

In [27]:
#indexing
Region_indexer = StringIndexer(inputCol='Region',outputCol='RegionIndex')
Province_indexer = StringIndexer(inputCol='Province',outputCol='ProvinceIndex')
City_indexer = StringIndexer(inputCol='City',outputCol='CityIndex')
Target_indexer = StringIndexer(inputCol='Target',outputCol='TargetIndex')
Nationality_indexer = StringIndexer(inputCol='Nationality',outputCol='NationalityIndex')
Group_indexer = StringIndexer(inputCol='Group',outputCol='GroupIndex')
Weapon_indexer = StringIndexer(inputCol='Weapon',outputCol='WeaponIndex')
Attack_indexer = StringIndexer(inputCol='Attack',outputCol='AttackIndex')

Country_indexer = StringIndexer(inputCol='Country',outputCol='label')

#encoding

Region_encoder = OneHotEncoder(inputCol='RegionIndex',outputCol='RegionVec')
Province_encoder = OneHotEncoder(inputCol='ProvinceIndex',outputCol='ProvinceVec')
City_encoder = OneHotEncoder(inputCol='CityIndex',outputCol='CityVec')
Target_encoder = OneHotEncoder(inputCol='TargetIndex',outputCol='TargetVec')
Nationality_encoder = OneHotEncoder(inputCol='NationalityIndex',outputCol='NationalityVec')
Group_encoder = OneHotEncoder(inputCol='GroupIndex',outputCol='GroupVec')
Weapon_encoder = OneHotEncoder(inputCol='WeaponIndex',outputCol='WeaponVec')
Attack_encoder = OneHotEncoder(inputCol='AttackIndex',outputCol='AttackVec')

label_encoder = OneHotEncoder(inputCol='label',outputCol='label')


In [28]:
# Now we can assemble all of this as one vector in the features column. 
assembler = VectorAssembler(inputCols=['Year',
 'RegionVec',
 'ProvinceVec',
 'CityVec',
 'NationalityVec'],outputCol='features')

In [29]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[Region_indexer,Province_indexer,City_indexer,
                            Nationality_indexer,Country_indexer,Region_encoder,
                            Province_encoder,City_encoder,Nationality_encoder,assembler])

# Pipeline

In [30]:
pipeline_model = pipeline.fit(df)
pipe_df = pipeline_model.transform(df)
pipe_df = pipe_df.select('label','features')
pipe_df.describe()

DataFrame[summary: string, label: string]

# Splitting the Data

In [31]:
train_data, test_data = pipe_df.randomSplit([0.7,0.3])
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Training Dataset Count: 78274
Test Dataset Count: 33582


In [32]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier, LogisticRegression
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler, Normalizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
import pandas as pd
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Random Forest

In [33]:
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train_data)
predictions = rfModel.transform(test_data)

In [34]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [35]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.24656065749508665


# GBT Classifier & Decision Tree

In [None]:
dtc = DecisionTreeClassifier(labelCol='label',featuresCol='features')
gbt = GBTClassifier(labelCol='label',featuresCol='features')
# Train the models (it's two models, so it might take some time).
dtc_model = dtc.fit(train_data)
gbt_model = gbt.fit(train_data)
dtc_predictions = dtc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

# Let's start off with binary classification.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Note that the label column isn't named label, it's named PrivateIndex in this case.
my_binary_eval = BinaryClassificationEvaluator(labelCol = 'label')

# This is the area under the curve. This indicates that the data is highly seperable.
print("DTC Accuracy")
print(my_binary_eval.evaluate(dtc_predictions))

# We can't repeat these exact steps for GBT. If you print the schema of all three, you may be able to notice why.
# Instead, let's redefine the object:
my_binary_gbt_eval = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol='prediction')
print("GBT Accuracy")
print(my_binary_gbt_eval.evaluate(gbt_predictions))

# Naive Bayes

In [36]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=0.5, modelType="multinomial")

# train the model
model = nb.fit(train_data)

# select example rows to display.
predictions = model.transform(test_data)
#predictions.show()

# compute accuracy on the test set
#evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
#print("Test set accuracy = " + str(accuracy))

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.44646834732211993


In [37]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# train the model
model = nb.fit(train_data)

# select example rows to display.
predictions = model.transform(test_data)
#predictions.show()

# compute accuracy on the test set
#evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
#print("Test set accuracy = " + str(accuracy))

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.44658278397486634


# One Vs Rest

In [None]:
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# generate the train/test split.
(train_data, test_data) = pipe_df.randomSplit([0.8, 0.2])

# instantiate the base classifier.
lr = LogisticRegression(maxIter=5, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(train_data)

# score the model on test data.
predictions = ovrModel.transform(test_data)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

# Logistic Regression

In [1]:
##Logistics Regression##
from pyspark.ml.classification import LogisticRegression

# Split our data. Note that the new DataFrame is being used.
train_data, test_data = pipe_df.randomSplit([0.7,0.3])
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))



ImportError: No module named 'pyspark'

In [39]:
# Instantiate the model.
lr_model = LogisticRegression(featuresCol='features',labelCol='label')

In [None]:
# Fit the model.
lr_model = lr_model.fit(train_data)

Traceback (most recent call last):
  File "/usr/lib/python3.5/socketserver.py", line 313, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.5/socketserver.py", line 341, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.5/socketserver.py", line 354, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.5/socketserver.py", line 681, in __init__
    self.handle()
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/accumulators.py", line 235, in handle
    num_updates = read_int(self.rfile)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/serializers.py", line 577, in read_int
    raise EOFError
EOFError
--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  Fil

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.targe

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  Fil

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.targe

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  Fil

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.targe

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  Fil

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.targe

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  Fil

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.targe

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  Fil

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.targe

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  Fil

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.targe

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  Fil

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.targe

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  Fil

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.targe

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  Fil

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.targe

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  Fil

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.targe

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42892)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-80b97830a0f2>", line 2, in <module>
    lr_model = lr_model.fit(train_data)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  Fil

In [None]:
# And evaluate the model using the test data.
predictions = lr_model.transform(test_data)


# Evaluate the model using the binary classifer.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='label')

accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))