In [1]:
# Initializing a Spark session
from pyspark.sql import SparkSession
import time
spark = SparkSession.builder.master("local").appName("poker").config("spark.some.config.option","some-value").getOrCreate()

In [2]:
# create a Spark dataframe named 'raw_data'
start_time = time.time()
raw_data = spark.read.csv('s3://516ml/poker_hand.csv',
                    header='true', inferSchema='true')
print("--- %s seconds ---" % (time.time() - start_time))


--- 11.7803490162 seconds ---


In [3]:
raw_data.show(5)

+---+---+---+---+---+---+---+---+---+---+-----+
| s1| c1| s2| c2| s3| c3| s4| c4| s5| c5|Class|
+---+---+---+---+---+---+---+---+---+---+-----+
|  4|  7|  3|  5|  3|  3|  1| 13|  4|  8|    0|
|  2|  8|  4|  9|  4|  6|  4|  1|  3|  7|    0|
|  3|  6|  1|  3|  2| 11|  3|  9|  2|  3|    1|
|  2| 10|  2|  5|  4| 13|  3|  9|  1|  6|    0|
|  3|  2|  1|  3|  4|  7|  3|  5|  1| 11|    0|
+---+---+---+---+---+---+---+---+---+---+-----+
only showing top 5 rows



In [4]:
import numpy as np
from pyspark.sql.functions import when
raw_data=raw_data.withColumn("Class",when(raw_data.Class == 0,0.0).otherwise(1.0))
cols = ["s1","c1","s2", "c2", "s3", "c3", "s4", "c4", "s5", "c5", "Class"]

from pyspark.sql import functions as F

for col in raw_data.columns:
     raw_data= raw_data.withColumn(col,F.col(col).cast("float"))

In [5]:
from pyspark.ml.feature import Imputer
cols = ["s1","c1","s2", "c2", "s3", "c3", "s4", "c4", "s5", "c5", "Class"]

imputer=Imputer(inputCols=cols,outputCols=cols)
model=imputer.fit(raw_data)
raw_data=model.transform(raw_data)

In [6]:
cols.remove("Class")
# import the vector assembler
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols,outputCol="features")
# use the transform method to transform our dataset
raw_data=assembler.transform(raw_data)

In [7]:
from pyspark.ml.feature import StandardScaler
standardscaler=StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
raw_data=standardscaler.fit(raw_data).transform(raw_data)

In [8]:
train, test = raw_data.randomSplit([0.8, 0.2], seed=12345)

In [9]:
dataset_size=float(train.select("Class").count())
numPositives=train.select("Class").where('Class == 1').count()
per_ones=(float(numPositives)/float(dataset_size))*100
numNegatives=float(dataset_size-numPositives)
print('The number of ones are {}'.format(numPositives))
print('Percentage of ones are {}'.format(per_ones))

The number of ones are 319351
Percentage of ones are 49.9152841731


In [10]:
BalancingRatio= numNegatives/dataset_size
print('BalancingRatio = {}'.format(BalancingRatio))

BalancingRatio = 0.500847158269


In [11]:
from pyspark.sql.functions import when
train=train.withColumn("classWeights", when(train.Class == 1,BalancingRatio).otherwise(1-BalancingRatio))

In [12]:
from pyspark.ml.classification import LogisticRegression
start_time = time.time()
lr = LogisticRegression(labelCol="Class", featuresCol="Scaled_features",weightCol="classWeights",maxIter=10)
model = lr.fit(train)    

print("--- %s seconds ---" % (time.time() - start_time))
predict_train=model.transform(train)
predict_test=model.transform(test)

--- 7.77806210518 seconds ---


In [13]:
# The BinaryClassificationEvaluator uses areaUnderROC as the default metric. As of now we will continue with the same
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator=BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",labelCol="Class")

In [14]:
predict_test.select("Class","rawPrediction","prediction","probability").show(5)

+-----+--------------------+----------+--------------------+
|Class|       rawPrediction|prediction|         probability|
+-----+--------------------+----------+--------------------+
|  0.0|[-0.0187502542082...|       1.0|[0.49531257377779...|
|  0.0|[-0.0059606745892...|       1.0|[0.49850983576476...|
|  0.0|[-0.0200943090666...|       1.0|[0.49497659176204...|
|  0.0|[-0.0133344781022...|       1.0|[0.49666642986900...|
|  0.0|[-0.0125512778033...|       1.0|[0.49686222174143...|
+-----+--------------------+----------+--------------------+
only showing top 5 rows



In [15]:
print("The area under ROC for train set is {}".format(evaluator.evaluate(predict_train)))
print("The area under ROC for test set is {}".format(evaluator.evaluate(predict_test)))

The area under ROC for train set is 0.502700193809
The area under ROC for test set is 0.498407331124


In [16]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = ParamGridBuilder()\
    .addGrid(lr.aggregationDepth,[2,5,10])\
    .addGrid(lr.elasticNetParam,[0.0, 0.5, 1.0])\
    .addGrid(lr.fitIntercept,[False, True])\
    .addGrid(lr.maxIter,[10, 100, 1000])\
    .addGrid(lr.regParam,[0.01, 0.5, 2.0]) \
    .build()

# https://spark.apache.org/docs/2.1.0/ml-tuning.html

# K-fold cross validation

In [17]:
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
start_time = time.time()
# Run cross validations
cvModel = cv.fit(train)
# this will likely take a fair amount of time because of the amount of models that we're creating and testing

print("--- %s seconds ---" % (time.time() - start_time))
# predict_train=cvModel.transform(train)
# predict_test=cvModel.transform(test)
# print("The area under ROC for train set after CV  is {}".format(evaluator.evaluate(predict_train)))
# print("The area under ROC for test set after CV  is {}".format(evaluator.evaluate(predict_test)))

--- 940.947670937 seconds ---


In [18]:
print((raw_data.count(), len(raw_data.columns)))

(800000, 13)
