In [1]:
# Initializing a Spark session
from pyspark.sql import SparkSession
import time
spark = SparkSession.builder.master("local").appName("income").config("spark.some.config.option","some-value").getOrCreate()

In [2]:
start_time = time.time()
raw_data = spark.read.csv('s3://516ml/adult-training.csv',
                    header='true', inferSchema='true')
print("--- %s seconds ---" % (time.time() - start_time))

--- 16.2218940258 seconds ---


In [3]:
import numpy as np
from pyspark.sql.functions import when
raw_data=raw_data.withColumn("Income",when(raw_data.Income=='null',np.nan).otherwise(raw_data.Income))
raw_data=raw_data.withColumn("Income",when(raw_data.Income==' <=50K',0.0).otherwise(1.0))

raw_data=raw_data.withColumn("Sex",when(raw_data.Sex=='null',np.nan).otherwise(raw_data.Sex))
raw_data=raw_data.withColumn("Sex",when(raw_data.Sex=='Female',1.0).otherwise(0.0))
raw_data=raw_data.withColumn("Age",raw_data.Age.cast('float'))

In [4]:
from pyspark.ml.feature import Imputer
imputer=Imputer(inputCols=["Age","fnlgwt","Education num", "Sex", "Hours/Week", "Income"],outputCols=["Age","fnlgwt","Education num", "Sex", "Hours/Week", "Income"])
model=imputer.fit(raw_data)
raw_data=model.transform(raw_data)

In [5]:
cols = ["Age","fnlgwt","Education num", "Sex", "Hours/Week", "Income"]
cols.remove("Income")
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols,outputCol="features")
raw_data=assembler.transform(raw_data)

# Standard Sclarizer 

So we have created a feature vector. Now let us use StandardScaler to scalerize the newly created "feature" column 

In [6]:
from pyspark.ml.feature import StandardScaler
standardscaler=StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
raw_data=standardscaler.fit(raw_data).transform(raw_data)

# Train, test split

Now that the preprocessing of the data is complete. Let us split the dataset in training and testing set. 

In [7]:
train, test = raw_data.randomSplit([0.8, 0.2], seed=12345)

Let us check whether their is imbalance in the dataset

In [8]:
dataset_size=float(train.select("Income").count())
numPositives=train.select("Income").where('Income == 1').count()
per_ones=(float(numPositives)/float(dataset_size))*100
numNegatives=float(dataset_size-numPositives)

In [9]:
BalancingRatio= numNegatives/dataset_size

In [10]:
train=train.withColumn("classWeights", when(train.Income == 1,BalancingRatio).otherwise(1-BalancingRatio))

In [11]:
from pyspark.ml.classification import LogisticRegression
start_time = time.time()
lr = LogisticRegression(labelCol="Income", featuresCol="Scaled_features",weightCol="classWeights",maxIter=10)
model = lr.fit(train)    
print("--- %s seconds ---" % (time.time() - start_time))
predict_train=model.transform(train)
predict_test=model.transform(test)

--- 7.94853496552 seconds ---


In [12]:
# The BinaryClassificationEvaluator uses areaUnderROC as the default metric. As o fnow we will continue with the same
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator=BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",labelCol="Income")

In [13]:
predict_test.select("Income","rawPrediction","prediction","probability").show(5)

+------+--------------------+----------+--------------------+
|Income|       rawPrediction|prediction|         probability|
+------+--------------------+----------+--------------------+
|   0.0|[3.25412979493759...|       0.0|[0.96282122779656...|
|   0.0|[3.07498513336678...|       0.0|[0.95584903204928...|
|   0.0|[3.75138778546630...|       0.0|[0.97705376443933...|
|   0.0|[3.04977377641950...|       0.0|[0.95477275879431...|
|   0.0|[3.83494964851657...|       0.0|[0.97885437073768...|
+------+--------------------+----------+--------------------+
only showing top 5 rows



In [14]:
print("The area under ROC for train set is {}".format(evaluator.evaluate(predict_train)))
print("The area under ROC for test set is {}".format(evaluator.evaluate(predict_test)))

The area under ROC for train set is 0.802233893908
The area under ROC for test set is 0.790478006968


In [15]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = ParamGridBuilder()\
    .addGrid(lr.aggregationDepth,[2,5,10])\
    .addGrid(lr.elasticNetParam,[0.0, 0.5, 1.0])\
    .addGrid(lr.fitIntercept,[False, True])\
    .addGrid(lr.maxIter,[10, 100, 1000])\
    .addGrid(lr.regParam,[0.01, 0.5, 2.0]) \
    .build()

# https://spark.apache.org/docs/2.1.0/ml-tuning.html

# K-fold cross validation

In [16]:
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
start_time = time.time()
# Run cross validations
cvModel = cv.fit(train)
# this will likely take a fair amount of time because of the amount of models that we're creating and testing
print("--- %s seconds ---" % (time.time() - start_time))

# predict_train=cvModel.transform(train)
# predict_test=cvModel.transform(test)

# print("The area under ROC for train set after CV  is {}".format(evaluator.evaluate(predict_train)))
# print("The area under ROC for test set after CV  is {}".format(evaluator.evaluate(predict_test)))

--- 1388.87217283 seconds ---


In [17]:
print((raw_data.count(), len(raw_data.columns)))

(32561, 17)
