In [1]:
import findspark
findspark.init()
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, FeatureHasher

spark = SparkSession.builder\
                    .appName("Assignment2_H2")\
                    .master("local[*]")\
                    .config('spark.executor.memory','32g')\
                    .config('spark.driver.memory','64g')\
                    .config('spark.driver.maxResultSize','2g')\
                    .config('spark.default.parallelism','300')\
                    .config('spark.network.timeout','500')\
                    .getOrCreate()



df = spark.read\
    .format("jdbc")\
    .option("url", "")\
    .option("dbtable", "citizen") \
    .option("user", "postgres")\
    .option("password", "postgres")\
    .option("driver", "org.postgresql.Driver") \
    .load()

In [3]:
# Select a subset of the data
df = df.limit(40000)

In [4]:
# Preprocession: Convert the type of columns
from pyspark.sql.functions import to_date,year,month,dayofmonth
pattern = 'd/M/y'
df = df.withColumn('date_of_birth',to_date(df['date_of_birth'],pattern))
df = df.withColumn('national_identifier',df.national_identifier.cast('long'))
df = df.withColumn('door_or_entrance_number',df.door_or_entrance_number.cast('int'))
df = df.withColumn('year', year('date_of_birth'))
df = df.withColumn('month', month('date_of_birth'))
df = df.withColumn('day', dayofmonth('date_of_birth'))
df.printSchema()

root
 |-- uid: long (nullable = true)
 |-- national_identifier: long (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- mother_first: string (nullable = true)
 |-- father_first: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birth_city: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- id_registration_city: string (nullable = true)
 |-- id_registration_district: string (nullable = true)
 |-- address_city: string (nullable = true)
 |-- address_district: string (nullable = true)
 |-- address_neighborhood: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- door_or_entrance_number: integer (nullable = true)
 |-- misc: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)



## H2. Gender prediction model

In [9]:
# Index the label
label = 'gender'
genderIdx = StringIndexer(inputCol=label, outputCol='gender_int', handleInvalid='skip')
genderIdxModel = genderIdx.fit(df)
df2 = genderIdxModel.transform(df) 

features2 = 'first'

strIdx = StringIndexer(inputCol=features2, outputCol=features2 + '_int', handleInvalid='skip')
tmpModel = strIdx.fit(df2)
df2 = tmpModel.transform(df2) 

encoder2 = OneHotEncoder(inputCol=features2 + '_int',outputCol=features2 + '_vec')
df2v = encoder2.fit(df2).transform(df2)

In [12]:
df2v = df2v.select('first_vec','gender_int','gender')

In [14]:
# Split the dataset
train_dat2, valid_dat2, test_dat2 = df2v.randomSplit([0.7, 0.1, 0.2],seed=1234)

In [16]:
from pyspark.ml.classification import NaiveBayes
nb2 = NaiveBayes(smoothing=1.0, modelType="bernoulli")
nb2.setLabelCol('gender_int').setFeaturesCol("first_vec")
model2 = nb2.fit(train_dat2)

In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator2 = BinaryClassificationEvaluator(labelCol='gender_int',rawPredictionCol='prediction')
evaluator2.evaluate(model2.transform(valid_dat2))

0.9710597080287592

In [18]:
# Parameter Tunning
from pyspark.ml.tuning import ParamGridBuilder
paramGrid2 = (ParamGridBuilder()
            .addGrid(nb2.smoothing, [0.01,2.0])
            .build())

from pyspark.ml.tuning import CrossValidator
cv2 = CrossValidator(estimator = nb2,
                    evaluator = evaluator2,
                    estimatorParamMaps=paramGrid2,
                    numFolds=3,
                    parallelism=200,
                    seed=1234)

genderModel = cv2.fit(valid_dat2)

evaluator2.evaluate(genderModel.transform(test_dat2))

0.9283440130189823

In [23]:
evaluator2.evaluate(model2.transform(test_dat2))

0.9662645643913371