In [1]:
import findspark
findspark.init()
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.feature import FeatureHasher, StringIndexer, OneHotEncoder, VectorAssembler

# Build (or reuse) the Spark session used by every cell below. The tuning
# options are listed once and applied to the builder in the order given.
spark_settings = [
    ('spark.executor.memory', '64g'),
    ('spark.driver.memory', '32g'),
    ('spark.driver.maxResultSize', '2g'),
    ('spark.default.parallelism', '300'),
    ('spark.network.timeout', '500'),
]

session_builder = SparkSession.builder.appName("Assignment2_H3").master("local[*]")
for setting_key, setting_value in spark_settings:
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()


# Load the `citizen` table over JDBC into a DataFrame.
# NOTE(review): the JDBC url is an empty string and the user/password are
# hardcoded literals here — confirm the real connection settings before running.
jdbc_options = {
    "url": "",
    "dbtable": "citizen",
    "user": "postgres",
    "password": "postgres",
    "driver": "org.postgresql.Driver",
}
df = spark.read.format("jdbc").options(**jdbc_options).load()

In [2]:
# Preprocessing: convert column types and derive date parts
from pyspark.sql.functions import to_date,year,month,dayofmonth

pattern = 'd/M/y'
# Parse the birth date string with the day/month/year pattern above.
df = df.withColumn('date_of_birth', to_date(df['date_of_birth'], pattern))

# Cast the identifier and the door/entrance number to integral types.
for column_name, target_type in (('national_identifier', 'long'),
                                 ('door_or_entrance_number', 'int')):
    df = df.withColumn(column_name, df[column_name].cast(target_type))

# Add year / month / day columns extracted from the parsed birth date.
for part_name, extract_part in (('year', year), ('month', month), ('day', dayofmonth)):
    df = df.withColumn(part_name, extract_part('date_of_birth'))

df.printSchema()

root
 |-- uid: long (nullable = true)
 |-- national_identifier: long (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- mother_first: string (nullable = true)
 |-- father_first: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birth_city: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- id_registration_city: string (nullable = true)
 |-- id_registration_district: string (nullable = true)
 |-- address_city: string (nullable = true)
 |-- address_district: string (nullable = true)
 |-- address_neighborhood: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- door_or_entrance_number: integer (nullable = true)
 |-- misc: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)



## H3. Surname prediction model
Given all the information about a person (except the surname), predict the most likely surname of that person. Analyze the Top-1 to Top-5 prediction accuracy of this model.

In [3]:
# Work on at most 40,000 rows of `df` for the surname model.
# NOTE(review): no ordering is applied before `limit`, so which rows are kept
# is presumably up to the data source / Spark — confirm if a stable sample matters.
df3p = df.limit(40000)

In [4]:
# Cache the subset: the following cells fit several indexers on `df3p`.
df3p.cache()

DataFrame[uid: bigint, national_identifier: bigint, first: string, last: string, mother_first: string, father_first: string, gender: string, birth_city: string, date_of_birth: date, id_registration_city: string, id_registration_district: string, address_city: string, address_district: string, address_neighborhood: string, street_address: string, door_or_entrance_number: int, misc: string, year: int, month: int, day: int]

In [None]:
# Index the label: encode the surname column (`last`) as the numeric column
# `last_int`, which the classifier below uses as its label.
label = 'last'
labelIdx = (
    StringIndexer()
    .setInputCol(label)
    .setOutputCol('last_int')
    .setHandleInvalid('skip')
)
labelIdxModel = labelIdx.fit(df3p)
df3p = labelIdxModel.transform(df3p)

In [8]:
### df3p.rdd.getNumPartitions()
# 08:14

10

In [None]:
# Feature Extraction

# # Remove label column and invalid columns.
# features = df3p.columns
# features.remove('last_int')
# features.remove('misc')
# features.remove('date_of_birth')

# Only the parents' first names are used as predictors.
features = ['mother_first','father_first']

# Index each categorical feature into `<name>_int`. Dedicated variable names
# are used so the fitted surname indexer (`labelIdx` / `labelIdxModel`) is not
# overwritten by the feature indexers and stays available for decoding labels.
for feature_name in features:
    feature_indexer = StringIndexer(inputCol=feature_name,
                                    outputCol=feature_name + '_int',
                                    handleInvalid='skip')
    df3p = feature_indexer.fit(df3p).transform(df3p)

# One-hot encode every indexed feature into `<name>_vec`.
df3v = df3p
for feature_name in features:
    encoder = OneHotEncoder(inputCol=feature_name + '_int', outputCol=feature_name + '_vec')
    df3v = encoder.fit(df3v).transform(df3v)

In [None]:
# VectorAssembler
# Combine the one-hot encoded parent-name vectors into the single `features`
# column consumed by the classifier. The `_vec` columns are assembled rather
# than the `_int` ones: a StringIndexer index is an arbitrary category id, and
# feeding it directly would make the model treat that id as a magnitude.
vecAssembler = VectorAssembler(outputCol="features")
vecAssembler.setInputCols(['mother_first_vec','father_first_vec'])
df3v = vecAssembler.transform(df3v)

df3v.cache()

In [6]:
######
# Vectorize the feature
# hasher3 = FeatureHasher(numFeatures=3)
# hasher3.setInputCols(features).setOutputCol("features")
# df3v = hasher3.transform(df3p)
# df3v = df3v.select('features','last','last_int')

In [None]:
# Split the assembled frame into train / validation / test parts using the
# weights 0.7 / 0.1 / 0.2; a fixed seed (1234) is passed to randomSplit.
train_dat, valid_dat, test_dat = df3v.randomSplit([0.7, 0.1, 0.2],seed=1234)

In [None]:
# # Feature Selection
# from pyspark.ml.feature import ChiSqSelector
# slctor3 = ChiSqSelector(numTopFeatures = 2,featuresCol='features',outputCol='selectFeatures',labelCol='last_int')
# slctorModel3 = slctor3.fit(train_dat)

# selectIdx = slctorModel3.selectedFeatures
# selectIdx

In [None]:
from pyspark.ml.classification import NaiveBayes

# Multinomial Naive Bayes with smoothing 1.0, trained on the training split
# with `last_int` as the label and the assembled `features` vector as input.
nb3 = NaiveBayes(
    smoothing=1.0,
    modelType="multinomial",
    labelCol='last_int',
    featuresCol="features",
)
model3 = nb3.fit(train_dat)

In [None]:
# Score the validation split, then keep only the true label index and the
# per-class probability vector as an RDD of rows.
valid_scored = model3.transform(valid_dat)
res = valid_scored.select('last_int', 'probability').rdd

In [None]:
# Top1 to Top5 accuracy
def accuracy_per(level, prob, label):
    """Return 1 if the true label misses the top-`level` predictions, else 0.

    Args:
        level: Size of the top-k window being checked.
        prob: Indexable, iterable sequence of per-class probabilities.
        label: Index of the true class within `prob`.

    Returns:
        1 (a miss) when at least `level` entries of `prob` are strictly greater
        than the true class's probability, otherwise 0 (a hit). Entries equal
        to the true class's probability do not count against it.

    Raises:
        IndexError: If `label` is not a valid index into `prob`.
    """
    label_prob = prob[label]
    # Number of classes the model ranks strictly above the true class.
    outranked_by = sum(1 for p in prob if p > label_prob)
    return 1 if outranked_by >= level else 0

# Tally Top-1 .. Top-5 misses in one pass over the scored validation rows.
# `res` is not cached, so running a separate map/reduce job per k would
# re-score the validation split for every level.
top_levels = range(1, 6)

def _tally_row(row):
    """Return (1, miss@1, ..., miss@5) for one (label index, probability) row."""
    label_idx = int(row[0])
    class_probs = row[1].toArray()
    return (1,) + tuple(accuracy_per(k, class_probs, label_idx) for k in top_levels)

# Element-wise sum of the per-row tuples: (row count, misses@1, ..., misses@5).
_totals = res.map(_tally_row).reduce(lambda a, b: tuple(x + y for x, y in zip(a, b)))

# topK[k-1] is the pair (row count, miss count at top-k).
topK = [(_totals[0], _totals[k]) for k in top_levels]

In [None]:
# Show the raw (row count, miss count) pairs, one per k = 1..5.
print(topK)

In [None]:
# Report Top-1 .. Top-5 accuracy. topK[0] holds the Top-1 tally, so the rank
# shown is the list position plus one (previously printed as 0..4).
for rank, (n_rows, n_miss) in enumerate(topK, start=1):
    print('The top', rank, 'accuracy is', 1 - n_miss/n_rows, '\n')

In [None]:
# Parameter Tuning

# from pyspark.ml.tuning import ParamGridBuilder
# layers = [128,64,8,2]
# layers2 = [64,8,4,2]
# paramGrid2 = (ParamGridBuilder()
#             .addGrid(mlp2.layers, [layers,layers2])
#             .build())

# from pyspark.ml.evaluation import BinaryClassificationEvaluator
# evaluator2 = BinaryClassificationEvaluator(labelCol='gender_int',rawPredictionCol='prediction')

# from pyspark.ml.tuning import CrossValidator
# cv2 = CrossValidator(estimator = nb2,
#                     evaluator = evaluator2,
#                     estimatorParamMaps=paramGrid2,
#                     numFolds=3,
#                     parallelism=2,
#                     seed=1234)

# genderModel = cv2.fit(valid_dat2)

# evaluator2.evaluate(genderModel.transform(test_dat2))