In [1]:
import os

import findspark
findspark.init()
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Create (or reuse) the Spark session for this notebook.
# The builder chain is wrapped in parentheses instead of using backslash
# continuations: a trailing backslash followed by a blank line ends the
# statement, which left `.getOrCreate()` as an orphan indented line.
# NOTE(review): 2000 executor cores looks unintended with master("local") - confirm.
spark = (
    SparkSession.builder
    .appName("Assignment2_H4")
    .master("local")
    .config('spark.executor.memory', '32g')
    .config('spark.driver.memory', '32g')
    .config('spark.driver.maxResultSize', '2g')
    .config('spark.executor.cores', '2000')
    .getOrCreate()
)


# Load the `citizen` table from PostgreSQL through the JDBC data source.
# Connection settings are read from environment variables so that real
# credentials do not have to be committed with the notebook; the defaults are
# the values that were previously hard-coded, so behaviour is unchanged when
# the variables are not set.
jdbc_url = os.environ.get("MERNIS_DB_URL", "jdbc:postgresql://10.219.156.123:5432/mernis")
jdbc_user = os.environ.get("MERNIS_DB_USER", "postgres")
jdbc_password = os.environ.get("MERNIS_DB_PASSWORD", "postgres")

df = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "citizen")
    .option("user", jdbc_user)
    .option("password", jdbc_password)
    .option("driver", "org.postgresql.Driver")
    .load()
)

In [2]:
# Preprocessing: convert columns to the types used later on
from pyspark.sql.functions import to_date

# Format used to parse the date_of_birth column
pattern = 'd/M/y'
df = df.withColumn('date_of_birth', to_date(df['date_of_birth'], pattern))

# Numeric casts, applied in the same order as before
for column_name, spark_type in (('national_identifier', 'long'),
                                ('door_or_entrance_number', 'int')):
    df = df.withColumn(column_name, df[column_name].cast(spark_type))

df.printSchema()

root
 |-- uid: long (nullable = true)
 |-- national_identifier: long (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- mother_first: string (nullable = true)
 |-- father_first: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birth_city: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- id_registration_city: string (nullable = true)
 |-- id_registration_district: string (nullable = true)
 |-- address_city: string (nullable = true)
 |-- address_district: string (nullable = true)
 |-- address_neighborhood: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- door_or_entrance_number: integer (nullable = true)
 |-- misc: string (nullable = true)



## H4. Population Prediction Model
Calculate the number of births in each year and predict the number of newborns in the following year.

In [4]:
# Number of births per calendar year, most recent year first
from pyspark.sql.functions import year

births_by_year = (
    df.select(year('date_of_birth').alias('year'))
      .groupBy('year')
      .count()
)
# Drop the group whose year is null, then sort descending by year
df4 = (
    births_by_year
    .filter(births_by_year['year'].isNotNull())
    .orderBy('year', ascending=False)
)
# First row of the descending sort, i.e. the latest birth year in the data
maxyear = df4.first()

In [5]:
# Year to forecast: the one right after the latest year found in the data
latest_year = maxyear['year']
target = latest_year + 1

In [6]:
# Visualize the number of births per year
from pyecharts import options as opts
from pyecharts.charts import Line

# Collect the aggregated rows once and split them locally. Calling
# df4.rdd.keys().collect() and df4.rdd.values().collect() separately runs the
# whole JDBC scan + aggregation twice and pairs x/y values from two
# independent jobs.
year_count_rows = df4.collect()
xline = [row['year'] for row in year_count_rows]    # first column of df4
yline = [row['count'] for row in year_count_rows]   # second column of df4

img = (
    Line()
    .add_xaxis(xline)
    .add_yaxis("Population", yline)
    .set_global_opts(title_opts=opts.TitleOpts(title="New-born population each year"),
                     xaxis_opts=opts.AxisOpts(type_='value', is_scale=True))
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
)
img.render_notebook()

In [7]:
###xline

In [8]:
# Plot only the truncated range of the series
from pyecharts import options as opts
from pyecharts.charts import Line

# xline/yline are ordered newest-first: skip the first entry and the last 25
valid_range = slice(1, -25)
chart_title = opts.TitleOpts(title="New-born population each year")
year_axis = opts.AxisOpts(type_='value', is_scale=True)
hidden_labels = opts.LabelOpts(is_show=False)

img = (
    Line()
    .add_xaxis(xline[valid_range])
    .add_yaxis("Population", yline[valid_range])
    .set_global_opts(title_opts=chart_title, xaxis_opts=year_axis)
    .set_series_opts(label_opts=hidden_labels)
)
img.render_notebook()

In [10]:
####df4.count()

In [11]:
# Vectorization
from pyspark.ml.feature import VectorAssembler

# Keep only the years after 1900 as modelling data
df4p = df4.filter(df4['year'] > 1900)

# Pack the single 'year' column into the 'features' vector; invalid rows are skipped
vecAssembler4 = VectorAssembler(
    inputCols=['year'],
    outputCol='features',
    handleInvalid='skip',
)

In [12]:
from pyspark.ml.regression import GeneralizedLinearRegression

# Gaussian GLM: 'features' -> 'count', predictions written to column 'p'
glr = GeneralizedLinearRegression(
    family="gaussian",
    featuresCol='features',
    labelCol='count',
    predictionCol="p",
)

In [13]:
# Evaluation: RMSE between the 'count' label and the 'p' prediction column
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol='count',
    predictionCol='p',
)

In [14]:
# Parameter tuning: grid over regularisation, link function and iteration cap
from pyspark.ml.tuning import ParamGridBuilder

grid_builder = ParamGridBuilder()
grid_builder.addGrid(glr.regParam, [0.01,0.1])
grid_builder.addGrid(glr.link, ["identity","log"])
grid_builder.addGrid(glr.maxIter, [5, 10])
paramGrid = grid_builder.build()

In [15]:
from pyspark.ml.tuning import CrossValidator

# 3-fold cross-validation of `glr` over `paramGrid`, scored by `evaluator`
cv = CrossValidator(
    estimator=glr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=1234,
)

In [16]:
# Build the pipeline: feature assembly first, then the cross-validated GLM
from pyspark.ml import Pipeline

pipeline_stages = [vecAssembler4, cv]
pipline = Pipeline(stages=pipeline_stages)

In [17]:
# Fit the whole pipeline (assembler + cross-validated GLM) on the filtered data
populationModel = pipline.fit(dataset=df4p)

In [114]:
####populationModel.stages

[VectorAssembler_92cea0826bb1, CrossValidatorModel_c8306ff3e575]

In [123]:
# Inspect the parameters of the best model found by cross-validation
# (stage 1 of the fitted pipeline is the CrossValidatorModel)
best_glr_model = populationModel.stages[1].bestModel
best_glr_model.extractParamMap()

{Param(parent='GeneralizedLinearRegression_5a72bcd6da72', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,
 Param(parent='GeneralizedLinearRegression_5a72bcd6da72', name='family', doc='The name of family which is a description of the error distribution to be used in the model. Supported options: gaussian (default), binomial, poisson, gamma and tweedie.'): 'gaussian',
 Param(parent='GeneralizedLinearRegression_5a72bcd6da72', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='GeneralizedLinearRegression_5a72bcd6da72', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='GeneralizedLinearRegression_5a72bcd6da72', name='labelCol', doc='label column name.'): 'count',
 Param(parent='GeneralizedLinearRegression_5a72bcd6da72', name='maxIter', doc='max number of iterations (>= 0).'): 5,
 Param(parent='GeneralizedLinearRegression_5a72bcd6da72', name='predictionCol', doc='prediction column name.'): 'p',
 Param(p

In [None]:
# # Test the model
# img = (
#     Line()
#     .add_xaxis(pxline)
#     .add_yaxis("Population",pline)  
#     .set_global_opts(title_opts=opts.TitleOpts(title="Predicted new-born population each year"),
#                      xaxis_opts=opts.AxisOpts(type_='value', is_scale=True))
#     .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
# )
# img.render_notebook()

In [62]:
# One-row frame for the year to forecast, reusing the training schema;
# the 0 only fills the 'count' column that the schema requires
target_rows = [(target, 0)]
targetDf = spark.createDataFrame(target_rows, schema=df4p.schema)
res = populationModel.transform(targetDf)

In [64]:
# Print the forecast row (prediction is in column 'p')
res.show(n=20, truncate=True)

+----+-----+--------+------------------+
|year|count|features|                 p|
+----+-----+--------+------------------+
|1992|    0|[1992.0]|1327131.3438684456|
+----+-----+--------+------------------+

