In [1]:
import findspark
findspark.init()
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, FeatureHasher

# Build (or reuse) the Spark session shared by every cell below.
# It runs in local mode with explicit memory / result-size / parallelism settings.
spark = (
    SparkSession.builder
    .appName("Assignment2")
    .master("local")
    .config('spark.executor.memory', '32g')
    .config('spark.driver.memory', '32g')
    .config('spark.driver.maxResultSize', '2g')
    .config('spark.default.parallelism', '2000')
    .getOrCreate()
)



# Load the `citizen` table from PostgreSQL through the JDBC data source.
# NOTE(review): the JDBC URL is empty and the user/password are hard-coded in
# the notebook; consider supplying them from the environment instead.
df = (
    spark.read
    .format("jdbc")
    .option("url", "")
    .option("dbtable", "citizen")
    .option("user", "postgres")
    .option("password", "postgres")
    .option("driver", "org.postgresql.Driver")
    .load()
)

In [3]:
# Preprocessing: convert the types of columns
from pyspark.sql.functions import to_date,year,month,dayofmonth
# Parse the birth date string (day/month/year) into a real date column.
# NOTE(review): this cell overwrites `df` in place, so it is presumably not
# safe to re-run without reloading `df` first — confirm.
pattern = 'd/M/y'
df = df.withColumn('date_of_birth', to_date(df['date_of_birth'], pattern))

# Cast the numeric-looking columns to numeric types.
for column_name, target_type in (('national_identifier', 'long'),
                                 ('door_or_entrance_number', 'int')):
    df = df.withColumn(column_name, df[column_name].cast(target_type))

# Derive separate year / month / day columns from the parsed birth date.
for part_name, extract_part in (('year', year),
                                ('month', month),
                                ('day', dayofmonth)):
    df = df.withColumn(part_name, extract_part('date_of_birth'))

df.printSchema()

root
 |-- uid: long (nullable = true)
 |-- national_identifier: long (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- mother_first: string (nullable = true)
 |-- father_first: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birth_city: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- id_registration_city: string (nullable = true)
 |-- id_registration_district: string (nullable = true)
 |-- address_city: string (nullable = true)
 |-- address_district: string (nullable = true)
 |-- address_neighborhood: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- door_or_entrance_number: integer (nullable = true)
 |-- misc: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)



In [6]:
#df.persist()

DataFrame[uid: bigint, national_identifier: bigint, first: string, last: string, mother_first: string, father_first: string, gender: string, birth_city: string, date_of_birth: date, id_registration_city: string, id_registration_district: string, address_city: string, address_district: string, address_neighborhood: string, street_address: string, door_or_entrance_number: int, misc: string]


## N6. Calculate the population density of the top 10 most populous cities, using square kilometers for the area

In [5]:
# Count residents per address city and keep the ten most populous cities.
city_counts = df.groupBy('address_city').count()
top10 = city_counts.sort("count", ascending=False).limit(10)
top10.rdd.keys().collect()

['ISTANBUL',
 'ANKARA',
 'IZMIR',
 'BURSA',
 'AYDIN',
 'ADANA',
 'KONYA',
 'ANTALYA',
 'MERSIN',
 'KOCAELI']

In [6]:
# Areas looked up on DB-city.com (presumably km², as the task asks for square kilometers):
#   ISTANBUL  city/province 5343
#   ANKARA    city 2516      province 17501
#   IZMIR     city 6412.92   province 12473
#   BURSA     city 2259.35   province 10422
#   AYDIN     city 627       province 8187
#   ADANA     city 3708.19   province 13834
#   KONYA     city 6617      province 39390
#   ANTALYA   city 2099      province 22111
#   MERSIN    city 3038      province 17661
#   KOCAELI   city/province 3676.20

# Area lookup keyed by city name; every entry uses the province figure listed above.
city_area = dict(
    ISTANBUL=5343,
    ANKARA=17501,
    IZMIR=12473,
    BURSA=10422,
    AYDIN=8187,
    ADANA=13834,
    KONYA=39390,
    ANTALYA=22111,
    MERSIN=17661,
    KOCAELI=3676.20,
)

In [7]:
# Compute the density of the population
def density(kv):
    """Turn a (city, population count) pair into (city, population per unit of area).

    The area is looked up in the module-level `city_area` mapping, so a city
    missing from that mapping raises KeyError.
    """
    city = kv[0]
    population = kv[1]
    return (city, population / city_area[city])
    
# Keep only the (address_city, count) pairs of the top-10 frame and switch to the RDD API.
ans = top10.select('address_city','count').rdd
# Divide each city's count by its area and bring the ten results back to the driver.
ans.map(density).collect()

[('ISTANBUL', 1652.5517499532098),
 ('ANKARA', 176.07239586309353),
 ('IZMIR', 223.79531788663513),
 ('BURSA', 171.22155056611015),
 ('AYDIN', 172.42335409796019),
 ('ADANA', 100.51301142113633),
 ('KONYA', 33.82178217821782),
 ('ANTALYA', 58.327845868572204),
 ('MERSIN', 62.22541192457958),
 ('KOCAELI', 277.0678961971601)]

## N7. The proportion of cross-administrative mobility and cross-urban mobility in Turkey to the total population according to the place of origin and residence of the population, respectively

In [8]:
# Cross-city mobility: a person whose id_registration_city and address_city are different
# Cross-district mobility: a person whose id_registration_district and address_district are different

In [4]:
# Only the registration and residence city/district columns are needed for the mobility ratios.
mobility_columns = [
    'id_registration_city',
    'id_registration_district',
    'address_city',
    'address_district',
]
df7 = df.select(*mobility_columns)

In [5]:
def crossMan(x):
    """Flag one record as cross-city and/or cross-district mobile.

    Args:
        x: indexable row laid out as (id_registration_city,
           id_registration_district, address_city, address_district).

    Returns:
        A ``(key, (valid, crossCity, crossDist))`` pair. ``key`` is the same
        constant for every row, so a later ``reduceByKey`` folds all rows into
        one total. ``valid`` is 0 when any of the four fields is None (the row
        then contributes nothing to either count) and 1 otherwise.
        ``crossCity`` / ``crossDist`` are 1 when the registration and address
        values differ, else 0.
    """
    # The key used to be the bare name `_`, which is only defined as IPython's
    # "last output" variable: it is a NameError in plain Python and depends on
    # whatever cell happened to run last in a notebook. Use a fixed key instead.
    key = 'ALL'

    # Rows with any missing field are excluded from numerator and denominator.
    if any(x[i] is None for i in range(4)):
        return key, (0, 0, 0)

    # Compare id_registration_city with address_city.
    crossCity = 1 if x[0] != x[2] else 0
    # Compare id_registration_district with address_district.
    # NOTE(review): district names are compared without their city; if the same
    # district name occurs in several cities this undercounts — confirm.
    crossDist = 1 if x[1] != x[3] else 0
    return key, (1, crossCity, crossDist)

In [6]:
def _sum_counts(left, right):
    """Add two (valid, crossCity, crossDist) count triples position by position."""
    return (left[0] + right[0], left[1] + right[1], left[2] + right[2])


def _to_ratios(totals):
    """Convert summed counts into (cross-city share, cross-district share) of valid rows."""
    return (totals[1] / totals[0], totals[2] / totals[0])


# Flag every row, sum the flags per key, then turn the totals into proportions.
ans = (
    df7.rdd
    .map(crossMan)
    .reduceByKey(_sum_counts)
    .mapValues(_to_ratios)
)

In [8]:
# Drop the grouping key and fetch the ratio pairs to the driver.
ans.values().collect()

[(0.36136473347451104, 0.523862199546482)]

## H1. Prediction model for the living city
Given all the information about a person (except the city), predict the city where the person lives. Analyze the Top-1 to Top-5 prediction accuracy of this model.

In [4]:
# Show the available columns again before choosing features for the model.
df.printSchema()

root
 |-- uid: long (nullable = true)
 |-- national_identifier: long (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- mother_first: string (nullable = true)
 |-- father_first: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birth_city: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- id_registration_city: string (nullable = true)
 |-- id_registration_district: string (nullable = true)
 |-- address_city: string (nullable = true)
 |-- address_district: string (nullable = true)
 |-- address_neighborhood: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- door_or_entrance_number: integer (nullable = true)
 |-- misc: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)



In [8]:
# Feature extraction: pick the categorical inputs and the label, then replace
# each feature string with a numeric index column named `<feature>_int`.

features = ['address_district', 'id_registration_city', 'id_registration_district']
label = 'address_city'

df1 = df.select(*features, label)

for feature_name in features:
    indexer = StringIndexer(
        inputCol=feature_name,
        outputCol=feature_name + '_int',
        handleInvalid='skip',
    )
    # Fit on the current frame and append the index column to it.
    df1 = indexer.fit(df1).transform(df1)

In [10]:
# Index the label column as well, so the classifier receives numeric class ids.
cityIdx = StringIndexer(
    inputCol=label,
    outputCol='address_city_int',
    handleInvalid='skip',
)
cityIdxModel = cityIdx.fit(df1)
df1int = cityIdxModel.transform(df1)

In [12]:
# One-hot encode every indexed feature column into a `<feature>_vec` column.
df1vec = df1int
for feature_name in features:
    one_hot = OneHotEncoder(
        inputCol=feature_name + '_int',
        outputCol=feature_name + '_vec',
    )
    one_hot_model = one_hot.fit(df1vec)
    df1vec = one_hot_model.transform(df1vec)

In [14]:
# Concatenate the three one-hot vectors into a single "features" column.
vecAssembler = VectorAssembler(
    inputCols=[
        "address_district_vec",
        "id_registration_city_vec",
        "id_registration_district_vec",
    ],
    outputCol="features",
)
df1v = vecAssembler.transform(df1vec)

# Mark the assembled frame for caching; it is reused by the split below.
df1v.cache()

DataFrame[address_district: string, id_registration_city: string, id_registration_district: string, address_city: string, address_district_int: double, id_registration_city_int: double, id_registration_district_int: double, address_city_int: double, address_district_vec: vector, id_registration_city_vec: vector, id_registration_district_vec: vector, features: vector]

In [15]:
# Split the dataset into train / validation / test parts (70% / 10% / 20%),
# with a fixed seed so the split is repeatable.
split_weights = [0.7, 0.1, 0.2]
train_dat, valid_dat, test_dat = df1v.randomSplit(split_weights, seed=1234)

In [16]:
from pyspark.ml.classification import NaiveBayes
# Multinomial Naive Bayes over the assembled one-hot features, predicting the indexed city.
nb = NaiveBayes(
    smoothing=1.0,
    modelType="multinomial",
    labelCol='address_city_int',
    featuresCol="features",
)
model = nb.fit(train_dat)

In [17]:
# Score the validation split with the fitted model.
result = model.transform(valid_dat)
# Inspect the per-class probability vector of the first scored row.
result.head().probability

DenseVector([0.0001, 0.0001, 0.0, 0.0, 0.0, 0.9909, 0.0, 0.0003, 0.002, 0.0, 0.0, 0.0001, 0.0003, 0.0, 0.0, 0.0, 0.0001, 0.0, 0.0, 0.0, 0.0001, 0.0, 0.0001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0006, 0.0, 0.0, 0.0, 0.0045, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0002, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [18]:
# Keep only the true label index, the probability vector and the prediction, as an RDD of rows.
accurRdd = result.select('address_city_int','probability','prediction').rdd

In [25]:
# Peek at one (address_city_int, probability, prediction) row.
accurRdd.first()

Row(address_city_int=46.0, probability=DenseVector([0.0001, 0.0001, 0.0, 0.0, 0.0, 0.9909, 0.0, 0.0003, 0.002, 0.0, 0.0, 0.0001, 0.0003, 0.0, 0.0, 0.0, 0.0001, 0.0, 0.0, 0.0, 0.0001, 0.0, 0.0001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0006, 0.0, 0.0, 0.0, 0.0045, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0002, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), prediction=5.0)

In [22]:
# Top1 to Top5 accuracy
def accuracy_per(level, prob, label):
    """Return 1 if the true class is NOT among the top-`level` predictions, else 0.

    Args:
        level: how many top-ranked classes count as a hit (1 for Top-1, ...).
        prob: indexable, iterable sequence of per-class probabilities.
        label: index of the true class within `prob`.

    Returns:
        1 (miss) once at least `level` classes have a strictly higher
        probability than the true class, otherwise 0 (hit). Ties with the true
        class do not count against it.
    """
    true_prob = prob[label]
    outranking = 0
    for candidate in prob:
        if candidate > true_prob:
            outranking += 1
        # Stop scanning as soon as enough classes outrank the true one.
        if outranking >= level:
            return 1
    return 0

# Count Top-1 .. Top-5 misses in ONE pass over the validation predictions.
# The previous loop launched a separate Spark job per level, re-evaluating the
# `accurRdd` lineage (not cached in the visible code) five times over.
topk_levels = range(1, 6)


def _row_miss_flags(row):
    """Map an (address_city_int, probability, prediction) row to (1, miss@1, ..., miss@5)."""
    probs = row[1].toArray()
    true_idx = int(row[0])
    return (1,) + tuple(accuracy_per(k, probs, true_idx) for k in topk_levels)


def _add_elementwise(left, right):
    """Sum two equal-length count tuples position by position."""
    return tuple(a + b for a, b in zip(left, right))


_totals = accurRdd.map(_row_miss_flags).reduce(_add_elementwise)

# Same shape as before: one (rows evaluated, rows missed) pair per level, Top-1 first.
topK = [(_totals[0], _totals[k]) for k in topk_levels]

In [23]:
# Each entry is (rows evaluated, rows counted as a miss), for Top-1 through Top-5 in order.
topK

[(4960172, 200685),
 (4960172, 5313),
 (4960172, 1405),
 (4960172, 226),
 (4960172, 88)]

In [28]:
# Report Top-1 .. Top-5 accuracy. topK[k-1] holds (total rows, misses) for
# level k, so the printed label starts at 1 rather than at list index 0.
for k, (total, misses) in enumerate(topK, start=1):
    print('The top', k, 'accuracy is', 1 - misses/total, '\n')

The top 0 accuracy is 0.9595407175396337 

The top 1 accuracy is 0.9989288677892622 

The top 2 accuracy is 0.9997167436935654 

The top 3 accuracy is 0.999954437063876 

The top 4 accuracy is 0.9999822586797393 

