In [2]:
import json
from pyspark import SparkConf, SparkContext

In [3]:
sc = SparkContext(
    conf = SparkConf()
    .setMaster('local[*]')
    .setAppName('First Spark')
    .set('spark.driver.port','36004')
)

In [4]:
cities = sc.textFile('cities.json')

In [6]:
def custom_mapper(x):
    try:
        return json.loads(x) #convert json obj to python
    except:
        return None
    
cities_mapped = cities.map(lambda x: custom_mapper(x)).filter(lambda x: x is not None)
cities_mapped.collect()

[{'continent': 'Europe',
  'country': 'Russia',
  'name': 'Moscow',
  'population': 12380664},
 {'country': 'Spain', 'name': 'Madrid'},
 {'continent': 'Europe',
  'country': 'France',
  'name': 'Paris',
  'population': 2196936},
 {'continent': 'Europe',
  'country': 'Germany',
  'name': 'Berlin',
  'population': 3490105},
 {'continent': 'Europe', 'country': 'Spain', 'name': 'Barselona'},
 {'continent': 'Africa',
  'country': 'Egypt',
  'name': 'Cairo',
  'population': 11922948},
 {'continent': 'Africa',
  'country': 'Egypt',
  'name': 'Cairo',
  'population': 11922948}]

In [7]:
cities.getNumPartitions()

2

In [16]:
# -------- Get Keys
cities_keys = cities_mapped.flatMap(lambda x: x.keys()).distinct().collect()
cities_keys

['name', 'country', 'continent', 'population']

In [17]:
# -------- Get Values
cities_values = cities_mapped.flatMap(lambda x: x.values()).distinct().collect()
cities_values

[12380664,
 'France',
 2196936,
 'Barselona',
 11922948,
 'Moscow',
 'Russia',
 'Europe',
 'Madrid',
 'Spain',
 'Paris',
 'Berlin',
 'Germany',
 3490105,
 'Cairo',
 'Egypt',
 'Africa']

In [18]:
# -------- Remove duplicates

from pyspark.sql import SparkSession

spark = SparkSession(sc)
hasattr(cities,'toDF')

cities_df = cities_mapped.toDF()
cities_df.show()

cities_df_dup = cities_df.drop_duplicates(['continent','country','name'])
cities_df_dup.show()



+---------+-------+---------+----------+
|continent|country|     name|population|
+---------+-------+---------+----------+
|   Europe| Russia|   Moscow|  12380664|
|     null|  Spain|   Madrid|      null|
|   Europe| France|    Paris|   2196936|
|   Europe|Germany|   Berlin|   3490105|
|   Europe|  Spain|Barselona|      null|
|   Africa|  Egypt|    Cairo|  11922948|
|   Africa|  Egypt|    Cairo|  11922948|
+---------+-------+---------+----------+

+---------+-------+---------+----------+
|continent|country|     name|population|
+---------+-------+---------+----------+
|   Europe| France|    Paris|   2196936|
|     null|  Spain|   Madrid|      null|
|   Europe|  Spain|Barselona|      null|
|   Europe|Germany|   Berlin|   3490105|
|   Europe| Russia|   Moscow|  12380664|
|   Africa|  Egypt|    Cairo|  11922948|
+---------+-------+---------+----------+



In [20]:
# ------- Remove all empty or corrupted elements

cities_df_rem_null = cities_df.dropna(how='any')
cities_df_rem_null.show()

+---------+-------+------+----------+
|continent|country|  name|population|
+---------+-------+------+----------+
|   Europe| Russia|Moscow|  12380664|
|   Europe| France| Paris|   2196936|
|   Europe|Germany|Berlin|   3490105|
|   Africa|  Egypt| Cairo|  11922948|
|   Africa|  Egypt| Cairo|  11922948|
+---------+-------+------+----------+



In [73]:
#  ------- Find the most populous city

for i in range(1):
    print(cities_df.sort('population', ascending=False)
                   .take(1)[i]
                   .asDict()
                   .get('name')
         )

Moscow


In [75]:
# -------- Find TOP-ǽ most populous continents

for i in range(2):
    print(cities_df.sort('population', ascending=False)
                   .take(2)[i]
                   .asDict()
                   .get('continent')
         )

Europe
Africa


In [94]:
# -------- Add additional field `populous` to each record

import pyspark.sql.functions as F
from pyspark.sql.types import *

def check_set(continent):
    if continent == 'Africa' or continent == 'Europe':
        return 'true'
    else:
        return 'false'
    
populous_function = F.udf(check_set, StringType())
new_cities_df = cities_df.withColumn('populous', 
                                     populous_function("continent"))

new_cities_df.show()

+---------+-------+---------+----------+--------+
|continent|country|     name|population|populous|
+---------+-------+---------+----------+--------+
|   Europe| Russia|   Moscow|  12380664|    true|
|     null|  Spain|   Madrid|      null|   false|
|   Europe| France|    Paris|   2196936|    true|
|   Europe|Germany|   Berlin|   3490105|    true|
|   Europe|  Spain|Barselona|      null|    true|
|   Africa|  Egypt|    Cairo|  11922948|    true|
|   Africa|  Egypt|    Cairo|  11922948|    true|
+---------+-------+---------+----------+--------+



In [107]:
# Change RDD data partitioning over the field 'continent'

cities_df_repartitioned = cities_df.repartition('continent')

cities_df_repartitioned.show()

+---------+-------+---------+----------+
|continent|country|     name|population|
+---------+-------+---------+----------+
|   Europe| Russia|   Moscow|  12380664|
|   Europe| France|    Paris|   2196936|
|   Europe|Germany|   Berlin|   3490105|
|   Europe|  Spain|Barselona|      null|
|   Africa|  Egypt|    Cairo|  11922948|
|   Africa|  Egypt|    Cairo|  11922948|
|     null|  Spain|   Madrid|      null|
+---------+-------+---------+----------+



In [109]:
def print_partitions(df):
    numPartitions = df.rdd.getNumPartitions()
    print("Total partitions: {}".format(numPartitions))
    print("Partitioner: {}".format(df.rdd.partitioner))
    df.explain()
    parts = df.rdd.glom().collect()
    i = 0
    j = 0
    for p in parts:
        print("Partition {}:".format(i))
        for r in p:
            print("Row {}:{}".format(j, r))
            j = j+1
        i = i+1
        
print_partitions(cities_df_repartitioned)

Total partitions: 200
Partitioner: None
== Physical Plan ==
Exchange hashpartitioning(continent#55, 200)
+- Scan ExistingRDD[continent#55,country#56,name#57,population#58L]
Partition 0:
Partition 1:
Partition 2:
Partition 3:
Partition 4:
Partition 5:
Partition 6:
Partition 7:
Partition 8:
Partition 9:
Partition 10:
Partition 11:
Partition 12:
Partition 13:
Partition 14:
Partition 15:
Partition 16:
Partition 17:
Partition 18:
Partition 19:
Partition 20:
Partition 21:
Partition 22:
Row 0:Row(continent='Europe', country='Russia', name='Moscow', population=12380664)
Row 1:Row(continent='Europe', country='France', name='Paris', population=2196936)
Row 2:Row(continent='Europe', country='Germany', name='Berlin', population=3490105)
Row 3:Row(continent='Europe', country='Spain', name='Barselona', population=None)
Partition 23:
Partition 24:
Partition 25:
Partition 26:
Partition 27:
Partition 28:
Partition 29:
Partition 30:
Partition 31:
Partition 32:
Partition 33:
Partition 34:
Partition 35:
P

In [113]:
# ----- Write custom partitioner, 
# ---- so data for different continents will go to different partitions.

# ##### This bit doesn't work

from pyspark.sql.functions import udf
from pyspark.rdd import portable_hash

numPartitions = 21

udf_portable_hash = udf(lambda str: portable_hash(str))
cities_df_ = cities_df.withColumn("Hash#", 
                                  udf_portable_hash(cities_df['continent']))
cities_df_ = cities_df.withColumn("Partition#", 
                                  udf_portable_hash(cities_df['Hash#']%numPartitions))

AnalysisException: 'Cannot resolve column name "Hash#" among (continent, country, name, population);'

In [None]:
udf_portable_hash = udf