In [1]:
import os
import findspark
findspark.init()
import pyspark
import random

# Spark launch settings. Only driver_memory is forwarded below; executor_memory
# and num_executors are defined but not passed on. If they are ever appended to
# the argument string, num_executors is an int and must go through str() first.
driver_memory = '4g'
num_executors = 2
executor_memory = '1g'

# Assemble the submit arguments; the empty first element yields the leading
# space of the resulting string.
pyspark_submit_args = ' '.join(['', '--driver-memory', driver_memory, 'pyspark-shell'])

# NOTE(review): presumably read when the Spark JVM is launched, so this has to
# run before the SparkContext is created -- confirm.
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

In [2]:
# Obtain the SparkContext for the "Pi" application. getOrCreate() returns the
# already-running context when this cell is executed again, instead of failing
# because a second SparkContext cannot be started in the same process.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Pi"))

In [3]:
# Number of random points drawn for the Monte Carlo estimate.
num_samples = 100000000


def inside(p):
    """Draw one random point in the unit square and test it against the unit circle.

    The argument ``p`` is not used; it exists so the function can be passed
    to ``filter`` as a one-argument predicate.

    Returns:
        True when the drawn point satisfies x^2 + y^2 < 1, otherwise False.
    """
    x = random.random()
    y = random.random()
    squared_distance = x * x + y * y
    return squared_distance < 1
# Count how many of the num_samples random points pass the `inside` test,
# then scale the hit ratio by 4 to estimate pi.
count = (
    sc.parallelize(range(num_samples))
    .filter(inside)
    .count()
)
pi = count * 4.0 / num_samples
print(pi)

3.14217336


In [8]:
from pyspark.sql import SQLContext

# SQL entry point bound to the existing SparkContext `sc`.
# NOTE(review): no active line in the visible cells uses sqlContext (only
# commented-out alternatives do), and this cell's execution count is out of
# sequence with its neighbours -- confirm whether it is still needed.
sqlContext = SQLContext(sparkContext=sc)

In [4]:
from pyspark.sql import SparkSession

# Session used for all DataFrame reads and SQL queries in the cells below.
# NOTE(review): a SparkContext named "Pi" is created in an earlier cell;
# confirm whether the "basics" application name actually takes effect here.
spark = (
    SparkSession.builder
    .appName("basics")
    .getOrCreate()
)

In [5]:
# Load the country metadata CSV: the first line supplies the column names and
# column types are inferred from the data.
country = spark.read.load(
    "WDICountry.csv",
    format="csv",
    header="true",
    inferSchema="true",
)


In [6]:
# Preview the country table (first 20 rows, long cell values truncated).
country.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+-------------------+---------+---------------------------+--------------------------------+--------------------+----------------+------------+---------------------------+-----------------------------+--------------------+---------------------------------+------------------------------+--------------------+-----------------------------+-------------------------------+------------------------+-----------------------+-------------------------------------------------+---------------------------+--------------------------+----------------------+-----------------+----------------------------+----+
|        Country Code|          Short Name|          Table Name|           Long Name|2-alpha code|       Currency Unit|       Special Notes|              Region|       Income Group|WB-2 code|National accounts base year|National accounts ref

In [7]:
# Print the column names and inferred types of the country table as a tree.
country.printSchema()

root
 |-- Country Code: string (nullable = true)
 |-- Short Name: string (nullable = true)
 |-- Table Name: string (nullable = true)
 |-- Long Name: string (nullable = true)
 |-- 2-alpha code: string (nullable = true)
 |-- Currency Unit: string (nullable = true)
 |-- Special Notes: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Income Group: string (nullable = true)
 |-- WB-2 code: string (nullable = true)
 |-- National accounts base year: string (nullable = true)
 |-- National accounts reference year: string (nullable = true)
 |-- SNA price valuation: string (nullable = true)
 |-- Lending category: string (nullable = true)
 |-- Other groups: string (nullable = true)
 |-- System of National Accounts: string (nullable = true)
 |-- Alternative conversion factor: string (nullable = true)
 |-- PPP survey year: string (nullable = true)
 |-- Balance of Payments Manual in use: string (nullable = true)
 |-- External debt Reporting status: string (nullable = true)
 |-- Syst

In [8]:
# Expose the country data to Spark SQL under the view name "country", with the
# "Country Code" column renamed to "country_code" (the name the SQL query in
# the following cell filters on).
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable and can be re-run without error.
country.withColumnRenamed("Country Code", "country_code").createOrReplaceTempView("country")

In [9]:
# Display up to 20 rows of the "country" view whose country_code starts
# with "IN".
(
    spark
    .sql("select * from country where country_code like 'IN%'")
    .show(n=20, truncate=True)
)

+------------+----------+----------+-----------------+------------+-------------+--------------------+----------+-------------------+---------+---------------------------+--------------------------------+--------------------+----------------+------------+---------------------------+-----------------------------+---------------+---------------------------------+------------------------------+--------------------+-----------------------------+-------------------------------+------------------------+-----------------------+-------------------------------------------------+---------------------------+--------------------------+----------------------+-----------------+----------------------------+----+
|country_code|Short Name|Table Name|        Long Name|2-alpha code|Currency Unit|       Special Notes|    Region|       Income Group|WB-2 code|National accounts base year|National accounts reference year| SNA price valuation|Lending category|Other groups|System of National Accounts|Alternativ

In [10]:
# Load the indicator data CSV: the first line supplies the column names and
# column types are inferred from the data.
indicators = spark.read.load(
    "WDIData.csv",
    format="csv",
    header="true",
    inferSchema="true",
)


In [11]:
# Print the column names and inferred types of the indicator table as a tree.
indicators.printSchema()

root
 |-- Country Name: string (nullable = true)
 |-- Country Code: string (nullable = true)
 |-- Indicator Name: string (nullable = true)
 |-- Indicator Code: string (nullable = true)
 |-- 1960: double (nullable = true)
 |-- 1961: double (nullable = true)
 |-- 1962: double (nullable = true)
 |-- 1963: double (nullable = true)
 |-- 1964: double (nullable = true)
 |-- 1965: double (nullable = true)
 |-- 1966: double (nullable = true)
 |-- 1967: double (nullable = true)
 |-- 1968: double (nullable = true)
 |-- 1969: double (nullable = true)
 |-- 1970: double (nullable = true)
 |-- 1971: double (nullable = true)
 |-- 1972: double (nullable = true)
 |-- 1973: double (nullable = true)
 |-- 1974: double (nullable = true)
 |-- 1975: double (nullable = true)
 |-- 1976: double (nullable = true)
 |-- 1977: double (nullable = true)
 |-- 1978: double (nullable = true)
 |-- 1979: double (nullable = true)
 |-- 1980: double (nullable = true)
 |-- 1981: double (nullable = true)
 |-- 1982: double (null

In [12]:
# Preview the indicator table (first 20 rows, long cell values truncated).
indicators.show(n=20, truncate=True)

+------------+------------+--------------------+--------------------+----+----+----+----+----+----+----+----+----+----+----------------+----------------+----------------+----------------+---------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----+
|Country Name|Country Code|      Indicator Name|      Indica

In [13]:
# Count the indicator rows for each "Country Name" and display the first
# 20 groups.
(
    indicators
    .groupBy("Country Name")
    .count()
    .show(n=20, truncate=True)
)

+--------------------+-----+
|        Country Name|count|
+--------------------+-----+
|          South Asia| 1591|
|                Chad| 1591|
| Lower middle income| 1591|
|            Paraguay| 1591|
| Low & middle income| 1591|
|Heavily indebted ...| 1591|
|               World| 1591|
|    Congo, Dem. Rep.| 1591|
|             Senegal| 1591|
|East Asia & Pacif...| 1591|
|          Cabo Verde| 1591|
|              Sweden| 1591|
|            Kiribati| 1591|
|Least developed c...| 1591|
|      Macedonia, FYR| 1591|
|              Guyana| 1591|
|Pacific island sm...| 1591|
|             Eritrea| 1591|
|         Philippines| 1591|
|            Djibouti| 1591|
+--------------------+-----+
only showing top 20 rows



In [14]:
from pyspark.sql import functions as F

# For every indicator, collect the distinct country names that have a row for
# it. The aggregate column is named explicitly with alias() instead of renaming
# the auto-generated "collect_set(Country Name)" column afterwards:
# withColumnRenamed does nothing when the source column name does not match,
# so the rename could be skipped silently.
(
    indicators
    .groupBy("Indicator Name")
    .agg(F.collect_set("Country Name").alias("country_set"))
    .show()
)

+--------------------+--------------------+
|      Indicator Name|         country_set|
+--------------------+--------------------+
|Adjusted savings:...|[Sint Maarten (Du...|
|Commitments, offi...|[Sint Maarten (Du...|
|Completeness of i...|[Sint Maarten (Du...|
|Educational attai...|[Sint Maarten (Du...|
|Employers, female...|[Sint Maarten (Du...|
|Forest area (% of...|[Sint Maarten (Du...|
|People using safe...|[Sint Maarten (Du...|
|Tariff rate, appl...|[Sint Maarten (Du...|
|Trained teachers ...|[Sint Maarten (Du...|
|Wage and salaried...|[Sint Maarten (Du...|
|Coverage of socia...|[Sint Maarten (Du...|
|Electricity produ...|[Sint Maarten (Du...|
|PNG, bonds (NFL, ...|[Sint Maarten (Du...|
|Portfolio investm...|[Sint Maarten (Du...|
|Armed forces pers...|[Sint Maarten (Du...|
|CPIA economic man...|[Sint Maarten (Du...|
|Children out of s...|[Sint Maarten (Du...|
|Commitments, priv...|[Sint Maarten (Du...|
|Gross domestic sa...|[Sint Maarten (Du...|
|Income share held...|[Sint Maar