In [1]:
import os

# Connection settings for the PostgreSQL database queried by this notebook.
# Each value is taken from the standard libpq environment variable when it is
# set and otherwise falls back to the local-development default, so real
# credentials never have to be committed together with the notebook.
hostname = os.environ.get('PGHOST', 'localhost')
username = os.environ.get('PGUSER', 'postgres')
password = os.environ.get('PGPASSWORD', 'postgres')
port_id = int(os.environ.get('PGPORT', '5432'))  # kept as an int, as before
database = os.environ.get('PGDATABASE', 'pyspark_exercise_database')

In [2]:
import psycopg2
import psycopg2.extras
import pandas as pd

### Query returning the name, capital, and GNP of the country with the highest GNP

In [50]:
# Find the country with the highest GNP and report it together with its capital.
# GNP is read from the `doc` JSON column of country_info; the country row is
# matched on doc ->> '_id' and the capital city on country.capital = city.id.
# NULLS LAST keeps rows without a GNP value from being ranked first, which is
# where PostgreSQL places NULLs by default in a descending sort.
highest_gdp_query = '''SELECT pyspark_test.country.name AS country_name,
                              pyspark_test.city.name AS city_name,
                              CAST(doc ->> 'GNP' AS DOUBLE PRECISION) AS GNP
                       FROM pyspark_test.country_info

                       INNER JOIN pyspark_test.country
                       ON pyspark_test.country.code = pyspark_test.country_info.doc ->> '_id'

                       INNER JOIN pyspark_test.city
                       ON pyspark_test.city.id = pyspark_test.country.capital

                       ORDER BY GNP DESC NULLS LAST
                       LIMIT 1
                       ;'''

# Bind the name up front so the `finally` clause below is safe even when
# psycopg2.connect() itself raises (otherwise `conn` would be undefined, or
# would still point at a connection left over from a previously run cell).
conn = None

try:
    with psycopg2.connect(
        host=hostname,
        dbname=database,
        user=username,
        password=password,
        port=port_id,
    ) as conn:

        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(highest_gdp_query)
            highest_gdp_query_df = pd.DataFrame(
                cur.fetchall(),
                columns=['country name', 'capital', 'GNP'],
            )

except Exception as error:
    # Best-effort cell: report the problem instead of raising.
    print(error)

finally:
    # Leaving the `with conn` block only ends the transaction, so the
    # connection still has to be closed explicitly.
    if conn is not None:
        conn.close()

In [51]:
# Display the result of the highest-GNP query (the query is limited to one row).
highest_gdp_query_df

Unnamed: 0,country name,capital,GNP
0,United States,Washington,8510700.0


### Minimum, maximum and average GNP per continent

In [52]:
# Aggregate GNP per continent: minimum, maximum and average of the 'GNP' field
# in the `doc` JSON column, grouped by doc -> 'geography' ->> 'Continent'.
# Documents without a GNP value are excluded by the WHERE clause.
gdp_per_continent_query = '''SELECT doc -> 'geography' ->> 'Continent' AS Continent,
                                    MIN(CAST(doc ->> 'GNP' AS DOUBLE PRECISION)) AS min_gdp,
                                    MAX(CAST(doc ->> 'GNP' AS DOUBLE PRECISION)) AS max_gdp,
                                    AVG(CAST(doc ->> 'GNP' AS DOUBLE PRECISION)) AS avg_gdp
                             FROM pyspark_test.country_info
                             WHERE doc ->> 'GNP' is not null
                             GROUP BY Continent
                             ;'''

# Bind the name up front so the `finally` clause below is safe even when
# psycopg2.connect() itself raises (otherwise `conn` would be undefined, or
# would still point at a connection left over from a previously run cell).
conn = None

try:
    with psycopg2.connect(
        host=hostname,
        dbname=database,
        user=username,
        password=password,
        port=port_id,
    ) as conn:

        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(gdp_per_continent_query)
            # NOTE(review): the labels say "GDP" although the values come from
            # the 'GNP' field; they are kept unchanged here - confirm the
            # intended wording before renaming.
            gdp_per_continent_query_df = pd.DataFrame(
                cur.fetchall(),
                columns=['continent', 'min GDP', 'max GDP', 'average GDP'],
            )

except Exception as error:
    # Best-effort cell: report the problem instead of raising.
    print(error)

finally:
    # Leaving the `with conn` block only ends the transaction, so the
    # connection still has to be closed explicitly.
    if conn is not None:
        conn.close()

In [53]:
# Display the per-continent GNP aggregates computed by the previous cell.
gdp_per_continent_query_df

Unnamed: 0,continent,min GDP,max GDP,average GDP
0,South America,0.0,776739.0,107991.0
1,Oceania,0.0,351182.0,14991.953571
2,Antarctica,0.0,0.0,0.0
3,Africa,0.0,116729.0,10006.465517
4,Asia,0.0,3787042.0,150105.72549
5,North America,0.0,8510700.0,261854.789189
6,Europe,0.0,2133367.0,206497.065217


In [54]:
# List every city whose country document has geography.Region = 'North America'.
# Cities are matched to country documents on city.CountryCode = doc ->> '_id'.
# The selected column is qualified with its table so the query stays
# unambiguous regardless of which columns country_info exposes.
all_cities_in_north_america = '''SELECT pyspark_test.city.name
                                 FROM pyspark_test.city
                                 INNER JOIN pyspark_test.country_info
                                 ON pyspark_test.city.CountryCode = pyspark_test.country_info.doc ->> '_id'
                                 WHERE doc -> 'geography' ->> 'Region' = 'North America'
                                 ;'''

# Bind the name up front so the `finally` clause below is safe even when
# psycopg2.connect() itself raises (otherwise `conn` would be undefined, or
# would still point at a connection left over from a previously run cell).
conn = None

try:
    with psycopg2.connect(
        host=hostname,
        dbname=database,
        user=username,
        password=password,
        port=port_id,
    ) as conn:

        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(all_cities_in_north_america)
            # The rows are city names, so the column is labelled accordingly
            # (it was previously mislabelled as 'country name').
            all_cities_in_north_america_df = pd.DataFrame(
                cur.fetchall(),
                columns=['city name'],
            )

except Exception as error:
    # Best-effort cell: report the problem instead of raising.
    print(error)

finally:
    # Leaving the `with conn` block only ends the transaction, so the
    # connection still has to be closed explicitly.
    if conn is not None:
        conn.close()

In [55]:
# Display the city names returned by the North America region query.
all_cities_in_north_america_df

Unnamed: 0,country name
0,Saint George
1,Hamilton
2,Nuuk
3,Montréal
4,Calgary
...,...
322,Kenosha
323,Elgin
324,Odessa
325,Carson


### Countries whose 'HeadOfState' field contains 'Elisabeth' - an easy pattern-matching task (SQL `LIKE`)

In [56]:
# Select the names of countries whose government.HeadOfState value contains
# the substring 'Elisabeth' (case-sensitive LIKE match on the JSON field).
elizabeth_in_head_of_state = '''SELECT doc ->> 'Name' AS country_name
                                FROM pyspark_test.country_info
                                WHERE doc -> 'government' ->> 'HeadOfState' LIKE '%Elisabeth%'
                                ;'''

# Bind the name up front so the `finally` clause below is safe even when
# psycopg2.connect() itself raises (otherwise `conn` would be undefined, or
# would still point at a connection left over from a previously run cell).
conn = None

try:
    with psycopg2.connect(
        host=hostname,
        dbname=database,
        user=username,
        password=password,
        port=port_id,
    ) as conn:

        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(elizabeth_in_head_of_state)
            elizabeth_in_head_of_state_df = pd.DataFrame(
                cur.fetchall(),
                columns=['country name'],
            )

except Exception as error:
    # Best-effort cell: report the problem instead of raising.
    print(error)

finally:
    # Leaving the `with conn` block only ends the transaction, so the
    # connection still has to be closed explicitly.
    if conn is not None:
        conn.close()

In [57]:
# Display the countries matched by the 'Elisabeth' head-of-state query.
elizabeth_in_head_of_state_df

Unnamed: 0,country name
0,Anguilla
1,Antigua and Barbuda
2,Australia
3,Bahamas
4,Belize
5,Bermuda
6,Barbados
7,Canada
8,Cocos (Keeling) Islands
9,Cook Islands


In [58]:
# Count country documents per continent: groups on
# doc -> 'geography' ->> 'Continent' and counts the non-null 'Name' values.
number_of_countries_in_each_continent = '''SELECT doc -> 'geography' ->> 'Continent',
                                                  Count(doc ->> 'Name')
                                           FROM pyspark_test.country_info
                                           GROUP BY doc -> 'geography' ->> 'Continent'
                                           ;'''

# Bind the name up front so the `finally` clause below is safe even when
# psycopg2.connect() itself raises (otherwise `conn` would be undefined, or
# would still point at a connection left over from a previously run cell).
conn = None

try:
    with psycopg2.connect(
        host=hostname,
        dbname=database,
        user=username,
        password=password,
        port=port_id,
    ) as conn:

        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(number_of_countries_in_each_continent)
            number_of_countries_in_each_continent_df = pd.DataFrame(
                cur.fetchall(),
                columns=['continent', 'number of countries'],
            )

except Exception as error:
    # Best-effort cell: report the problem instead of raising.
    print(error)

finally:
    # Leaving the `with conn` block only ends the transaction, so the
    # connection still has to be closed explicitly.
    if conn is not None:
        conn.close()

### We can also sort this table by number of countries - who is forbidding us?

In [59]:
# Rank the continents from the most to the fewest countries; the sorted frame
# is only displayed, the original DataFrame is not reassigned.
number_of_countries_in_each_continent_df.sort_values(
    by='number of countries',
    ascending=False,
)

Unnamed: 0,continent,number of countries
3,Africa,58
4,Asia,51
6,Europe,46
5,North America,37
1,Oceania,28
0,South America,14
2,Antarctica,5


In [14]:
# Combine the 10 highest and the 10 lowest life-expectancy countries.
# Both sub-selects read demographics.LifeExpectancy from the `doc` JSON column
# and use NULLS LAST so that missing values are picked only if fewer than ten
# non-null rows exist. UNION merges the two sets (dropping duplicate rows) and
# the outer ORDER BY sorts the combined result from highest to lowest.
life_exp_query = '''
                    (SELECT doc ->> 'Name' AS country_name,
                            CAST(doc -> 'demographics' ->> 'LifeExpectancy' AS DOUBLE PRECISION) AS LifeExp
                     FROM pyspark_test.country_info
                     ORDER BY CAST(doc -> 'demographics' ->> 'LifeExpectancy' AS DOUBLE PRECISION) DESC NULLS LAST
                     LIMIT 10)

                    UNION

                    (SELECT doc ->> 'Name' AS country_name,
                            CAST(doc -> 'demographics' ->> 'LifeExpectancy' AS DOUBLE PRECISION) AS LifeExp
                     FROM pyspark_test.country_info
                     ORDER BY CAST(doc -> 'demographics' ->> 'LifeExpectancy' AS DOUBLE PRECISION) ASC NULLS LAST
                     LIMIT 10)

                    ORDER BY LifeExp DESC
                    ;'''

# Bind the name up front so the `finally` clause below is safe even when
# psycopg2.connect() itself raises (otherwise `conn` would be undefined, or
# would still point at a connection left over from a previously run cell).
conn = None

try:
    with psycopg2.connect(
        host=hostname,
        dbname=database,
        user=username,
        password=password,
        port=port_id,
    ) as conn:

        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(life_exp_query)
            life_exp_query_df = pd.DataFrame(
                cur.fetchall(),
                columns=['country', 'life expectancy'],
            )

except Exception as error:
    # Best-effort cell: report the problem instead of raising.
    print(error)

finally:
    # Leaving the `with conn` block only ends the transaction, so the
    # connection still has to be closed explicitly.
    if conn is not None:
        conn.close()

In [15]:
# Display the combined highest/lowest life-expectancy result from the previous cell.
life_exp_query_df

Unnamed: 0,country,life expectancy
0,Andorra,83.5
1,Macao,81.599998
2,San Marino,81.099998
3,Japan,80.699997
4,Singapore,80.099998
5,Australia,79.800003
6,Switzerland,79.599998
7,Sweden,79.599998
8,Hong Kong,79.5
9,Canada,79.400002
