In [1]:
import pandas as pd
import sqlite3 as sql
from pandasql import sqldf

# Cities

In [2]:
cities = pd.read_csv('cities.csv')

In [5]:
query_ci = '''SELECT * 
FROM cities;'''
sqldf(query_ci, env=None).head()

Unnamed: 0,name,country_code,city_proper_pop,metroarea_pop,urbanarea_pop
0,Abidjan,CIV,4765000,,4765000
1,Abu Dhabi,ARE,1145000,,1145000
2,Abuja,NGA,1235880,6000000.0,1235880
3,Accra,GHA,2070463,4010054.0,2070463
4,Addis Ababa,ETH,3103673,4567857.0,3103673


cities has 236 rows × 5 columns

# Countries

In [6]:
countries = pd.read_csv('countries.csv')
query_co = '''
SELECT * 
FROM countries;'''
sqldf(query_co, env=None).head()

Unnamed: 0,code,country_name,continent,region,surface_area,indep_year,local_name,gov_form,capital,cap_long,cap_lat
0,AFG,Afghanistan,Asia,Southern and Central Asia,652090.0,1919.0,Afganistan/Afqanestan,Islamic Emirate,Kabul,69.1761,34.5228
1,NLD,Netherlands,Europe,Western Europe,41526.0,1581.0,Nederland,Constitutional Monarchy,Amsterdam,4.89095,52.3738
2,ALB,Albania,Europe,Southern Europe,28748.0,1912.0,Shqiperia,Republic,Tirane,19.8172,41.3317
3,DZA,Algeria,Africa,Northern Africa,2381740.0,1962.0,Al-Jazair/Algerie,Republic,Algiers,3.05097,36.7397
4,ASM,American Samoa,Oceania,Polynesia,199.0,,Amerika Samoa,US Territory,Pago Pago,-170.691,-14.2846


countries has 205 rows × 11 columns

# LEFT and RIGHT JOINs

- This is a LEFT JOIN, right?\
Nice work getting to grips with the structure of joins! In this exercise, you'll explore the differences between INNER JOIN and LEFT JOIN. This will help you decide which type of join to use.\
As before, you will be using the cities and countries tables.\
You'll begin with an INNER JOIN with the cities table (left) and countries table (right). This helps if you are interested only in records where a country is present in both tables.\
You'll then change to a LEFT JOIN. This helps if you're interested in returning all countries in the cities table, whether or not they have a match in the countries table.

In [8]:
# Perform an inner join with cities as c1 and countries as c2 on country code
query = '''SELECT 
    c1.name AS city,
    code,
    c2.country_name AS country,
    region,
    city_proper_pop
FROM cities AS c1
INNER JOIN countries AS c2
ON c1.country_code = c2.code
ORDER BY code DESC;'''
sqldf(query, env=None)

Unnamed: 0,city,code,country,region,city_proper_pop
0,Harare,ZWE,Zimbabwe,Eastern Africa,1606000
1,Lusaka,ZMB,Zambia,Eastern Africa,1742979
2,Cape Town,ZAF,South Africa,Southern Africa,3740026
3,Durban,ZAF,South Africa,Southern Africa,3442361
4,Ekurhuleni,ZAF,South Africa,Southern Africa,3178470
...,...,...,...,...,...
225,Rosario,ARG,Argentina,South America,1193605
226,Abu Dhabi,ARE,United Arab Emirates,Middle East,1145000
227,Dubai,ARE,United Arab Emirates,Middle East,2643410
228,Luanda,AGO,Angola,Central Africa,2825311


In [10]:
# Join right table (with alias)
left_join = '''SELECT 
	c1.name AS city, 
    code, 
    c2.country_name AS country,
    region, 
    city_proper_pop
FROM cities AS c1
LEFT JOIN countries AS c2
ON c1.country_code = c2.code
ORDER BY code DESC;'''
sqldf(left_join,env=None)

Unnamed: 0,city,code,country,region,city_proper_pop
0,Harare,ZWE,Zimbabwe,Eastern Africa,1606000
1,Lusaka,ZMB,Zambia,Eastern Africa,1742979
2,Cape Town,ZAF,South Africa,Southern Africa,3740026
3,Durban,ZAF,South Africa,Southern Africa,3442361
4,Ekurhuleni,ZAF,South Africa,Southern Africa,3178470
...,...,...,...,...,...
231,Kaohsiung,,,,2778918
232,New Taipei City,,,,3954929
233,Taichung,,,,2752413
234,Tainan,,,,1885252


- Perfect! Notice that the INNER JOIN resulted in 230 records, whereas the LEFT JOIN returned 236 records. Remember that the LEFT JOIN is a type of outer join: its result is not limited to only those records that have matches for both tables on the joining field.

# Building on your LEFT JOIN
You'll now revisit the use of the AVG() function introduced in a previous course.\
Being able to build more than one SQL function into your query will enable you to write compact, supercharged queries.\
You will use AVG() in combination with a LEFT JOIN to determine the average gross domestic product (GDP) per capita by region in 2010.

In [11]:
economies = pd.read_csv('economies.csv')

In [13]:
query_e = '''
SELECT * 
FROM economies;'''
sqldf(query_e, env=None).head()

Unnamed: 0,econ_id,code,year,income_group,gdp_percapita,gross_savings,inflation_rate,total_investment,unemployment_rate,exports,imports
0,1,AFG,2010,Low income,539.667,37.133,2.179,30.402,,46.394,24.381
1,2,AFG,2015,Low income,615.091,21.466,-1.549,18.602,,-49.11,-7.294
2,3,AGO,2010,Upper middle income,3599.27,23.534,14.48,14.433,,-3.266,-21.076
3,4,AGO,2015,Upper middle income,3876.2,-0.425,10.287,9.552,,6.721,-21.778
4,5,ALB,2010,Upper middle income,4098.13,20.011,3.605,31.305,14.0,10.645,-8.013


In [18]:
# Match on code fields
# Filter for the year 2010
left_join_e = '''SELECT country_name, region, gdp_percapita, year
FROM countries AS c
LEFT JOIN economies AS e
ON c.code = e.code
WHERE year = 2010;'''
sqldf(left_join_e,env=None)

Unnamed: 0,country_name,region,gdp_percapita,year
0,Afghanistan,Southern and Central Asia,539.667,2010
1,Angola,Central Africa,3599.270,2010
2,Albania,Southern Europe,4098.130,2010
3,United Arab Emirates,Middle East,34628.630,2010
4,Argentina,South America,10412.950,2010
...,...,...,...,...
179,Samoa,Polynesia,3434.100,2010
180,Yemen,Middle East,1266.790,2010
181,South Africa,Southern Africa,7361.940,2010
182,Zambia,Eastern Africa,1456.160,2010


In [21]:
# Select region, and average gdp_percapita as avg_gdp
# Group by region
region = '''SELECT region, AVG(gdp_percapita) AS avg_gdp,year
FROM countries AS c
LEFT JOIN economies AS e
USING(code)
WHERE year = 2010
GROUP BY region;'''
sqldf(region, env=None)

Unnamed: 0,region,avg_gdp,year
0,Australia and New Zealand,44792.385,2010
1,Baltic Countries,12631.03,2010
2,British Islands,43588.33,2010
3,Caribbean,11413.339462,2010
4,Central Africa,4797.239889,2010
5,Central America,4969.97,2010
6,Eastern Africa,1757.348187,2010
7,Eastern Asia,24962.808,2010
8,Eastern Europe,10095.456667,2010
9,Melanesia,2532.61,2010


In [23]:
# Order by descending avg_gdp
# Return only first 10 records
avg_gdp = '''SELECT region, AVG(gdp_percapita) AS avg_gdp, year
FROM countries AS c
LEFT JOIN economies AS e
USING(code)
WHERE year = 2010
GROUP BY region
ORDER BY avg_gdp DESC
LIMIT 10;'''
sqldf(avg_gdp,env=None)

Unnamed: 0,region,avg_gdp,year
0,Western Europe,58130.962857,2010
1,Nordic Countries,57073.998,2010
2,North America,47911.51,2010
3,Australia and New Zealand,44792.385,2010
4,British Islands,43588.33,2010
5,Eastern Asia,24962.808,2010
6,Southern Europe,22926.410909,2010
7,Middle East,18204.641765,2010
8,Baltic Countries,12631.03,2010
9,Caribbean,11413.339462,2010


- Nice work! You successfully executed a LEFT JOIN and applied a GROUP BY to the result of your JOIN. Building up your SQL vocabulary in this way will enable you to answer questions of ever-increasing complexity!

# Is this RIGHT?
You learned that right joins are not used as commonly as left joins. A key reason for this is that right joins can always be re-written as left joins, and because joins are typically typed from left to right, joining from the left feels more intuitive when constructing queries.\
It can be tricky to wrap one's head around when left and right joins return equivalent results. You'll explore this in this exercise!

In [24]:
languages = pd.read_csv('languages.csv')

In [27]:
# Modify this query to use RIGHT JOIN instead of LEFT JOIN
r_join = '''SELECT countries.country_name AS country, languages.name AS language, percent
FROM languages
RIGHT JOIN countries
USING(code)
ORDER BY language;'''
sqldf(r_join,env=None)

Unnamed: 0,country,language,percent
0,"Virgin Islands, British",,
1,Fiji Islands,,
2,Cape Verde,,
3,Macao,,
4,Czech Republic,,
...,...,...,...
911,Poland,unspecified,1.30
912,Rwanda,unspecified,0.30
913,Seychelles,unspecified,1.40
914,Tonga,unspecified,0.03


- Correct: when converting a LEFT JOIN to a RIGHT JOIN, change both the type of join and the order of the tables to get equivalent results. You would get different results if you only changed the table order. The order of fields you are joining ON still does not matter.

# FULL JOINs

# Comparing joins
In this exercise, you'll examine how results can differ when performing a full join compared to a left join and inner join by joining the countries and currencies tables. You'll be focusing on the North American region and records where the name of the country is missing.\
You'll begin with a full join with countries on the left and currencies on the right. Recall the workings of a full join with the diagram below!

In [28]:
currencies = pd.read_csv('currencies.csv')

In [30]:
# Join to currencies
# Where region is North America or name is null
f_join = '''SELECT country_name AS country, code, region, basic_unit
FROM countries
FULL JOIN currencies 
USING (code)
WHERE region = 'North America' OR country_name IS null
ORDER BY region;'''
sqldf(f_join,env=None)

Unnamed: 0,country,code,region,basic_unit
0,,AIA,,East Caribbean dollar
1,,IOT,,United States dollar
2,,CCK,,Australian dollar
3,,COK,,New Zealand dollar
4,,TMP,,United States dollar
5,,FLK,,Falkland Islands pound
6,,HKG,,Hong Kong dollar
7,,MSR,,East Caribbean dollar
8,,NIU,,New Zealand dollar
9,,ROM,,Romanian leu


In [32]:
# Join to currencies
l_j = '''SELECT country_name AS country, code, region, basic_unit
FROM countries
LEFT JOIN currencies 
USING (code)
WHERE region = 'North America' 
	OR country_name IS NULL
ORDER BY region;'''
sqldf(l_j,env=None)

Unnamed: 0,country,code,region,basic_unit
0,Bermuda,BMU,North America,Bermudian dollar
1,Greenland,GRL,North America,
2,Canada,CAN,North America,Canadian dollar
3,United States,USA,North America,United States dollar


In [33]:
# Join to currencies
i_j = '''SELECT country_name AS country, code, region, basic_unit
FROM countries
INNER JOIN currencies 
USING (code)
WHERE region = 'North America' 
	OR country_name IS NULL
ORDER BY region;'''
sqldf(i_j,env=None)

Unnamed: 0,country,code,region,basic_unit
0,Bermuda,BMU,North America,Bermudian dollar
1,Canada,CAN,North America,Canadian dollar
2,United States,USA,North America,United States dollar


- Have you kept an eye out for the different numbers of records these queries returned? The FULL JOIN query returned 18 records, the LEFT JOIN returned four records, and the INNER JOIN only returned three records. Does this add more color to the diagrams you have seen for these three types of join?

# Chaining FULL JOINs
As you have seen in the previous chapter on INNER JOIN, it is possible to chain joins in SQL, such as when looking to connect data from more than two tables.\
Suppose you are doing some research on Melanesia and Micronesia, and are interested in pulling information about languages and currencies into the data we see for these regions in the countries table. Since languages and currencies exist in separate tables, this will require two consecutive full joins involving the countries, languages and currencies tables.

In [38]:
f = '''SELECT c1.country_name AS country, region, l.name AS language, basic_unit, frac_unit
FROM countries as c1
FULL JOIN languages AS l
ON c1.code = l.code
FULL JOIN currencies AS c2
ON c1.code = c2.code
WHERE region LIKE "M%esia";'''
sqldf(f,env=None)

Unnamed: 0,country,region,language,basic_unit,frac_unit
0,Fiji Islands,Melanesia,,,
1,Guam,Micronesia,English,,
2,Guam,Micronesia,Filipino,,
3,Guam,Micronesia,Chamorro,,
4,Guam,Micronesia,Other Pacific Islander,,
5,Guam,Micronesia,Asian,,
6,Guam,Micronesia,Other,,
7,Kiribati,Micronesia,Kiribati,Australian dollar,Cent
8,Kiribati,Micronesia,English,Australian dollar,Cent
9,Marshall Islands,Micronesia,Marshallese,United States dollar,Cent


- Well done! The first FULL JOIN in the query pulled countries and languages, and the second FULL JOIN added in currency data for each record in the result of the first FULL JOIN.

# Crossing into CROSS JOIN

- Histories and languages\
Well done getting to know all about CROSS JOIN! As you have learned, CROSS JOIN can be incredibly helpful when asking questions that involve looking at all possible combinations or pairings between two sets of data.\
Imagine you are a researcher interested in the languages spoken in two countries: Pakistan and India. You are interested in asking:\
What are the languages presently spoken in the two countries?
Given the shared history between the two countries, what languages could potentially have been spoken in either country over the course of their history?
In this exercise, we will explore how INNER JOIN and CROSS JOIN can help us answer these two questions, respectively.

In [40]:
i = '''SELECT c.country_name AS country, l.name AS language
FROM countries AS c
INNER JOIN languages AS l
ON c.code = l.code
WHERE c.code IN ("PAK", "IND")
AND l.code IN ("PAK", "IND");'''
sqldf(i,env=None)

Unnamed: 0,country,language
0,India,Assamese
1,India,Bengali
2,India,Gujarati
3,India,Hindi
4,India,Kannada
5,India,Maithili
6,India,Malayalam
7,India,Marathi
8,India,Oriya
9,India,Other


In [42]:
c = '''SELECT c.country_name AS country, l.name AS language
FROM countries AS c
CROSS JOIN languages AS l
WHERE c.code IN ("PAK", "IND")
AND l.code IN ("PAK", "IND");'''
sqldf(c,env=None)

Unnamed: 0,country,language
0,India,Hindi
1,India,Bengali
2,India,Telugu
3,India,Marathi
4,India,Tamil
5,India,Urdu
6,India,Gujarati
7,India,Kannada
8,India,Malayalam
9,India,Oriya


- Nice one! Notice that the INNER JOIN returned 25 records, whereas the CROSS JOIN returned 50 records, as it took all combinations of languages returned by the INNER JOIN for both countries. Notice that this returns duplicate records in cases where both countries speak the same language. We will learn how to deal with duplicates in subsequent lessons.

# Choosing your join
Now that you're fully equipped to use joins, try a challenge problem to test your knowledge!\
You will determine the names of the five countries and their respective regions with the lowest life expectancy for the year 2010. Use your knowledge about joins, filtering, sorting and limiting to create this list!

In [44]:
populations = pd.read_csv('populations.csv')

In [46]:
r = '''SELECT 
	c.country_name AS country,
    region,
    life_expectancy AS life_exp
FROM countries AS c
-- Join to populations (alias as p) using an appropriate join
INNER JOIN populations AS p
ON c.code = p.country_code
-- Filter for only results in the year 2010
WHERE year = 2010
-- Sort by life_exp
ORDER BY life_exp
-- Limit to five records
LIMIT 10;'''
sqldf(r,env=None)

Unnamed: 0,country,region,life_exp
0,Andorra,Southern Europe,
1,American Samoa,Polynesia,
2,Cayman Islands,Caribbean,
3,Dominica,Caribbean,
4,Gibraltar,Southern Europe,
5,Saint Kitts and Nevis,Caribbean,
6,Monaco,Western Europe,
7,Marshall Islands,Micronesia,
8,Northern Mariana Islands,Micronesia,
9,Nauru,Micronesia,


- Nice work! Did you notice that more than one type of join can be used to return the five records in our result? All four types of joins we have learned will return the same result.

# Self joins

- Comparing a country to itself\
Self joins are very useful for comparing data from one part of a table with another part of the same table. Suppose you are interested in finding out how much the populations for each country changed from 2010 to 2015. You can visualize this change by performing a self join.\
In this exercise, you'll work to answer this question by joining the populations table with itself. Recall that, with self joins, tables must be aliased. Use this as an opportunity to practice your aliasing!\
Since you'll be joining the populations table to itself, you can alias populations first as p1 and again as p2. This is good practice whenever you are aliasing tables with the same first letter.

In [52]:
# Select aliased fields from populations as p1
s = '''SELECT p1.country_code,
p1.size AS size2010,
p2.size AS size2015,
p1.year,
p2.year
FROM populations AS p1
-- Join populations as p1 to itself, alias as p2, on country code
INNER JOIN populations AS p2 
USING (country_code);'''
sqldf(s,env=None)

Unnamed: 0,country_code,size2010,size2015,year,year.1
0,ABW,101597.0,101597.0,2010,2010
1,ABW,101597.0,103889.0,2010,2015
2,ABW,103889.0,101597.0,2015,2010
3,ABW,103889.0,103889.0,2015,2015
4,AFG,27962207.0,27962207.0,2010,2010
...,...,...,...,...,...
863,ZMB,16211767.0,16211767.0,2015,2015
864,ZWE,13973897.0,13973897.0,2010,2010
865,ZWE,13973897.0,15602751.0,2010,2015
866,ZWE,15602751.0,13973897.0,2015,2010


In [51]:
j = '''SELECT
	p1.country_code, 
    p1.size AS size2010, 
    p2.size AS size2015,
    p1.year,
    p2.year
FROM populations AS p1
INNER JOIN populations AS p2
ON p1.country_code = p2.country_code
WHERE p1.year = 2010
-- Filter such that p1.year is always five years before p2.year
    AND p1.year = p2.year - 5;'''
sqldf(j,env=None)

Unnamed: 0,country_code,size2010,size2015,year,year.1
0,ABW,101597.0,103889.0,2010,2015
1,AFG,27962207.0,32526562.0,2010,2015
2,AGO,21219954.0,25021974.0,2010,2015
3,ALB,2913021.0,2889167.0,2010,2015
4,AND,84419.0,70473.0,2010,2015
...,...,...,...,...,...
212,XKX,1775680.0,1801800.0,2010,2015
213,YEM,23591972.0,26832215.0,2010,2015
214,ZAF,50979432.0,55011977.0,2010,2015
215,ZMB,13917439.0,16211767.0,2010,2015


- Nice one! See how it's possible to eliminate unwanted records using a calculated field, such as the one you've subtracted five from? That's a great trick to know.