In [1]:
import pandas as pd
import sqlite3 as sql
from pandasql import sqldf

In [2]:
cities = pd.read_csv('cities.csv')
countries = pd.read_csv('countries.csv')
economies = pd.read_csv('economies.csv')
languages = pd.read_csv('languages.csv')
currencies = pd.read_csv('currencies.csv')
populations = pd.read_csv('populations.csv')
economies2015 = pd.read_csv('economies2015.csv')
economies2019 = pd.read_csv('economies2019.csv')
monarchs = pd.read_csv('monarchs.csv')
presidents = pd.read_csv('presidents.csv')
prime_ministers = pd.read_csv('prime_ministers.csv')
prime_ministers_terms = pd.read_csv('prime_minister_terms.csv')
states = pd.read_csv('states.csv')

# Subquerying with semi joins and anti joins

 # Semi-join
- The purpose is to retrieve the records from the left table for which there are corresponding matching records in the right table.
- The result includes the matching records from both tables.
- It uses INNER JOIN, EXISTS subquery, or IN operator
- It is used to find the common elements between two sets.
- Example: Selecting customers who placed orders.

- Semi-join is a type of join that is applied to relations to join them based on the related columns. When semi-join is applied, it returns the rows from one table for which there are matching records in another related table.\
- A Semi-join returns rows from the left table for which there are corresponding matching rows in the right table.
Unlike regular joins which include the matching rows from both tables, a semi-join only includes columns from the left table in the result.

SELECT column1, column2,\
FROM table1\
WHERE EXISTS\
(SELECT 1  FROM table2  WHERE table1.column = table2.column);

SELECT column1, column2,\
FROM table1\
WHERE column IN (SELECT column FROM table2);

Anti-Join
- The purpose is to retrieve the records from the left table for which there are corresponding matching records in the right table.
- The result excludes the matching records from both tables.
- It uses the LEFT JOIN or NOT EXISTS subquery.
- It is used to identify the elements that are unique to one set and not present in another.
- Example: Selecting customers who haven’t placed any orders.

- Anti-join or Anti-semi-join is a type of join that is applied to relations to join them based on the related columns. When anti-join is applied, it returns the rows from one table for which there are no matching records in another related table.

Characteristics:\
It is Exactly the opposite to semi-join.\
An Anti-join returns rows from the left table for which there are no corresponding matching rows in the right table

# Conclusion
Anti-join filters out rows based on the absence of matches in another table, while semi-join filters rows based on the existence of related records but only returns columns from the first table. Understanding the differences between these two operations is crucial for effective data analysis and reporting in relational databases.

- Semi join\
Great job getting acquainted with semi joins and anti joins! You are now going to practice using semi joins.\
Let's say you are interested in identifying languages spoken in the Middle East. The languages table contains information about languages and countries, but it does not tell you what region the countries belong to. You can build up a semi join by filtering the countries table by a particular region, and then using this to further filter the languages table.\
You'll build up your semi join as you did in the video exercise, block by block, starting with a selection of countries from the countries table, and then leveraging a WHERE clause to filter the languages table by this selection.

In [10]:
# Select country code for countries in the Middle East
m = '''SELECT code, country_name
FROM countries
WHERE region = 'Middle East';'''
sqldf(m,env=None)

Unnamed: 0,code,country_name
0,ARE,United Arab Emirates
1,ARM,Armenia
2,AZE,Azerbaijan
3,BHR,Bahrain
4,GEO,Georgia
5,IRQ,Iraq
6,ISR,Israel
7,YEM,Yemen
8,JOR,Jordan
9,KWT,Kuwait


In [11]:
# Select unique language names
l = '''SELECT DISTINCT name
FROM languages
-- Order by the name of the language
ORDER BY name;'''
sqldf(l,env=None)

Unnamed: 0,name
0,Afar
1,Afrikaans
2,Akyem
3,Albanian
4,Alsatian
...,...
391,siSwati
392,sign
393,tribal
394,unknown


In [12]:
me = '''SELECT DISTINCT name
FROM languages
-- Add syntax to use bracketed subquery below as a filter
WHERE code IN 
    (SELECT code
    FROM countries
    WHERE region = 'Middle East')
ORDER BY name;'''
sqldf(me,env=None)

Unnamed: 0,name
0,Arabic
1,Aramaic
2,Armenian
3,Azerbaijani
4,Azeri
5,Baluchi
6,Bulgarian
7,Circassian
8,English
9,Farsi


- Well done writing your first subquery in the form of a semi join. Think of all the opportunities that open up when queries become building blocks of larger queries!

# Diagnosing problems using anti join
Nice work on semi joins! The anti join is a related and powerful joining tool. It can be particularly useful for identifying whether an incorrect number of records appears in a join.

Say you are interested in identifying currencies of Oceanian countries. You have written the following INNER JOIN, which returns 15 records. Now, you want to ensure that all Oceanian countries from the countries table are included in this result. You'll do this in the first step.

If there are any Oceanian countries excluded in this INNER JOIN, you want to return the names of these countries. You'll write an anti join to this in the second step!

In [17]:
j = '''SELECT c1.code, country_name, basic_unit AS currency
FROM countries AS c1
INNER JOIN currencies AS c2
ON c1.code = c2.code
WHERE c1.continent = 'Oceania';'''
sqldf(j,env=None)

Unnamed: 0,code,country_name,currency
0,AUS,Australia,Australian dollar
1,KIR,Kiribati,Australian dollar
2,MHL,Marshall Islands,United States dollar
3,NRU,Nauru,Australian dollar
4,PLW,Palau,United States dollar
5,PNG,Papua New Guinea,Papua New Guinean kina
6,PYF,French Polynesia,CFP franc
7,SLB,Solomon Islands,Solomon Islands dollar
8,WSM,Samoa,Samoan tala
9,TON,Tonga,Tongan paʻanga


In [14]:
o = '''SELECT code, country_name
FROM countries
WHERE continent = 'Oceania';'''
sqldf(o,env=None)

Unnamed: 0,code,country_name
0,ASM,American Samoa
1,AUS,Australia
2,FJI,Fiji Islands
3,GUM,Guam
4,KIR,Kiribati
5,MHL,Marshall Islands
6,FSM,"Micronesia, Federated States of"
7,NRU,Nauru
8,PLW,Palau
9,PNG,Papua New Guinea


In [16]:
n  = '''SELECT code, country_name
FROM countries
WHERE continent = 'Oceania'
-- Filter for countries not included in the bracketed subquery
  AND code NOT IN 
  (SELECT code 
  FROM currencies);'''
sqldf(n,env=None)

Unnamed: 0,code,country_name
0,ASM,American Samoa
1,FJI,Fiji Islands
2,GUM,Guam
3,FSM,"Micronesia, Federated States of"
4,MNP,Northern Mariana Islands


- Nice! Your anti join determined which five out of 19 countries that were not included in the INNER JOIN provided. Did you notice that Tuvalu has two currencies, and therefore shows up twice in the INNER JOIN? This is why the INNER JOIN returned 15 rather than 14 results.

# Subqueries inside WHERE and SELECT

The video pointed out that subqueries inside WHERE can either be from the same table or a different table. In this exercise, you will nest a subquery from the populations table inside another query from the same table, populations. Your goal is to figure out which countries had high average life expectancies in 2015.

You can use SQL to do calculations for you. Suppose you only want records from 2015 with life_expectancy above 1.15 * avg_life_expectancy. You could use the following SQL query.

In the first step, you'll write a query to calculate a value for avg_life_expectancy. In the second step, you will nest this calculation into another query.

In [23]:
# Select average life_expectancy from the populations table
l = '''SELECT AVG(life_expectancy) AS avg_life_expectancy
FROM populations
-- Filter for the year 2015
WHERE year = 2015;'''
sqldf(l,env=None)

Unnamed: 0,avg_life_expectancy
0,71.676342


In [24]:
a = '''SELECT *
FROM populations
WHERE year = 2015
-- Filter for only those populations where life expectancy is 1.15 times higher than average
  AND life_expectancy > 1.15 * 
  (SELECT AVG(life_expectancy) 
   FROM populations
   WHERE year = 2015);'''
sqldf(a,env=None)

Unnamed: 0,pop_id,country_code,year,fertility_rate,life_expectancy,size
0,21,AUS,2015,1.833,82.45122,23789752.0
1,376,CHE,2015,1.54,83.197561,8281430.0
2,356,ESP,2015,1.32,83.380488,46443994.0
3,134,FRA,2015,2.01,82.670732,66538391.0
4,170,HKG,2015,1.195,84.278049,7305700.0
5,174,ISL,2015,1.93,82.860976,330815.0
6,190,ITA,2015,1.37,83.490244,60730582.0
7,194,JPN,2015,1.46,83.843659,126958472.0
8,340,SGP,2015,1.24,82.595122,5535002.0
9,374,SWE,2015,1.88,82.55122,9799186.0


- Nice work! You may recognize many of these country codes as being relatively wealthy countries, which makes sense as we might expect life expectancy to be higher in wealthier nations.

# WHERE do people live?
In this exercise, you will strengthen your knowledge of subquerying by identifying capital cities in order of largest to smallest population.\
Follow the instructions below to get the urban area population for capital cities only. You'll use the countries and cities tables displayed in the console to help identify columns of interest as you build your query.

In [28]:
# Select relevant fields from cities table
c = '''SELECT name AS capital, country_code, urbanarea_pop
FROM cities
-- Filter using a subquery on the countries table
WHERE name IN 
(SELECT capital 
FROM countries)
ORDER BY urbanarea_pop DESC;'''
sqldf(c,env=None)

Unnamed: 0,capital,country_code,urbanarea_pop
0,Beijing,CHN,21516000
1,Dhaka,BGD,14543124
2,Tokyo,JPN,13513734
3,Moscow,RUS,12197596
4,Cairo,EGY,10230350
...,...,...,...
61,Dakar,SEN,1146053
62,Abu Dhabi,ARE,1145000
63,Tripoli,LBY,1126000
64,Yerevan,ARM,1060138


- Alright! You've got plenty of practice on subqueries inside WHERE. Let's move on to subqueries inside the SELECT statement.

In [32]:
# Find top nine countries with the most cities
nc = '''SELECT countries.country_name AS country, COUNT(*) AS cities_num
FROM countries
LEFT JOIN cities
ON countries.code = cities.country_code
-- Order by count of cities as cities_num
GROUP BY country
ORDER BY cities_num DESC, country
-- Limit the results
LIMIT 9;'''
sqldf(nc,env=None)

Unnamed: 0,country,cities_num
0,China,36
1,India,18
2,Japan,11
3,Brazil,10
4,Pakistan,9
5,United States,9
6,Indonesia,7
7,Russian Federation,7
8,South Korea,7


In [33]:
# Find top nine countries with the most cities
n = '''SELECT countries.country_name AS country, COUNT(*) AS cities_num
FROM countries
LEFT JOIN cities
ON countries.code = cities.country_code
-- Order by count of cities as cities_num
GROUP BY country
ORDER BY cities_num DESC
-- Limit the results
LIMIT 9;'''
sqldf(n,env=None)

Unnamed: 0,country,cities_num
0,China,36
1,India,18
2,Japan,11
3,Brazil,10
4,United States,9
5,Pakistan,9
6,South Korea,7
7,Russian Federation,7
8,Indonesia,7


In [34]:
a = '''SELECT countries.country_name AS country,
-- Subquery that provides the count of cities   
  (SELECT COUNT(*)
   FROM cities
   WHERE cities.country_code = countries.code) AS cities_num
FROM countries
ORDER BY cities_num DESC, country
LIMIT 9;'''
sqldf(a,env=None)

Unnamed: 0,country,cities_num
0,China,36
1,India,18
2,Japan,11
3,Brazil,10
4,Pakistan,9
5,United States,9
6,Indonesia,7
7,Russian Federation,7
8,South Korea,7


- Excellent job! Notice how the subquery involves only one additional step in your SELECT statement, whereas the JOIN and GROUP BY are a two-step process.

# Subqueries inside FROM

- Subqueries inside FROM can help select columns from multiple tables in a single query.\
Say you are interested in determining the number of languages spoken for each country. You want to present this information alongside each country's local_name, which is a field only present in the countries table and not in the languages table. You'll use a subquery inside FROM to bring information from these two tables together!

In [3]:
# Select code, and language count as lang_num
c = '''SELECT code, COUNT(*) AS lang_num
FROM languages
GROUP BY code;'''
sqldf(c,env=None)

Unnamed: 0,code,lang_num
0,ABW,7
1,AFG,4
2,AGO,12
3,AIA,1
4,ALB,4
...,...,...
207,WSM,2
208,YEM,1
209,ZAF,13
210,ZMB,19


In [10]:
# Select local_name and lang_num from appropriate tables
lang = '''SELECT local_name, sub.lang_num
FROM countries,
 (SELECT code, COUNT(*) AS lang_num
 FROM languages
 GROUP BY code) AS sub
-- Where codes match
WHERE countries.code = sub.code
ORDER BY lang_num DESC;'''
sqldf(lang,env=None)

Unnamed: 0,local_name,lang_num
0,Zambia,19
1,YeItyop´iya,16
2,Zimbabwe,16
3,Bharat/India,14
4,Nepal,14
...,...,...
193,Saint Kitts and Nevis,1
194,Deutschland,1
195,San Marino,1
196,Al-´Arabiya as-Sa´udiya,1


- This one brought several joining concepts together! Did you notice that the top three countries in our result are located in Africa?

# Subquery challenge
You're near the finish line! Test your understanding of subquerying with a challenge problem.\
Suppose you're interested in analyzing inflation and unemployment rate for certain countries in 2015. You are interested in countries with "Republic" or "Monarchy" as their form of government.\
You will use the field gov_form to filter for these two conditions, which represents a country's form of government. You can review the different entries for gov_form in the countries table.

In [13]:
# Select relevant fields
r_m = '''SELECT code, inflation_rate, unemployment_rate
FROM economies
WHERE year = 2015 
  AND code IN
-- Subquery returning country codes filtered on gov_form
	(SELECT code
     FROM countries
     WHERE gov_form LIKE '%Monarchy%' OR gov_form LIKE '%Republic%')
ORDER BY inflation_rate;'''
sqldf(r_m,env=None)

Unnamed: 0,code,inflation_rate,unemployment_rate
0,ARG,,
1,SYR,,
2,LBN,-3.749,
3,ZWE,-2.410,
4,KNA,-2.302,
...,...,...,...
173,GHA,17.153,
174,MWI,21.858,
175,YEM,39.403,
176,UKR,48.684,9.143


- Superb work writing the majority of the query yourself. You can see that in 2015, Saint Kitts and Nevis (with country code KNA) had inflation of -2.3%.

# Final challenge
You've made it to the final challenge problem! Get ready to tackle this step-by-step.\
Your task is to determine the top 10 capital cities in Europe and the Americas by city_perc, a metric you'll calculate. city_perc is a percentage that calculates the "proper" population in a city as a percentage of the total population in the wider metro area, as follows:\
city_proper_pop / metroarea_pop * 100
Do not use table aliasing in this exercise.

In [20]:
# Select fields from cities
ci = '''SELECT name,
country_code, 
city_proper_pop,
metroarea_pop,
city_proper_pop / metroarea_pop * 100 AS city_perc
FROM cities
-- Use subquery to filter city name
WHERE name IN
(SELECT capital
FROM countries
WHERE continent LIKE '%Europe%' OR continent LIKE '%America%')
-- Add filter condition such that metroarea_pop does not have null values
AND metroarea_pop IS NOT NULL
-- Sort and limit the result
ORDER BY city_perc;'''
sqldf(ci,env=None)

Unnamed: 0,name,country_code,city_proper_pop,metroarea_pop,city_perc
0,Paris,FRA,2229621,10601122.0,21.031934
1,Buenos Aires,ARG,3054300,14122000.0,21.627956
2,Mexico City,MEX,8974724,20063000.0,44.732712
3,Guatemala City,GTM,2110100,4500000.0,46.891111
4,Warsaw,POL,1753977,3100844.0,56.564503
5,Quito,ECU,2671191,4700000.0,56.833851
6,Berlin,DEU,3517424,5871022.0,59.911613
7,Budapest,HUN,1759407,2927944.0,60.090186
8,London,GBR,8673713,13879757.0,62.491822
9,Brasilia,BRA,2556149,3919864.0,65.210145


In [19]:
# Select fields from cities
d = '''SELECT name,
country_code, 
city_proper_pop,
metroarea_pop,
city_proper_pop / metroarea_pop * 100 AS city_perc
FROM cities
-- Use subquery to filter city name
WHERE name IN
(SELECT capital
FROM countries
WHERE (continent = 'Europe'
 OR continent LIKE '%America'))
-- Add filter condition such that metroarea_pop does not have null values
AND metroarea_pop IS NOT NULL
-- Sort and limit the result
ORDER BY city_perc DESC
LIMIT 10;'''
sqldf(d,env=None)

Unnamed: 0,name,country_code,city_proper_pop,metroarea_pop,city_perc
0,Lima,PER,8852000,10750000.0,82.344186
1,Bogota,COL,7878783,9800000.0,80.395745
2,Moscow,RUS,12197596,16170000.0,75.433494
3,Vienna,AUT,1863881,2600000.0,71.687731
4,Montevideo,URY,1305082,1947604.0,67.009618
5,Caracas,VEN,1943901,2923959.0,66.481815
6,Rome,ITA,2877215,4353775.0,66.085523
7,Brasilia,BRA,2556149,3919864.0,65.210145
8,London,GBR,8673713,13879757.0,62.491822
9,Budapest,HUN,1759407,2927944.0,60.090186


- You've identified that Lima has the highest percentage of people living in the city 'proper', relative to the wider metropolitan population! Nicely done getting to the top of the summit.