In [1]:
import numpy as np
import pandas as pd
import pyodbc
import sqlalchemy
import sqlite3

# Load the `sql` IPython extension, which registers the `%sql` magic used in the next cell.
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [2]:
# Point the %sql magic at SQLite using a bare `sqlite://` URL (no database file is named).
# NOTE(review): the pandas queries in the cells below go through `conn`, not this
# connection - confirm whether it is still needed.
%sql sqlite://

'Connected: @None'

## 1. SELECT Single & multiple columns

While SQL can be used to create and modify databases, the focus of this course will be querying databases. A query is a request for data from a database table (or combination of tables). Querying is an essential skill for a data scientist, since the data you need for your analyses will often live in databases.

In [3]:
# NOTE(review): check_output is not used by any cell visible in this notebook;
# the import is kept in case a later cell relies on it.
from subprocess import check_output

# Location of the sample SQLite database. Keeping it in one named constant means
# there is a single place to edit when the notebook runs somewhere the
# /kaggle/input directory does not exist.
DB_PATH = '/kaggle/input/sqlite-sample/SQL_test.sqlite'

# Shared connection that every pd.read_sql(..., con=conn) call below reuses.
conn = sqlite3.connect(DB_PATH)

# Cursor for issuing raw SQL through the connection. The name is a misspelling
# of "execute"; it is left unchanged so any cell that refers to it keeps working.
exectue = conn.cursor()

### 1.1. Selecting single column

In `SQL`, we can select data from a table using a **SELECT** `statement`. 

For example, the following query selects the `A09_village` column from the `Farms` table:

In [4]:
# Query text: request only the A09_village column from the Farms table.
single_col_query = """
    SELECT A09_village 
    FROM Farms
    """

# Run the query over the shared connection and load the result into a DataFrame.
SQL_farms_single_col = pd.read_sql(single_col_query, con=conn)
SQL_farms_single_col

Unnamed: 0,A09_village
0,God
1,God
2,God
3,God
4,God
...,...
345,Ndedja
346,Ndedja
347,Ndedja
348,Ndedja


### 1.2. Selecting multiple columns

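In the real world, you will often want to select multiple columns. Luckily, SQL makes this really easy: to select multiple columns from a table, simply separate the column names with commas. For example, the following query selects five columns (`Id`, `A06_province`, `A07_district`, `G02_months_lack_food` and `A01_interview_date`) from the `Farms` table: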

In [5]:
# Several columns are requested at once by separating their names with commas.
# The result gets its own name so that the single-column DataFrame built in the
# previous example is not silently overwritten when this cell runs.
SQL_farms_multi_cols = pd.read_sql(
    """
    SELECT Id, A06_province, A07_district, G02_months_lack_food, A01_interview_date
    FROM Farms
    """, con = conn)
SQL_farms_multi_cols

Unnamed: 0,Id,A06_province,A07_district,G02_months_lack_food,A01_interview_date
0,1,Manica,Manica,['Jan'],17/11/2016
1,2,Manica,Manica,"['Jan', 'Sept', 'Oct', 'Nov', 'Dec']",17/11/2016
2,3,Manica,Manica,"['Jan', 'Feb', 'Mar', 'Oct', 'Nov', 'Dec']",17/11/2016
3,4,Manica,Manica,"['Sept', 'Oct', 'Nov', 'Dec']",17/11/2016
4,5,Manica,Manica,"['Aug', 'Sept', 'Oct', 'Nov']",17/11/2016
...,...,...,...,...,...
345,346,Sofala,Nhamatanda,"['Jan', 'Feb', 'Mar', 'Dec']",22/08/2017
346,347,Sofala,Nhamatanda,"['Jan', 'Feb', 'Mar', 'Dec']",22/08/2017
347,348,Sofala,Nhamatanda,['none'],22/08/2017
348,349,Sofala,Nhamatanda,"['Jan', 'Feb', 'Mar', 'Dec']",22/08/2017


## 2. SELECTING ALL COLUMNS
Sometimes, you may want to select all columns from a table. Typing out every column name would be a pain, so there's a handy shortcut: `SELECT *`.

For example, this query selects every column from the `Farms` table:

### 2.1. All columns & all rows

In [6]:
# `*` stands for every column; the query has no WHERE or LIMIT clause, so it
# does not restrict the rows either.
all_cols_query = """
    SELECT * FROM Farms
    """

SQL_farms_all_cols = pd.read_sql(all_cols_query, con=conn)
SQL_farms_all_cols

Unnamed: 0,Id,Country,A01_interview_date,A03_quest_no,A04_start,A05_end,A06_province,A07_district,A08_ward,A09_village,...,F13_du_look_aftr_cows,F14_items_owned,G01_no_meals,G02_months_lack_food,G03_no_food_mitigation,gps:Latitude,gps:Longitude,gps:Altitude,gps:Accuracy,instanceID
0,1,Moz,17/11/2016,1,2017-03-23T09:49:57.000Z,2017-04-02T17:29:08.000Z,Manica,Manica,Bandula,God,...,no,"['bicycle', 'television', 'solar_panel', 'table']",2,['Jan'],"['na', 'rely_less_food', 'reduce_meals', 'day_...",-19.112259,33.483456,698.0,14.0,uuid:ec241f2c-0609-46ed-b5e8-fe575f6cefef
1,2,Moz,17/11/2016,1,2017-04-02T09:48:16.000Z,2017-04-02T17:26:19.000Z,Manica,Manica,Bandula,God,...,no,"['cow_cart', 'bicycle', 'radio', 'cow_plough',...",2,"['Jan', 'Sept', 'Oct', 'Nov', 'Dec']","['na', 'reduce_meals', 'restrict_adults', 'bor...",-19.112477,33.483416,690.0,19.0,uuid:099de9c9-3e5e-427b-8452-26250e840d6e
2,3,Moz,17/11/2016,3,2017-04-02T14:35:26.000Z,2017-04-02T17:26:53.000Z,Manica,Manica,Bandula,God,...,no,['solar_torch'],2,"['Jan', 'Feb', 'Mar', 'Oct', 'Nov', 'Dec']","['na', 'restrict_adults', 'lab_ex_food']",-19.112108,33.483450,674.0,13.0,uuid:193d7daf-9582-409b-bf09-027dd36f9007
3,4,Moz,17/11/2016,4,2017-04-02T14:55:18.000Z,2017-04-02T17:27:16.000Z,Manica,Manica,Bandula,God,...,no,"['bicycle', 'radio', 'cow_plough', 'solar_pane...",2,"['Sept', 'Oct', 'Nov', 'Dec']","['na', 'reduce_meals', 'restrict_adults', 'lab...",-19.112229,33.483424,679.0,5.0,uuid:148d1105-778a-4755-aa71-281eadd4a973
4,5,Moz,17/11/2016,5,2017-04-02T15:10:35.000Z,2017-04-02T17:27:35.000Z,Manica,Manica,Bandula,God,...,no,"['motorcyle', 'radio', 'cow_plough', 'mobile_p...",2,"['Aug', 'Sept', 'Oct', 'Nov']","['na', 'go_forest', 'migrate']",-19.112217,33.483425,689.0,10.0,uuid:2c867811-9696-4966-9866-f35c3e97d02d
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,346,Moz,22/08/2017,207,2017-08-22T10:35:17.000Z,2017-08-22T10:41:35.000Z,Sofala,Nhamatanda,Lamego,Ndedja,...,no,['radio'],3,"['Jan', 'Feb', 'Mar', 'Dec']","['rely_less_food', 'limit_variety', 'borrow_fo...",-19.110984,33.475643,722.0,117.0,uuid:abdba24f-7bca-4d79-8a01-273547da52f5
346,347,Moz,22/08/2017,209,2017-08-22T10:42:15.000Z,2017-08-23T06:21:54.000Z,Sofala,Nhamatanda,Lamego,Ndedja,...,no,,3,"['Jan', 'Feb', 'Mar', 'Dec']","['rely_less_food', 'reduce_meals', 'borrow_foo...",-19.111314,33.476522,0.0,82.5,uuid:ef587b92-5dc4-47b3-b631-a3b191649066
347,348,Moz,22/08/2017,205,2017-08-23T05:08:13.000Z,2017-08-23T05:27:06.000Z,Sofala,Nhamatanda,Lamego,Ndedja,...,no,"['bicycle', 'radio', 'solar_torch', 'mobile_ph...",3,['none'],"['rely_less_food', 'limit_variety', 'reduce_me...",-19.111314,33.476522,0.0,72.9,uuid:2eb827ea-c94d-4a55-bd64-59bc67459c17
348,349,Moz,22/08/2017,206,2017-08-23T05:39:40.000Z,2017-08-23T05:53:54.000Z,Sofala,Nhamatanda,Lamego,Ndedja,...,no,['radio'],3,"['Jan', 'Feb', 'Mar', 'Dec']","['rely_less_food', 'limit_variety', 'reduce_me...",-19.111402,33.475848,0.0,45.6,uuid:d8dc7800-eb7d-4b8f-84be-4dcddeb55b18


### 2.2. LIMIT rows

If we only want a certain number of rows, we can add the **LIMIT** keyword to cap the number of results returned:

In [7]:
# Same select-everything query as before, with LIMIT capping the result at six rows.
first_six_query = """
    SELECT * FROM Farms
    LIMIT 6;
    """

SQL_crops_6_rows = pd.read_sql(first_six_query, con=conn)
SQL_crops_6_rows

Unnamed: 0,Id,Country,A01_interview_date,A03_quest_no,A04_start,A05_end,A06_province,A07_district,A08_ward,A09_village,...,F13_du_look_aftr_cows,F14_items_owned,G01_no_meals,G02_months_lack_food,G03_no_food_mitigation,gps:Latitude,gps:Longitude,gps:Altitude,gps:Accuracy,instanceID
0,1,Moz,17/11/2016,1,2017-03-23T09:49:57.000Z,2017-04-02T17:29:08.000Z,Manica,Manica,Bandula,God,...,no,"['bicycle', 'television', 'solar_panel', 'table']",2,['Jan'],"['na', 'rely_less_food', 'reduce_meals', 'day_...",-19.112259,33.483456,698.0,14.0,uuid:ec241f2c-0609-46ed-b5e8-fe575f6cefef
1,2,Moz,17/11/2016,1,2017-04-02T09:48:16.000Z,2017-04-02T17:26:19.000Z,Manica,Manica,Bandula,God,...,no,"['cow_cart', 'bicycle', 'radio', 'cow_plough',...",2,"['Jan', 'Sept', 'Oct', 'Nov', 'Dec']","['na', 'reduce_meals', 'restrict_adults', 'bor...",-19.112477,33.483416,690.0,19.0,uuid:099de9c9-3e5e-427b-8452-26250e840d6e
2,3,Moz,17/11/2016,3,2017-04-02T14:35:26.000Z,2017-04-02T17:26:53.000Z,Manica,Manica,Bandula,God,...,no,['solar_torch'],2,"['Jan', 'Feb', 'Mar', 'Oct', 'Nov', 'Dec']","['na', 'restrict_adults', 'lab_ex_food']",-19.112108,33.48345,674.0,13.0,uuid:193d7daf-9582-409b-bf09-027dd36f9007
3,4,Moz,17/11/2016,4,2017-04-02T14:55:18.000Z,2017-04-02T17:27:16.000Z,Manica,Manica,Bandula,God,...,no,"['bicycle', 'radio', 'cow_plough', 'solar_pane...",2,"['Sept', 'Oct', 'Nov', 'Dec']","['na', 'reduce_meals', 'restrict_adults', 'lab...",-19.112229,33.483424,679.0,5.0,uuid:148d1105-778a-4755-aa71-281eadd4a973
4,5,Moz,17/11/2016,5,2017-04-02T15:10:35.000Z,2017-04-02T17:27:35.000Z,Manica,Manica,Bandula,God,...,no,"['motorcyle', 'radio', 'cow_plough', 'mobile_p...",2,"['Aug', 'Sept', 'Oct', 'Nov']","['na', 'go_forest', 'migrate']",-19.112217,33.483425,689.0,10.0,uuid:2c867811-9696-4966-9866-f35c3e97d02d
5,6,Moz,17/11/2016,6,2017-04-02T15:27:25.000Z,2017-04-02T17:28:02.000Z,Manica,Manica,Bandula,God,...,no,,2,"['Aug', 'Sept', 'Oct']","['borrow_food', 'lab_ex_food', 'seek_government']",-19.112196,33.483392,692.0,12.0,uuid:daa56c91-c8e3-44c3-a663-af6a49a2ca70


## 3. SELECT DISTINCT

Often your results will include many duplicate values. If you want to select all the unique values from a column, you can use the DISTINCT keyword.

This might be useful if, for example, you're interested in knowing on which dates the interviews in the `Farms` table took place:


In [8]:
# DISTINCT collapses repeated values, so each interview date is returned once.
distinct_dates_query = """
    SELECT DISTINCT A01_interview_date
    FROM Farms
    """

SQL_crops_distinct = pd.read_sql(distinct_dates_query, con=conn)
SQL_crops_distinct

Unnamed: 0,A01_interview_date
0,17/11/2016
1,16/11/2016
2,16/12/2016
3,21/11/2016
4,24/11/2016
5,18/11/2016
6,23/11/2016
7,25/11/2016
8,28/11/2016
9,26/04/2017


We can see that the `Farms` table has 349 observations, but the `A01_interview_date` column has only 51 distinct values.