In [2]:
import numpy as np
import pandas as pd

import sqlite3

**Database: Chinook**

In [3]:
# Kết nối đến CSDL và tạo cursor
conn = sqlite3.connect('database/Chinook_Sqlite.sqlite')
cursor = conn.cursor()

# 1. Gom nhóm

## 1.1. Mệnh đề `GROUP BY`
(Lưu ý: `GROUP BY` cần kết hợp với `MAX`, `MIN`, `SUM`, `COUNT`, `AVG`)

In [227]:
# Kiểm tra dữ liệu bảng "track"
query = """
        SELECT *
        FROM track
        LIMIT (5)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,TrackId,Name,AlbumId,MediaTypeId,GenreId,Composer,Milliseconds,Bytes,UnitPrice
0,1,For Those About To Rock (We Salute You),1,1,1,"Angus Young, Malcolm Young, Brian Johnson",343719,11170334,0.99
1,2,Balls to the Wall,2,2,1,,342562,5510424,0.99
2,3,Fast As a Shark,3,2,1,"F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho...",230619,3990994,0.99
3,4,Restless and Wild,3,2,1,"F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D...",252051,4331779,0.99
4,5,Princess of the Dawn,3,2,1,Deaffy & R.A. Smith-Diesel,375418,6290521,0.99


In [155]:
# Gom nhóm bảng track theo AlbumID 
query = """
        SELECT *
        FROM track
        GROUP BY albumid
        LIMIT (5)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,TrackId,Name,AlbumId,MediaTypeId,GenreId,Composer,Milliseconds,Bytes,UnitPrice
0,1,For Those About To Rock (We Salute You),1,1,1,"Angus Young, Malcolm Young, Brian Johnson",343719,11170334,0.99
1,2,Balls to the Wall,2,2,1,,342562,5510424,0.99
2,3,Fast As a Shark,3,2,1,"F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho...",230619,3990994,0.99
3,15,Go Down,4,1,1,AC/DC,331180,10847611,0.99
4,23,Walk On Water,5,1,1,"Steven Tyler, Joe Perry, Jack Blades, Tommy Shaw",295680,9719579,0.99


**Mỗi album có bao nhiêu track?**

In [133]:
query = """
        SELECT albumid, count(trackid) as count
        FROM track
        GROUP BY AlbumId        
        ORDER BY count DESC
        LIMIT (10)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,AlbumId,count
0,141,57
1,23,34
2,73,30
3,229,26
4,230,25
5,251,25
6,83,24
7,231,24
8,253,24
9,24,23


**Group theo nhiều cột: mỗi loại media có bao nhiêu genre, trong đó có bao nhiêu track?**

In [190]:
query = """
        SELECT MediaTypeId, GenreId, count(trackid) as count
        FROM track
        GROUP BY MediaTypeId, GenreId        
        ORDER BY MediaTypeId ASC
        --LIMIT (10)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,MediaTypeId,GenreId,count
0,1,1,1211
1,1,2,127
2,1,3,374
3,1,4,332
4,1,5,12
5,1,6,81
6,1,7,578
7,1,8,58
8,1,9,14
9,1,10,42


**Thời gian trung bình (theo milisecond) của các track trong mỗi album**

In [228]:
query = """
        SELECT albumid, AVG(Milliseconds) as AVG_TIME
        FROM track
        GROUP BY AlbumId        
        ORDER BY AVG_TIME DESC
        LIMIT (10)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,AlbumId,AVG_TIME
0,253,2925574.0
1,227,2778265.0
2,229,2717907.0
3,231,2637068.0
4,226,2622250.0
5,228,2599142.0
6,230,2594197.0
7,254,2484567.0
8,261,2321673.0
9,251,1532684.0


**Tổng dung lượng của các track trong từng album**

In [229]:
query = """
        SELECT albumid, SUM(Bytes) as sum_bytes, MIN(Bytes), MAX(bytes)
        FROM track
        GROUP BY AlbumId        
        ORDER BY sum_bytes DESC
        LIMIT (10)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,AlbumId,sum_bytes,MIN(Bytes),MAX(bytes)
0,229,13917603291,486675063,1059546140
1,253,12872621850,487899692,587051735
2,231,12344960921,457364940,574325829
3,228,11781321607,475996611,549353481
4,227,10059916535,462818231,1054423946
5,261,7708725642,20831818,526865050
6,251,7652731262,245378749,515301752
7,250,5711964665,244626927,327642458
8,230,5280909854,183867185,228896396
9,249,1610359572,257879716,290482361


## 1.2. Kết hợp `GROUP BY` và `HAVING`

**Chỉ lọc ra các track có GenreId là 2 hoặc MediaTypeId là 2**

In [230]:
query = """
        SELECT albumid, SUM(Bytes) as sum_bytes, MIN(Bytes), MAX(bytes)
        FROM track
        GROUP BY AlbumId
        --HAVING GenreId=2
        HAVING MediaTypeId=2
        ORDER BY sum_bytes DESC
        LIMIT (10)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,AlbumId,sum_bytes,MIN(Bytes),MAX(bytes)
0,255,83470311,2447453,4656660
1,94,79242814,4383764,9367328
2,91,77516317,2550030,10201342
3,256,67857091,3071042,8858616
4,257,57769571,3482099,6491444
5,269,56198102,3892066,11157785
6,270,54019835,3355715,4606408
7,90,53625668,3301971,6687123
8,322,49649864,2190831,10564704
9,321,40936903,2304465,6670600


**Chỉ lọc ra các album có tổng thời gian nằm trong khoảng 500k - 1000k milisec**

In [231]:
query = """
        SELECT albumid, AVG(Milliseconds) as AVG_TIME, SUM(Milliseconds) as SUM_TIME
        FROM track
        GROUP BY AlbumId
        HAVING SUM(Milliseconds) BETWEEN 500000 and 1000000
        ORDER BY AVG_TIME DESC
        LIMIT (10)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,AlbumId,AVG_TIME,SUM_TIME
0,294,596519.0,596519
1,279,582029.0,582029
2,330,567494.0,567494
3,312,561967.0,561967
4,301,560342.0,560342
5,299,545203.0,545203
6,311,526696.0,526696
7,292,522099.0,522099
8,273,501503.0,501503
9,173,347972.0,695944


# 2. Truy vấn dữ liệu từ nhiều tables

## 2.1. Các mệnh đề JOIN

**(1) Mỗi album là của artist nào?**

In [233]:
# Thử với JOIN, INNER JOIN, CROSS JOIN, LEFT JOIN
query = """
        SELECT title, name
        FROM album
        JOIN artist
            ON album.artistid = artist.artistid
        LIMIT(10)
"""
cursor.execute(query)
cursor.fetchall()

[('For Those About To Rock We Salute You', 'AC/DC'),
 ('Balls to the Wall', 'Accept'),
 ('Restless and Wild', 'Accept'),
 ('Let There Be Rock', 'AC/DC'),
 ('Big Ones', 'Aerosmith'),
 ('Jagged Little Pill', 'Alanis Morissette'),
 ('Facelift', 'Alice In Chains'),
 ('Warner 25 Anos', 'Antônio Carlos Jobim'),
 ('Plays Metallica By Four Cellos', 'Apocalyptica'),
 ('Audioslave', 'Audioslave')]

In [56]:
query = """
        SELECT title as 'Album Title', name as 'Artist Name'
        FROM album
        JOIN artist
            ON album.artistid = artist.artistid
        --LIMIT(10)
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,Album Title,Artist Name
0,For Those About To Rock We Salute You,AC/DC
1,Balls to the Wall,Accept
2,Restless and Wild,Accept
3,Let There Be Rock,AC/DC
4,Big Ones,Aerosmith
...,...,...
342,Respighi:Pines of Rome,Eugene Ormandy
343,Schubert: The Late String Quartets & String Qu...,Emerson String Quartet
344,Monteverdi: L'Orfeo,"C. Monteverdi, Nigel Rogers - Chiaroscuro; Lon..."
345,Mozart: Chamber Music,Nash Ensemble


**Kiểm tra lại số lượng records của 2 bảng Album và Artist**

In [29]:
query = """
        SELECT COUNT(*)
        FROM album
"""
cursor.execute(query)
cursor.fetchone()

(347,)

In [48]:
query = """
        SELECT COUNT(*)
        FROM artist
"""
cursor.execute(query)
cursor.fetchone()

(275,)

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 347 entries, 0 to 346
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Album Title  347 non-null    object
 1   Artist Name  347 non-null    object
dtypes: object(2)
memory usage: 5.5+ KB


**==> Có những artist có vài album**

In [76]:
df.groupby('Artist Name').count().size

204

**==> Có tổng cộng 275 artist, nhưng ở đây chỉ có 204 artist có album**

**==> Có 71 artist không có album nào**

In [73]:
df.groupby('Artist Name').count().nlargest(columns='Album Title', n = 5)

Unnamed: 0_level_0,Album Title
Artist Name,Unnamed: 1_level_1
Iron Maiden,21
Led Zeppelin,14
Deep Purple,11
Metallica,10
U2,10


**(2) Mỗi artist có những albums nào?**

In [234]:
query = """
        SELECT ar.name AS 'Artist Name', al.title AS 'Album Title'
        FROM Artist ar
        CROSS JOIN Album al 
            USING(artistid)
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,Artist Name,Album Title
0,AC/DC,For Those About To Rock We Salute You
1,AC/DC,Let There Be Rock
2,Accept,Balls to the Wall
3,Accept,Restless and Wild
4,Aerosmith,Big Ones
...,...,...
342,"Mela Tenenbaum, Pro Musica Prague & Richard Kapp","Locatelli: Concertos for Violin, Strings and C..."
343,Emerson String Quartet,Schubert: The Late String Quartets & String Qu...
344,"C. Monteverdi, Nigel Rogers - Chiaroscuro; Lon...",Monteverdi: L'Orfeo
345,Nash Ensemble,Mozart: Chamber Music


In [235]:
df.groupby('Artist Name').count().sort_values('Album Title')

Unnamed: 0_level_0,Album Title
Artist Name,Unnamed: 1_level_1
João Suplicy,1
Marcos Valle,1
Marillion,1
Marisa Monte,1
Martin Roscoe,1
...,...
U2,10
Metallica,10
Deep Purple,11
Led Zeppelin,14


**Dùng mệnh đề LEFT JOIN**

In [236]:
query = """
        SELECT ar.name AS 'Artist Name', al.title AS 'Album Title'
        FROM Artist ar
        LEFT JOIN Album al
            ON ar.artistid = al.artistid
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,Artist Name,Album Title
0,AC/DC,For Those About To Rock We Salute You
1,AC/DC,Let There Be Rock
2,Accept,Balls to the Wall
3,Accept,Restless and Wild
4,Aerosmith,Big Ones
...,...,...
413,"Mela Tenenbaum, Pro Musica Prague & Richard Kapp","Locatelli: Concertos for Violin, Strings and C..."
414,Emerson String Quartet,Schubert: The Late String Quartets & String Qu...
415,"C. Monteverdi, Nigel Rogers - Chiaroscuro; Lon...",Monteverdi: L'Orfeo
416,Nash Ensemble,Mozart: Chamber Music


In [106]:
df.groupby('Artist Name').count().sort_values('Album Title')

Unnamed: 0_level_0,Album Title
Artist Name,Unnamed: 1_level_1
A Cor Do Som,0
Pedro Luís & A Parede,0
Pedro Luís E A Parede,0
Peter Tosh,0
"Edson, DJ Marky & DJ Patife Featuring Fernanda Porto",0
...,...
Metallica,10
U2,10
Deep Purple,11
Led Zeppelin,14


**==> Lấy ra được các Artist không có Album nào**

## 2.2. Kết hợp các lệnh khác trong SQL

**(1) Kết hợp ORDER BY**

In [237]:
query = """
        SELECT ar.name AS 'Artist Name', al.title AS 'Album Title'
        FROM Artist ar
        LEFT JOIN Album al
            ON ar.artistid = al.artistid
        ORDER BY ar.name DESC
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,Artist Name,Album Title
0,Zeca Pagodinho,Ao Vivo [IMPORT]
1,Youssou N'Dour,
2,Yo-Yo Ma,Bach: The Cello Suites
3,Yehudi Menuhin,Bartok: Violin & Viola Concertos
4,Xis,
...,...,...
413,Aaron Goldberg,Worlds
414,Aaron Copland & London Symphony Orchestra,"A Copland Celebration, Vol. I"
415,AC/DC,For Those About To Rock We Salute You
416,AC/DC,Let There Be Rock


**(2) Kết hợp GROUP BY**

**Mỗi artist có bao nhiêu album? 10 artist có số album nhiều nhất?**

In [221]:
# Cách 1: Dùng lệnh count trực tiếp
query = """
        SELECT ar.name AS 'Artist Name', count(al.artistid) as NoAlbums
        FROM Artist ar
        INNER JOIN Album al
            ON ar.artistid = al.artistid
        GROUP BY ar.name
        ORDER BY NoAlbums DESC
        --LIMIT (10)
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,Artist Name,NoAlbums
0,Iron Maiden,21
1,Led Zeppelin,14
2,Deep Purple,11
3,U2,10
4,Metallica,10
...,...,...
199,"Academy of St. Martin in the Fields, John Birc...",1
200,Academy of St. Martin in the Fields Chamber En...,1
201,Academy of St. Martin in the Fields & Sir Nevi...,1
202,Aaron Goldberg,1


In [154]:
# Cách 2: Lồng ghép kết quả SELECT vào 1 SELECT khác
query = """
        SELECT ar.name, ar1.count
        FROM
            (SELECT al.artistid, count(al.albumid) as count
            FROM Album al
            GROUP BY al.artistid
            ORDER BY count DESC) ar1
        JOIN artist ar
            ON ar.artistid = ar1.artistid
        LIMIT(10)
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,Name,count
0,Iron Maiden,21
1,Led Zeppelin,14
2,Deep Purple,11
3,Metallica,10
4,U2,10
5,Ozzy Osbourne,6
6,Pearl Jam,5
7,Various Artists,4
8,Faith No More,4
9,Foo Fighters,4


**Tên của 10 album có số track nhiều nhất?**

In [166]:
query = """
        SELECT album.title, count
        FROM
            (SELECT albumid, count(trackid) as count
            FROM track
            GROUP BY AlbumId        
            ORDER BY count DESC
            LIMIT (10)) AS tr
        JOIN album
            ON album.albumid = tr.albumid
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,Title,count
0,Greatest Hits,57
1,Minha Historia,34
2,Unplugged,30
3,"Lost, Season 3",26
4,"Lost, Season 1",25
5,"The Office, Season 3",25
6,My Way: The Best Of Frank Sinatra [Disc 1],24
7,"Lost, Season 2",24
8,"Battlestar Galactica (Classic), Season 1",24
9,Afrociberdelia,23


## 2.3. Các mệnh đề UNION

In [226]:
query = """
        SELECT LastName, FirstName, 'Employee' AS Type
        FROM employee
        --LIMIT (5)
        UNION
        SELECT LastName, FirstName, 'Customer'
        FROM Customer
        --LIMIT (5)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,LastName,FirstName,Type
0,Adams,Andrew,Employee
1,Almeida,Roberto,Customer
2,Barnett,Julia,Customer
3,Bernard,Camille,Customer
4,Brooks,Michelle,Customer
...,...,...,...
62,Tremblay,François,Customer
63,Van der Berg,Johannes,Customer
64,Wichterlová,František,Customer
65,Wójcik,Stanisław,Customer


# 3. Demo: NoSQL

**Tạo một database sử dụng kiểu `dict`**

In [177]:
db = {'lop': {'id': 7, 'ten': 'Data Science',
              'giang-vien': {'id':5, 'ten': 'Khai', 'tuoi': 33},
              'danh-sach':[{'hoc-vien': {'id':3, 'ten': 'Nam'}},
                           {'hoc-vien': {'id':5, 'ten': 'Lan'}}
                          ]
             }
     }
db

{'lop': {'id': 7,
  'ten': 'Data Science',
  'giang-vien': {'id': 5, 'ten': 'Khai', 'tuoi': 33},
  'danh-sach': [{'hoc-vien': {'id': 3, 'ten': 'Nam'}},
   {'hoc-vien': {'id': 5, 'ten': 'Lan'}}]}}

**Bản thân CSDL này là một dict với cặp key-value**

In [178]:
db.keys()

dict_keys(['lop'])

In [187]:
db.values()

dict_values([{'id': 7, 'ten': 'Data Science', 'giang-vien': {'id': 5, 'ten': 'Khai', 'tuoi': 33}, 'danh-sach': [{'hoc-vien': {'id': 3, 'ten': 'Nam'}}, {'hoc-vien': {'id': 5, 'ten': 'Lan'}}]}])

In [181]:
db['lop']['danh-sach'][1]

{'hoc-vien': {'id': 5, 'ten': 'Lan'}}

In [212]:
print(db['lop'].get('giang-vien'))

{'id': 5, 'ten': 'Khai', 'tuoi': 33}


In [184]:
print(db['lop'].get('mentor'))

None
