# Grouping Data with SQL

In [2]:
import sqlite3
import pandas as pd
# Connect to the lesson's SQLite file, then display the schema catalog:
# one row per schema object, including the SQL text that created it.
db = sqlite3.connect(
    r"C:\Users\nrmmw\Documents\Flatiron\Repos\Phase_2\dsc-grouping-data-with-sql\data.sqlite"
)
pd.read_sql(sql="""
SELECT *
FROM sqlite_master
""", con=db)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,productlines,productlines,46,"CREATE TABLE `productlines` (`productLine`, `t..."
1,table,offices,offices,32,"CREATE TABLE ""offices"" (\n\t""officeCode""\tINTE..."
2,table,customers,customers,57,"CREATE TABLE ""customers"" (\n\t""customerNumber""..."
3,table,employees,employees,35,"CREATE TABLE ""employees"" (\n\t""employeeNumber""..."
4,table,orderdetails,orderdetails,2,"CREATE TABLE ""orderdetails"" (\n\t""orderNumber""..."
5,table,orders,orders,27,"CREATE TABLE ""orders"" (\n\t""orderNumber""\tINTE..."
6,table,payments,payments,28,"CREATE TABLE ""payments"" (\n\t""customerNumber""\..."
7,table,products,products,4,"CREATE TABLE ""products"" (\n\t""productCode""\tTE..."


In [5]:
# Look at the raw customers table before aggregating anything.
pd.read_sql(
    """
SELECT * FROM customers;
""",
    db,
)

Unnamed: 0,customerNumber,customerName,contactLastName,contactFirstName,phone,addressLine1,addressLine2,city,state,postalCode,country,salesRepEmployeeNumber,creditLimit
0,103,Atelier graphique,Schmitt,Carine,40.32.2555,"54, rue Royale",,Nantes,,44000,France,1370,21000
1,112,Signal Gift Stores,King,Jean,7025551838,8489 Strong St.,,Las Vegas,NV,83030,USA,1166,71800
2,114,"Australian Collectors, Co.",Ferguson,Peter,03 9520 4555,636 St Kilda Road,Level 3,Melbourne,Victoria,3004,Australia,1611,117300
3,119,La Rochelle Gifts,Labrune,Janine,40.67.8555,"67, rue des Cinquante Otages",,Nantes,,44000,France,1370,118200
4,121,Baane Mini Imports,Bergulfsen,Jonas,07-98 9555,Erling Skakkes gate 78,,Stavern,,4110,Norway,1504,81700
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,486,Motor Mint Distributors Inc.,Salazar,Rosa,2155559857,11328 Douglas Av.,,Philadelphia,PA,71270,USA,1323,72600
118,487,Signal Collectibles Ltd.,Taylor,Sue,4155554312,2793 Furth Circle,,Brisbane,CA,94217,USA,1165,60300
119,489,"Double Decker Gift Stores, Ltd",Smith,Thomas,(171) 555-7555,120 Hanover Sq.,,London,,WA1 1DP,UK,1501,43300
120,495,Diecast Collectables,Franco,Valarie,6175552555,6251 Ingle Ln.,,Boston,MA,51003,USA,1188,85100


The `GROUP BY` clause groups records into summary rows and returns one record for each group.

It is typically used together with an aggregate function such as `COUNT`, `AVG`, `SUM`, `MAX`, or `MIN`.

**USES OF `GROUP BY`**
- Count the number of records for each group

In [7]:
# How many customers does each country have?
# One output row per country, largest groups first.
pd.read_sql(sql="""
SELECT country, COUNT(*) AS customers_count
FROM customers
GROUP BY country
ORDER BY customers_count DESC;
""", con=db)

Unnamed: 0,country,customers_count
0,USA,36
1,Germany,13
2,France,12
3,Spain,7
4,UK,5
5,Australia,5
6,New Zealand,4
7,Italy,4
8,Switzerland,3
9,Singapore,3


In [11]:
# Payment summary statistics, one row per customer: how many payments were
# made, plus the smallest, largest, average and total payment amount.
# Customers are listed from the biggest total spend to the smallest.
#
# The selected key is spelled exactly like the GROUP BY key. The previous
# `customerNUmber` spelling only resolved because SQLite compares
# identifiers case-insensitively.
pd.read_sql("""
SELECT
    customerNumber,
    COUNT(*) AS number_of_payments,
    MIN(amount) AS min_purchase,
    MAX(amount) AS max_purchase,
    AVG(amount) AS avg_purchase,
    SUM(amount) AS total_spent
FROM payments
GROUP BY customerNumber
ORDER BY total_spent DESC;
""", db)

Unnamed: 0,customerNumber,number_of_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,141,13,20009.53,120166.58,55056.844615,715738.98
1,124,9,11044.30,111654.40,64909.804444,584188.24
2,114,4,7565.08,82261.22,45146.267500,180585.07
3,151,4,20314.44,58841.35,44478.487500,177913.95
4,148,4,2611.84,105743.00,39062.757500,156251.03
...,...,...,...,...,...,...
93,381,4,1128.20,14379.90,7304.295000,29217.18
94,473,2,7612.06,17746.26,12679.160000,25358.32
95,103,3,1676.14,14571.44,7438.120000,22314.36
96,198,3,5858.56,9658.74,7184.753333,21554.26


`WHERE` can also be used to filter the rows before they are grouped, for example to summarize only the payments made in 2004.

In [13]:
# Same per-customer payment summary, restricted to payments made in 2004.
# WHERE runs before GROUP BY, so only 2004 rows reach the aggregates.
#
# Two fixes relative to the earlier version of this query:
#   * '%Y' is a string literal and therefore uses single quotes. Double
#     quotes denote identifiers in SQL; SQLite only accepts them as strings
#     through a legacy fallback that can be disabled.
#   * the selected key is spelled `customerNumber`, matching GROUP BY.
pd.read_sql("""
SELECT
    customerNumber,
    COUNT(*) AS number_of_payments,
    MIN(amount) AS min_purchase,
    MAX(amount) AS max_purchase,
    AVG(amount) AS avg_purchase,
    SUM(amount) AS total_spent
FROM payments
WHERE strftime('%Y', paymentDate) = '2004'
GROUP BY customerNumber
ORDER BY total_spent DESC;
""", db)

Unnamed: 0,customerNumber,number_of_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,141,6,20009.53,116208.40,48960.918333,293765.51
1,124,4,43369.30,85410.87,57890.632500,231562.53
2,114,2,44894.74,82261.22,63577.980000,127155.96
3,298,2,47375.92,61402.00,54388.960000,108777.92
4,166,3,22474.17,44160.92,35140.190000,105420.57
...,...,...,...,...,...,...
83,489,1,7310.42,7310.42,7310.420000,7310.42
84,495,1,6276.60,6276.60,6276.600000,6276.60
85,484,1,3474.66,3474.66,3474.660000,3474.66
86,148,1,2611.84,2611.84,2611.840000,2611.84


## The `HAVING` clause

The `HAVING` clause can be used to filter our aggregated views.

It works like the `WHERE` clause, except that its conditions are applied to the groups produced by `GROUP BY` (after aggregation) rather than to the individual rows.

Let us select aggregated payment info only for customers whose average payment amount is over 50K.

`WHERE` and `HAVING` can also be used in the same query

In [17]:
# Aggregate payments per customer, then keep only the groups whose
# average payment is above 50000 (HAVING filters after aggregation).
pd.read_sql(sql="""
SELECT
    customerNumber,
    COUNT(*) AS num_of_purchases,
    MAX(amount) AS max_purchase,
    MIN(amount) AS min_purchase,
    AVG(amount) AS avg_purchase,
    SUM(amount) AS total_spent
FROM payments
GROUP BY customerNumber
HAVING avg_purchase > 50000;
""", con=db)

Unnamed: 0,customerNumber,num_of_purchases,max_purchase,min_purchase,avg_purchase,total_spent
0,124,9,111654.4,11044.3,64909.804444,584188.24
1,141,13,120166.58,20009.53,55056.844615,715738.98
2,239,1,80375.24,80375.24,80375.24,80375.24
3,298,2,61402.0,47375.92,54388.96,108777.92
4,321,2,85559.12,46781.66,66170.39,132340.78
5,450,1,59551.38,59551.38,59551.38,59551.38


Let's say we want to filter based on customers who have made **at least 2 purchases of over 50000 each**.

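This is practice in combining both clauses: `WHERE` keeps only the payments over 50000 before grouping, and `HAVING` then keeps only the customers with at least two such payments.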

In [20]:
# Row filter first (payments above 50000), group filter second
# (customers left with two or more of those payments).
pd.read_sql(
    """
SELECT
    customerNumber,
    COUNT(*) AS num_payments,
    MIN(amount) AS min_purchase,
    MAX(amount) AS max_purchase,
    AVG(amount) AS avg_purchase,
    SUM(amount) AS total_spent
FROM payments
WHERE amount > 50000
GROUP BY customerNumber
HAVING num_payments >= 2
""",
    db,
)

Unnamed: 0,customerNumber,num_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,124,5,55639.66,111654.4,87509.512,437547.56
1,141,5,59830.55,120166.58,85024.068,425120.34
2,151,2,58793.53,58841.35,58817.44,117634.88
3,363,2,50799.69,55425.77,53112.73,106225.46


In [21]:
# Done with the lesson database: close its connection before `db` is
# rebound to a different file in the lab section.
db.close()

# Grouping Data Lab

In [42]:
# Point `db` at the lab's database file and display the whole
# babe_ruth_stats table to see which columns are available.
db = sqlite3.connect(
    r"C:\Users\nrmmw\Documents\Flatiron\Repos\Phase_2\dsc-grouping-data-with-sql-lab\babe_ruth.db"
)
pd.read_sql(sql="""
SELECT *
FROM babe_ruth_stats;
""", con=db)

Unnamed: 0,id,year,team,league,doubles,triples,hits,HR,games,runs,RBI,at_bats,BB,SB,SO,AVG
0,1,1914,BOS,AL,1,0,2,0,5,1,2,10,0,0,4,0.2
1,2,1915,BOS,AL,10,1,29,4,42,16,21,92,9,0,23,0.315
2,3,1916,BOS,AL,5,3,37,3,67,18,15,136,10,0,23,0.272
3,4,1917,BOS,AL,6,3,40,2,52,14,12,123,12,0,18,0.325
4,5,1918,BOS,AL,26,11,95,11,95,50,66,317,58,6,58,0.3
5,6,1919,BOS,AL,34,12,139,29,130,103,114,432,101,7,58,0.322
6,7,1920,NY,AL,36,9,172,54,142,158,137,458,150,14,80,0.376
7,8,1921,NY,AL,44,16,204,59,152,177,171,540,145,17,81,0.378
8,9,1922,NY,AL,24,8,128,35,110,94,99,406,84,2,80,0.315
9,10,1923,NY,AL,45,13,205,41,152,151,131,522,170,17,93,0.393


In [25]:
# Total number of years Babe Ruth played professional baseball,
# obtained by counting the rows of the table.
pd.read_sql(
    """
SELECT COUNT(*) AS total_seasons_played
FROM babe_ruth_stats;
""",
    db,
)

Unnamed: 0,total_seasons_played
0,22


In [34]:
# Total number of years Babe Ruth played with the NY Yankees
# (i.e. rows where the team value is 'NY').
#
# 'NY' is a string literal, so it uses single quotes; in SQL, double quotes
# denote identifiers and SQLite only treats them as strings as a legacy
# fallback. `team` is also listed in GROUP BY so it is no longer a bare
# column next to an aggregate, which standard SQL rejects.
pd.read_sql("""
SELECT team, COUNT(*) AS years_played
FROM babe_ruth_stats
WHERE team = 'NY'
GROUP BY team;
""", db)

Unnamed: 0,team,years_played
0,NY,15


In [35]:
# Return the row for the season in which Babe Ruth hit the most HR.
#
# Sorting by HR and keeping the first row selects that season explicitly.
# The earlier `SELECT *, MAX(HR)` form depended on a SQLite-specific rule
# that fills non-aggregated columns from the row holding the maximum; other
# databases reject or mis-evaluate that query. `most_hr` is kept as an extra
# column so the result has the same shape as before, and `year` breaks ties
# deterministically.
pd.read_sql("""
SELECT *, HR AS most_hr
FROM babe_ruth_stats
ORDER BY HR DESC, year ASC
LIMIT 1;
""", db)

Unnamed: 0,id,year,team,league,doubles,triples,hits,HR,games,runs,RBI,at_bats,BB,SB,SO,AVG,most_hr
0,14,1927,NY,AL,29,8,192,60,151,158,164,540,137,7,89,0.356,60


In [36]:
# Select the row for the season with the fewest HR.
#
# Sorting by HR ascending and keeping the first row selects that season
# explicitly. The earlier `SELECT *, MIN(HR)` form depended on a
# SQLite-specific rule for non-aggregated columns beside MIN()/MAX() and is
# not portable. `least_hr` is kept so the result has the same columns as
# before, and `year` breaks ties deterministically.
pd.read_sql("""
SELECT *, HR AS least_hr
FROM babe_ruth_stats
ORDER BY HR ASC, year ASC
LIMIT 1;
""", db)

Unnamed: 0,id,year,team,league,doubles,triples,hits,HR,games,runs,RBI,at_bats,BB,SB,SO,AVG,least_hr
0,1,1914,BOS,AL,1,0,2,0,5,1,2,10,0,0,4,0.2,0


In [37]:
# Career home-run total: sum the HR column over every season.
pd.read_sql(sql="""
SELECT SUM(HR) AS total_hr
FROM babe_ruth_stats;
""", con=db)

Unnamed: 0,total_hr
0,714


In [50]:
# Five worst HR seasons with at least 100 games played,
# excluding any season in which he hit 0 home runs.
#
# "At least 100" includes a season of exactly 100 games, so the filter is
# `games >= 100`; the previous `games > 100` would have dropped such a season.
pd.read_sql("""
SELECT *
FROM babe_ruth_stats
WHERE games >= 100 AND HR != 0
ORDER BY HR ASC
LIMIT 5;
""", db)

Unnamed: 0,id,year,team,league,doubles,triples,hits,HR,games,runs,RBI,at_bats,BB,SB,SO,AVG
0,21,1934,NY,AL,17,4,105,22,125,78,84,365,104,1,63,0.288
1,6,1919,BOS,AL,34,12,139,29,130,103,114,432,101,7,58,0.322
2,20,1933,NY,AL,21,3,138,34,137,97,103,459,114,4,90,0.301
3,9,1922,NY,AL,24,8,128,35,110,94,99,406,84,2,80,0.315
4,10,1923,NY,AL,45,13,205,41,152,151,131,522,170,17,93,0.393


In [52]:
# Average of the per-season batting averages (the AVG column).
# Without an alias the header would read AVG(AVG), which is confusing,
# so the result column is named career_average.
pd.read_sql(
    """
SELECT AVG(AVG) AS career_average
FROM babe_ruth_stats;
""",
    db,
)

Unnamed: 0,career_average
0,0.322864


In [55]:
# Years with over 300 times on base, where times on base = hits + BB.
#
# The filter repeats the expression instead of referencing the `on_base`
# alias: standard SQL evaluates WHERE before the SELECT list, and accepting
# the alias there is a SQLite extension. The column is also written `BB`,
# matching its spelling in the table.
pd.read_sql("""
SELECT year, hits + BB AS on_base
FROM babe_ruth_stats
WHERE hits + BB > 300;
""", db)

Unnamed: 0,year,on_base
0,1920,322
1,1921,349
2,1923,375
3,1924,342
4,1926,328
5,1927,329
6,1928,310
7,1930,322
8,1931,327


In [61]:
# For each team: how many seasons were played and how many hits were made.
pd.read_sql(sql="""
SELECT team,
    COUNT(year) AS num_seasons,
    SUM(hits) AS total_hits
FROM babe_ruth_stats
GROUP BY team;
""", con=db)

Unnamed: 0,team,num_seasons,total_hits
0,BOS,7,355
1,NY,15,2518


In [62]:
# Same per-team summary, keeping only teams with more than 10 seasons.
# The condition is on an aggregate, so it belongs in HAVING.
pd.read_sql(
    """
SELECT team,
    COUNT(year) AS num_seasons,
    SUM(hits) AS total_hits
FROM babe_ruth_stats
GROUP BY team
HAVING num_seasons > 10;
""",
    db,
)

Unnamed: 0,team,num_seasons,total_hits
0,NY,15,2518


In [68]:
# Which team has the highest average at-bats per season?
# Sort the per-team averages in descending order and keep the top row.
pd.read_sql(sql="""
SELECT team, 
    AVG(at_bats) AS average_at_bats
FROM babe_ruth_stats
GROUP BY team
ORDER BY average_at_bats DESC
LIMIT 1;
""", con=db)

Unnamed: 0,team,average_at_bats
0,NY,481.133333


In [72]:
# Teams whose average at-bats per season is above 100, highest first.
pd.read_sql(
    """
SELECT team, 
    AVG(at_bats) AS average_at_bats
FROM babe_ruth_stats
GROUP BY team
HAVING average_at_bats > 100
ORDER BY average_at_bats DESC;
""",
    db,
)

Unnamed: 0,team,average_at_bats
0,NY,481.133333
1,BOS,168.857143
