In [None]:
# Python extension for interfacing with SQL and better table formatting with Pandas

#!pip install ipython-sql
#!pip install pandas

In [None]:
# Necessary in the Jupyter Notebook to load the SQL extension and connect to the database file, currently using SQLite.
# Also formatting the SQL query outputs into a better format with Pandas

%load_ext sql
%sql sqlite:///chinook.db
%config SqlMagic.autopandas=True

1. Which tracks appeared in the most playlists? how many playlist did they appear in?

In [236]:
%%sql
-- Top 10 Tracks by Playlist Count
SELECT tracks.Name AS "Track Name", COUNT(*) AS "Playlist Count"
FROM playlist_track
JOIN playlists ON playlist_track.PlaylistId = playlists.PlaylistId
JOIN tracks on playlist_track.TrackId = tracks.TrackId
GROUP BY tracks.TrackId
ORDER BY 2 DESC
LIMIT 10;

 * sqlite:///chinook.db
Done.


Unnamed: 0,Track Name,Playlist Count
0,Intoitus: Adorate Deum,5
1,"Miserere mei, Deus",5
2,"Aria Mit 30 Veränderungen, BWV 988 ""Goldberg V...",5
3,"Suite for Solo Cello No. 1 in G Major, BWV 100...",5
4,"The Messiah: Behold, I Tell You a Mystery... T...",5
5,Solomon HWV 67: The Arrival of the Queen of Sheba,5
6,Symphony No.5 in C Minor: I. Allegro con brio,5
7,Ave Maria,5
8,"Nabucco: Chorus, ""Va, Pensiero, Sull'ali Dorate""",5
9,Die Walküre: The Ride of the Valkyries,5


2. Which track generated the most revenue? which album? which genre?

In [237]:
%%sql
-- Checking if Quantity is ever greater than 1
SELECT *
FROM invoice_items
ORDER BY Quantity DESC
LIMIT 5;

 * sqlite:///chinook.db
Done.


Unnamed: 0,InvoiceLineId,InvoiceId,TrackId,UnitPrice,Quantity
0,1,1,2,0.99,1
1,2,1,4,0.99,1
2,3,2,6,0.99,1
3,4,2,8,0.99,1
4,5,2,10,0.99,1


We can see that each invoice can have multiple different tracks. The Quantity column shows that each track could be purchased multiple times per invoice but it never is greater than 1. We can also make sure that are no times where an InvoiceId, TrackId combo shows up more than once. This is done below. This is so that we can know for certain that each track is only in an invoice at most once.

In [238]:
%%sql
-- Checking if InvoiceId/TrackId is a composite key
SELECT InvoiceId, TrackId, COUNT(*) AS count
FROM invoice_items
GROUP BY InvoiceId, TrackId
HAVING COUNT(*) > 1;

 * sqlite:///chinook.db
Done.


In [239]:
%%sql
--Top 10 Tracks by Revenue
SELECT tracks.Name AS "Track Name", SUM(invoice_items.UnitPrice) AS "Total Revenue", COUNT(*) AS "Times Sold"
FROM tracks
JOIN invoice_items ON tracks.TrackId = invoice_items.TrackId
GROUP BY invoice_items.TrackId
ORDER By 2 DESC
LIMIT 10;


 * sqlite:///chinook.db
Done.


Unnamed: 0,Track Name,Total Revenue,Times Sold
0,The Woman King,3.98,2
1,The Fix,3.98,2
2,Walkabout,3.98,2
3,Hot Girl,3.98,2
4,Gay Witch Hunt,3.98,2
5,Phyllis's Wedding,3.98,2
6,How to Stop an Exploding Man,3.98,2
7,Pilot,3.98,2
8,Occupation / Precipice,1.99,1
9,"Exodus, Pt. 1",1.99,1


In [240]:
%%sql
-- Top 10 Albums by Revenue
SELECT albums.Title AS "Album Title", SUM(invoice_items.UnitPrice) AS "Total Revenue", COUNT(*) AS "Times Sold"
FROM albums
JOIN tracks ON albums.AlbumId = tracks.AlbumId
JOIN invoice_items ON tracks.TrackId = invoice_items.TrackId
GROUP BY tracks.AlbumId
ORDER BY 2 DESC
LIMIT 10;

 * sqlite:///chinook.db
Done.


Unnamed: 0,Album Title,Total Revenue,Times Sold
0,"Battlestar Galactica (Classic), Season 1",35.82,18
1,"The Office, Season 3",31.84,16
2,Minha Historia,26.73,27
3,"Heroes, Season 1",25.87,13
4,"Lost, Season 2",25.87,13
5,Greatest Hits,25.74,26
6,Unplugged,24.75,25
7,"Battlestar Galactica, Season 3",23.88,12
8,"Lost, Season 3",21.89,11
9,Acústico,21.78,22


In [235]:
%%sql
-- Top 10 Genres by Revenue
SELECT genres.Name AS "Genre Name", SUM(invoice_items.UnitPrice) AS "Total Revenue", COUNT(*) AS "Times Sold"
FROM genres
JOIN tracks ON genres.GenreId = tracks.GenreId
JOIN invoice_items ON tracks.TrackId = invoice_items.TrackId
GROUP BY tracks.GenreId
ORDER BY 2 DESC
LIMIT 10;

 * sqlite:///chinook.db
Done.


Unnamed: 0,Genre Name,Total Revenue,Times Sold
0,Rock,826.65,835
1,Latin,382.14,386
2,Metal,261.36,264
3,Alternative & Punk,241.56,244
4,TV Shows,93.53,47
5,Jazz,79.2,80
6,Blues,60.39,61
7,Drama,57.71,29
8,R&B/Soul,40.59,41
9,Classical,40.59,41


3. Which countries have the highest sales revenue? What percent of total revenue does each country make up?

In [241]:
%%sql
PRAGMA table_info(invoices);

 * sqlite:///chinook.db
Done.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,InvoiceId,INTEGER,1,,1
1,1,CustomerId,INTEGER,1,,0
2,2,InvoiceDate,DATETIME,1,,0
3,3,BillingAddress,NVARCHAR(70),0,,0
4,4,BillingCity,NVARCHAR(40),0,,0
5,5,BillingState,NVARCHAR(40),0,,0
6,6,BillingCountry,NVARCHAR(40),0,,0
7,7,BillingPostalCode,NVARCHAR(10),0,,0
8,8,Total,"NUMERIC(10,2)",1,,0


In [126]:
%%sql
-- Countries by Revenue and % Total
SELECT BillingCountry, SUM(Total) AS "Total Revenue", COUNT(*) AS "Total Sales in Country", ROUND(SUM(Total) / 
    (SELECT SUM(Total) FROM invoices), 3) * 100 AS "Country Revenue %"
FROM invoices
GROUP BY BillingCountry
ORDER BY 2 DESC;

 * sqlite:///chinook.db
Done.


Unnamed: 0,BillingCountry,Total Revenue,Total Sales in Country,Country Revenue %
0,USA,523.06,91,22.5
1,Canada,303.96,56,13.1
2,France,195.1,35,8.4
3,Brazil,190.1,35,8.2
4,Germany,156.48,28,6.7
5,United Kingdom,112.86,21,4.8
6,Czech Republic,90.24,14,3.9
7,Portugal,77.24,14,3.3
8,India,75.26,13,3.2
9,Chile,46.62,7,2.0


4. How many customers did each employee support, what is the average revenue for each sale, and what is their total sale?

In [153]:
%%sql
-- Employees customer count, and revenue stats
SELECT employees.EmployeeId, employees.LastName, employees.FirstName, COUNT(DISTINCT customers.CustomerId) AS "Number of Customers", AVG(Total) AS "Average_Revenue", SUM(Total) AS "Total_Revenue", ROUND(SUM(Total) / 
    (
        SELECT SUM(Total) FROM invoices
    ), 3) * 100 AS "Employee Revenue %"
FROM employees
JOIN customers ON employees.EmployeeId = customers.SupportRepId
JOIN invoices ON customers.CustomerId = invoices.CustomerId
GROUP BY customers.SupportRepId
ORDER BY Total_Revenue DESC;

 * sqlite:///chinook.db
Done.


Unnamed: 0,EmployeeId,LastName,FirstName,Number of Customers,Average_Revenue,Total_Revenue,Employee Revenue %
0,3,Peacock,Jane,21,5.705753,833.04,35.8
1,4,Park,Margaret,20,5.538571,775.4,33.3
2,5,Johnson,Steve,18,5.715556,720.16,30.9


1. Do longer or shorter length albums tend to generate more revenue? 
   
   Length in this case seems to be number of tracks in album, not length in time as per hint in question on main project page. Despite this, I decided to do both since I started by time first

In [242]:
%%sql

WITH 
album_value AS (
    SELECT tracks.AlbumId, SUM(invoice_items.UnitPrice) AS "Value"
    FROM tracks
    LEFT JOIN invoice_items ON invoice_items.TrackId = tracks.TrackId
    GROUP BY AlbumId
),
album_length AS (
    SELECT tracks.AlbumId, SUM(Milliseconds) AS "Length"
    FROM tracks
    GROUP BY AlbumId
)

SELECT albums.AlbumId, albums.Title, album_value.Value, album_length.Length
FROM albums
JOIN album_value ON albums.AlbumId = album_value.AlbumId
JOIN album_length ON albums.AlbumId = album_length.AlbumId
ORDER BY album_length.Length DESC
Limit 50;


 * sqlite:///chinook.db
Done.


Unnamed: 0,AlbumId,Title,Value,Length
0,229,"Lost, Season 3",21.89,70665582
1,253,"Battlestar Galactica (Classic), Season 1",35.82,70213784
2,230,"Lost, Season 1",19.9,64854936
3,231,"Lost, Season 2",25.87,63289631
4,228,"Heroes, Season 1",25.87,59780268
5,227,"Battlestar Galactica, Season 3",23.88,52787041
6,261,"LOST, Season 4",13.93,39468433
7,251,"The Office, Season 3",31.84,38317095
8,250,"The Office, Season 2",11.94,28636206
9,141,Greatest Hits,25.74,15065731


From looking at the longest 50 albums, it does seem that the ones that are the longest sell really well but beyond that the length of the album in time does not seem to be a major factor, albeit a possible one.

In [None]:
%%sql
WITH 
album_value AS (
    SELECT tracks.AlbumId, SUM(invoice_items.UnitPrice) AS "Value"
    FROM tracks
    LEFT JOIN invoice_items ON invoice_items.TrackId = tracks.TrackId
    GROUP BY AlbumId
),
album_length AS (
    SELECT tracks.AlbumId, COUNT(*) AS "NumTracks"
    FROM tracks
    GROUP BY AlbumId
)

SELECT ROUND(AVG(album_value.Value), 2) AS "Average Revenue", album_length.NumTracks
FROM albums
JOIN album_value ON albums.AlbumId = album_value.AlbumId
JOIN album_length ON albums.AlbumId = album_length.AlbumId
GROUP BY album_length.NumTracks
ORDER BY album_length.NumTracks DESC;

The number of songs in an album seems like a much better indicator of the album making more revenue

6. Is the number of times a track appear in any playlist a good indicator of sales?

In [257]:
%%sql
WITH playlist_counts AS (
    SELECT TrackId, COUNT(*) AS "PlaylistCount"
    FROM playlist_track
    GROUP BY TrackId
),
track_value AS (
    SELECT TrackId, SUM(UnitPrice) AS "Value"
    FROM invoice_items
    GROUP BY TrackId
)
SELECT AVG(track_value.Value) AS "Average Revenue", playlist_counts.PlaylistCount
FROM playlist_counts
LEFT JOIN track_value ON playlist_counts.TrackId = track_value.TrackId
GROUP BY playlist_counts.PlaylistCount;

 * sqlite:///chinook.db
Done.


Unnamed: 0,Average Revenue,PlaylistCount
0,1.220596,2
1,1.114929,3
2,1.144688,4
3,1.131429,5


There's not a whole lot of differing numbers of times a track appears in a playlist to draw any real conclusions. The average revenues for the different counts we have are all pretty close together.

7. How much revenue is generated each year, and what is its percent change from the previous year?

In [272]:
%%sql
WITH cur_year AS (
    SELECT CAST(strftime("%Y", InvoiceDate) AS INT) AS Year, SUM(Total) AS "YearRevenue"
    FROM invoices
    GROUP BY Year
),
prev_year AS (
    SELECT CAST(strftime("%Y", InvoiceDate) AS INT)+1 AS "Year", SUM(TOTAL) AS "YearRevenue"
    FROM invoices
    GROUP BY Year
)
SELECT cur_year.Year, cur_year.YearRevenue, ROUND(((cur_year.YearRevenue - prev_year.YearRevenue)/prev_year.YearRevenue) * 100, 2) AS "PercentChange"
FROM cur_year
LEFT JOIN prev_year ON cur_year.Year = prev_year.Year 

 * sqlite:///chinook.db
Done.


Unnamed: 0,Year,YearRevenue,PercentChange
0,2009,449.46,
1,2010,481.45,7.12
2,2011,469.58,-2.47
3,2012,477.53,1.69
4,2013,450.58,-5.64
