In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import sqlite3 as sql
import matplotlib.pyplot as pl
from pandasql import sqldf

# Films file

In [2]:
films = pd.read_csv('films_copy.csv',index_col=0)

In [3]:
films_sql = '''SELECT * 
FROM films;'''
sqldf(films_sql,env=None)

Unnamed: 0,id,title,release_year,country,duration,language,certification,gross,budget
0,1,Intolerance: Love's Struggle Throughout the Ages,1916.0,USA,123.0,,Not Rated,,385907.0
1,2,Over the Hill to the Poorhouse,1920.0,USA,110.0,,,3000000.0,100000.0
2,3,The Big Parade,1925.0,USA,151.0,,Not Rated,,245000.0
3,4,Metropolis,1927.0,Germany,145.0,German,Not Rated,26435.0,6000000.0
4,5,Pandora's Box,1929.0,Germany,110.0,German,Not Rated,9950.0,
...,...,...,...,...,...,...,...,...,...
4963,4964,Unforgotten,,UK,45.0,English,,,
4964,4965,Wings,,USA,30.0,English,,,
4965,4966,Wolf Creek,,Australia,,English,,,
4966,4967,Wuthering Heights,,UK,142.0,English,,,


# Reviews file

In [4]:
reviews = pd.read_csv('reviews_copy.csv', index_col=0)

In [5]:
reviews_sql = '''SELECT * 
FROM reviews;'''
sqldf(reviews_sql,env=None)

Unnamed: 0,film_id,num_user,num_critic,imdb_score,num_votes,facebook_likes
0,3405,285.0,267.0,6.4,149998,0
1,478,65.0,29.0,3.2,8465,491
2,74,83.0,25.0,7.6,7071,930
3,1254,1437.0,224.0,8.0,241030,13000
4,740,111.0,64.0,6.4,64742,0
...,...,...,...,...,...,...
4962,4801,2.0,6.0,7.0,75,121
4963,4264,514.0,488.0,7.0,181472,58000
4964,4356,85.0,119.0,6.2,29738,12000
4965,430,118.0,38.0,5.9,29591,0


# People file

In [6]:
people = pd.read_csv('people_copy.csv',index_col=0)

In [7]:
p = '''SELECT * 
FROM people;'''
sqldf(p,env=None)

Unnamed: 0,id,name,birthdate,deathdate
0,1,50 Cent,1975-07-06,
1,2,A. Michael Baldwin,1963-04-04,
2,3,A. Raven Cruz,,
3,4,A.J. Buckley,1978-02-09,
4,5,A.J. DeLucia,,
...,...,...,...,...
8392,8393,Zohra Segal,1912-04-27,2014-07-10
8393,8394,Zooey Deschanel,1980-01-17,
8394,8395,Zoran Lisinac,,
8395,8396,Zubaida Sahar,,


# Roles file

In [8]:
roles = pd.read_csv('roles_copy.csv',index_col=0)

In [9]:
r = '''SELECT * 
FROM roles;'''
sqldf(r,env=None)

Unnamed: 0,id,film_id,person_id,role
0,2,1,4843,actor
1,3,1,5050,actor
2,4,1,8175,actor
3,5,2,3000,director
4,6,2,4019,actor
...,...,...,...,...
19785,19787,4966,6623,actor
19786,19788,4967,3240,actor
19787,19789,4967,4524,actor
19788,19790,4967,7886,actor


# Sorting data and results

# Sorting single fields
Now that you understand how ORDER BY works, you'll put it into practice. In this exercise, you'll work on sorting single fields only. This can be helpful to extract quick insights such as the top-grossing or top-scoring film.

The following exercises will help you gain further insights into the film database.

In [10]:
# Select name from people and sort alphabetically
names_alphabetically = '''
SELECT name
FROM people
ORDER BY(name = 'ASC');'''
sqldf(names_alphabetically,env=None)

Unnamed: 0,name
0,50 Cent
1,A. Michael Baldwin
2,A. Raven Cruz
3,A.J. Buckley
4,A.J. DeLucia
...,...
8392,Zohra Segal
8393,Zooey Deschanel
8394,Zoran Lisinac
8395,Zubaida Sahar


In [11]:
# Select the title and duration from longest to shortest film
duration_longest_to_shortest = '''
SELECT title, duration 
FROM films
ORDER BY duration DESC;'''
sqldf(duration_longest_to_shortest,env=None)

Unnamed: 0,title,duration
0,Carlos,334.0
1,"Blood In, Blood Out",330.0
2,Heaven's Gate,325.0
3,The Legend of Suriyothai,300.0
4,Das Boot,293.0
...,...,...
4963,Barfi,
4964,Destiny,
4965,Karachi se Lahore,
4966,Romantic Schemer,


- Superb sorting! ORDER BY is another simple yet effective way to gain intelligence about your business and data. You now know how to extract your best and worst-performing assets with only a few lines of code.

# Sorting multiple fields
ORDER BY can also be used to sort on multiple fields. It will sort by the first field specified, then sort by the next, and so on. As an example, you may want to sort the people data by age and keep the names in alphabetical order.

Try using ORDER BY to sort multiple columns.

In [12]:
# Select the release year, duration, and title sorted by release year and duration
release_year_and_duration = '''
SELECT release_year, duration, title
FROM films
ORDER BY release_year, duration;'''
sqldf(release_year_and_duration,env=None)

Unnamed: 0,release_year,duration,title
0,,,Wolf Creek
1,,22.0,"10,000 B.C."
2,,22.0,Anger Management
3,,24.0,Lovesick
4,,24.0,Yu-Gi-Oh! Duel Monsters
...,...,...,...
4963,2016.0,144.0,13 Hours
4964,2016.0,144.0,X-Men: Apocalypse
4965,2016.0,147.0,Captain America: Civil War
4966,2016.0,156.0,The Wailing


In [13]:
# Select the certification, release year, and title sorted by certification and release year
certification_release_year = '''
SELECT certification, release_year, title
FROM films
ORDER BY certification, release_year;'''
sqldf(certification_release_year,env=None)

Unnamed: 0,certification,release_year,title
0,,,"10,000 B.C."
1,,,A Touch of Frost
2,,,Anger Management
3,,,Animal Kingdom
4,,,BrainDead
...,...,...,...
4963,X,1986.0,The Texas Chainsaw Massacre 2
4964,X,1987.0,A Nightmare on Elm Street 3: Dream Warriors
4965,X,1987.0,Evil Dead II
4966,X,1989.0,A Nightmare on Elm Street 5: The Dream Child


- Nicely done! The second column you order on only steps in when the first column has been ordered.

# Grouping data

# GROUP BY single fields
GROUP BY is a SQL keyword that allows you to group and summarize results with the additional use of aggregate functions. For example, films can be grouped by the certification and language before counting the film titles in each group. This allows you to see how many films had a particular certification and language grouping.

In the following steps, you'll summarize other groups of films to learn more about the films in your database.

In [14]:
# Find the release_year and film_count of each year
film_count = '''
SELECT release_year, COUNT(*) AS film_count
FROM films
GROUP BY release_year;'''
sqldf(film_count,env=None)

Unnamed: 0,release_year,film_count
0,,42
1,1916.0,1
2,1920.0,1
3,1925.0,1
4,1927.0,1
...,...,...
87,2012.0,220
88,2013.0,236
89,2014.0,252
90,2015.0,226


In [15]:
# Find the release_year and average duration of films for each year
avg_duration = '''
SELECT release_year, AVG(duration) AS avg_duration
FROM films
GROUP BY release_year;'''
sqldf(avg_duration,env=None)

Unnamed: 0,release_year,avg_duration
0,,77.439024
1,1916.0,123.000000
2,1920.0,110.000000
3,1925.0,151.000000
4,1927.0,145.000000
...,...,...
87,2012.0,106.000000
88,2013.0,108.140426
89,2014.0,105.426295
90,2015.0,106.098214


- Great job grouping! Using GROUP BY with a time or date field such as release_year can help us identify trends such as a period of time where movies were really short!

# GROUP BY multiple fields
GROUP BY becomes more powerful when used across multiple fields or combined with ORDER BY and LIMIT.\
Perhaps you're interested in learning about budget changes throughout the years in individual countries. You'll use grouping in this exercise to look at the maximum budget for each country in each year there is data available.

In [16]:
# Find the release_year, country, and max_budget, then group and order by release_year and country
max_budget = '''
SELECT release_year, country, MAX(budget) AS max_budget
FROM films
GROUP BY release_year, country;'''
sqldf(max_budget,env=None)

Unnamed: 0,release_year,country,max_budget
0,,,
1,,Australia,15000000.0
2,,Canada,
3,,France,
4,,Iceland,
...,...,...,...
500,2016.0,Mexico,3000000.0
501,2016.0,Panama,20000000.0
502,2016.0,South Korea,12620000.0
503,2016.0,UK,175000000.0


- Well done! You can see how building on your SQL queries helps you gain more insights and detect trends in the data, such as how film budgets may change throughout the years.

In [17]:
# Which release_year had the most language diversity?
diversity_language = '''
SELECT release_year, COUNT(DISTINCT language) AS language_count
FROM films
GROUP BY release_year;'''
sqldf(diversity_language,env=None)

Unnamed: 0,release_year,language_count
0,,5
1,1916.0,0
2,1920.0,0
3,1925.0,0
4,1927.0,1
...,...,...
87,2012.0,8
88,2013.0,13
89,2014.0,9
90,2015.0,15


- Well done! The year 2006 had 16 distinct languages, that's more than any other year.

# Filtering grouped data

# Filter with HAVING
Your final keyword is HAVING. It works similarly to WHERE in that it is a filtering clause, with the difference that HAVING filters grouped data\
Filtering grouped data can be especially handy when working with a large dataset. When working with thousands or even millions of rows, HAVING will allow you to filter for just the group of data you want, such as films over two hours in length!\
Practice using HAVING to find out which countries (or country) have the most varied film certifications.

In [18]:
# Select the country and distinct count of certification as certification_count
# Group by country
# Filter results to countries with more than 10 different certifications
certification_count = '''
SELECT country, COUNT(DISTINCT certification) AS certification_count
FROM films
GROUP BY country
HAVING COUNT(DISTINCT certification) > 10;'''
sqldf(certification_count,env=None)

Unnamed: 0,country,certification_count
0,USA,12


- Great job! The answer is USA with 12 different certifications.

# HAVING and sorting
Filtering and sorting go hand in hand and gives you greater interpretability by ordering our results.\
Let's see this magic at work by writing a query showing what countries have the highest average film budgets.

In [24]:
# Select the country and average_budget from films
# Group by country
# Filter to countries with an average_budget of more than one billion
# Order by descending order of the aggregated budget
average_budget = '''
SELECT country, AVG(budget) AS average_budget
FROM films
GROUP BY country
HAVING AVG(budget) > 1000000000
ORDER BY average_budget DESC;'''
sqldf(average_budget,env=None)

Unnamed: 0,country,average_budget
0,South Korea,1383960000.0
1,Hungary,1260000000.0


- You did it! South Korea and Hungary seem to have pricey films... or do they? Actually, these budgets are pretty standard for their local currency.

# All together now
It's time to use much of what you've learned in one query! This is good preparation for using SQL in the real world where you'll often be asked to write more complex queries since some of the basic queries can be answered by playing around in spreadsheet applications.\
In this exercise, you'll write a query that returns the average budget and gross earnings for films each year after 1990 if the average budget is greater than 60 million.\
This will be a big query, but you can handle it!

In [25]:
# Select the release_year for films released after 1990 grouped by year
released_after_1990 = '''
SELECT title, release_year 
FROM films
WHERE release_year > 1990
GROUP BY release_year;'''
sqldf(released_after_1990,env=None)

Unnamed: 0,title,release_year
0,Beastmaster 2: Through the Portal of Time,1991.0
1,A Few Good Men,1992.0
2,Airborne,1993.0
3,3 Ninjas Kick Back,1994.0
4,Ace Ventura: When Nature Calls,1995.0
5,A Thin Line Between Love and Hate,1996.0
6,24 7: Twenty Four Seven,1997.0
7,20 Dates,1998.0
8,10 Things I Hate About You,1999.0
9,102 Dalmatians,2000.0


In [27]:
# Modify the query to also list the average budget and average gross
avg_budget_and_avg_gross = '''SELECT title, release_year, 
AVG(budget) AS avg_budget, 
AVG(gross) AS avg_gross
FROM films
WHERE release_year > 1990
GROUP BY release_year;'''
sqldf(avg_budget_and_avg_gross,env=None)

Unnamed: 0,title,release_year,avg_budget,avg_gross
0,Beastmaster 2: Through the Portal of Time,1991.0,25176550.0,53844500.0
1,A Few Good Men,1992.0,25982030.0,63665200.0
2,Airborne,1993.0,20729790.0,45302090.0
3,3 Ninjas Kick Back,1994.0,29013770.0,59395670.0
4,Ace Ventura: When Nature Calls,1995.0,32775000.0,44909520.0
5,A Thin Line Between Love and Hate,1996.0,31620610.0,42044170.0
6,24 7: Twenty Four Seven,1997.0,59424490.0,44793770.0
7,20 Dates,1998.0,40460000.0,38377010.0
8,10 Things I Hate About You,1999.0,38981780.0,38072180.0
9,102 Dalmatians,2000.0,34931380.0,42172630.0


In [28]:
# Modify the query to see only years with an avg_budget of more than 60 million
avg_budget_and_avg_gross_60 = '''
SELECT title, release_year, AVG(budget) AS avg_budget, AVG(gross) AS avg_gross
FROM films
WHERE release_year > 1990
GROUP BY release_year
HAVING AVG(budget) > 60000000;'''
sqldf(avg_budget_and_avg_gross_60,env=None)

Unnamed: 0,title,release_year,avg_budget,avg_gross
0,51 Birch Street,2005.0,70323940.0,41159140.0
1,10th & Wolf,2006.0,93968930.0,39237860.0


In [31]:
# Order the results from highest to lowest average gross and limit to one
avg_budget_and_avg_gross_limit_1 = '''
SELECT title, release_year, AVG(budget) AS avg_budget, AVG(gross) AS avg_gross
FROM films
WHERE release_year > 1990
GROUP BY release_year
HAVING AVG(budget) > 60000000
ORDER BY avg_gross DESC
LIMIT 1;'''
sqldf(avg_budget_and_avg_gross_limit_1,env=None)

Unnamed: 0,title,release_year,avg_budget,avg_gross
0,51 Birch Street,2005.0,70323940.0,41159140.0


- Superb work! SQL queries can get rather long, but breaking them down into individual clauses makes them easier to write.

# What you've learned
• Chapter 1: Selecting with COUNT() , LIMIT\
• Chapter 2: Filtering with WHERE, BETWEEN, AND, OR, LIKE, NOT LIKE, IN,%, - , IS NULL, IS NOT NULL\
• Chapter 3: ROUND() and aggregate functions\
• Chapter 4: Sorting and grouping with ORDER BY, DESC, GROUP BY, HAVING\
• Comparison operators\
• Arithmetic

# Skills
• Error handling\
• Debugging\
• Writing readable code\
• Selecting data\
• Querying data\
• Filtering and summarizing data\
• Sorting and grouping data