In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as pl

In [2]:
#This is another way to import sql(lite-version)
import sqlite3 as sql

In [3]:
from pandasql import sqldf

# Reviews_table

In [4]:
reviews = pd.read_csv('reviews_copy.csv', index_col=0)
reviews.head()

Unnamed: 0,film_id,num_user,num_critic,imdb_score,num_votes,facebook_likes
0,3405,285.0,267.0,6.4,149998,0
1,478,65.0,29.0,3.2,8465,491
2,74,83.0,25.0,7.6,7071,930
3,1254,1437.0,224.0,8.0,241030,13000
4,740,111.0,64.0,6.4,64742,0


In [6]:
query = '''SELECT * 
FROM reviews;'''
sqldf(query, env=None)

Unnamed: 0,film_id,num_user,num_critic,imdb_score,num_votes,facebook_likes
0,3405,285.0,267.0,6.4,149998,0
1,478,65.0,29.0,3.2,8465,491
2,74,83.0,25.0,7.6,7071,930
3,1254,1437.0,224.0,8.0,241030,13000
4,740,111.0,64.0,6.4,64742,0
...,...,...,...,...,...,...
4962,4801,2.0,6.0,7.0,75,121
4963,4264,514.0,488.0,7.0,181472,58000
4964,4356,85.0,119.0,6.2,29738,12000
4965,430,118.0,38.0,5.9,29591,0


# Learning to COUNT()
You saw how to use COUNT() in the video. Do you remember what it returns?\
Here is a query counting film_id. Select the answer below that correctly describes what the query will return.\
SELECT COUNT(film_id) AS count_film_id\
FROM reviews_table;\
Run the query in the console to test your theory!

In [8]:
query_1 = '''SELECT COUNT(film_id) AS count_film_id 
FROM reviews;'''
sqldf(query_1,env=None)

Unnamed: 0,count_film_id
0,4967


# People_table

# Practice with COUNT()
As you've seen, COUNT(*) tells you how many records are in a table. However, if you want to count the number of non-missing values in a particular field, you can call COUNT() on just that field.\
Let's get some practice with COUNT()! You can look at the data in the tables throughout these exercises by clicking on the table name in the console.

In [12]:
people = pd.read_csv('people_copy.csv', index_col=0)

In [13]:
query_2 = '''SELECT * 
FROM people;'''
sqldf(query_2,env=None)

Unnamed: 0,id,name,birthdate,deathdate
0,1,50 Cent,1975-07-06,
1,2,A. Michael Baldwin,1963-04-04,
2,3,A. Raven Cruz,,
3,4,A.J. Buckley,1978-02-09,
4,5,A.J. DeLucia,,
...,...,...,...,...
8392,8393,Zohra Segal,1912-04-27,2014-07-10
8393,8394,Zooey Deschanel,1980-01-17,
8394,8395,Zoran Lisinac,,
8395,8396,Zubaida Sahar,,


In [14]:
# Count the number of records in the people table
count_records = '''SELECT COUNT(*) AS count_records
FROM people;'''
sqldf(count_records,env=None)

Unnamed: 0,count_records
0,8397


In [15]:
#Count the number of birthdates in the people table
count_birthdate = '''SELECT COUNT(birthdate) AS count_birthdate 
FROM people;'''
sqldf(count_birthdate,env=None)

Unnamed: 0,count_birthdate
0,6152


# Films_table

In [16]:
films = pd.read_csv('films_copy.csv', index_col=0)

In [17]:
films_sql = '''SELECT * 
FROM films;'''
sqldf(films_sql,env=None)

Unnamed: 0,id,title,release_year,country,duration,language,certification,gross,budget
0,1,Intolerance: Love's Struggle Throughout the Ages,1916.0,USA,123.0,,Not Rated,,385907.0
1,2,Over the Hill to the Poorhouse,1920.0,USA,110.0,,,3000000.0,100000.0
2,3,The Big Parade,1925.0,USA,151.0,,Not Rated,,245000.0
3,4,Metropolis,1927.0,Germany,145.0,German,Not Rated,26435.0,6000000.0
4,5,Pandora's Box,1929.0,Germany,110.0,German,Not Rated,9950.0,
...,...,...,...,...,...,...,...,...,...
4963,4964,Unforgotten,,UK,45.0,English,,,
4964,4965,Wings,,USA,30.0,English,,,
4965,4966,Wolf Creek,,Australia,,English,,,
4966,4967,Wuthering Heights,,UK,142.0,English,,,


In [18]:
#Count the records for languages and countries represented in the films table
count_lang_country = '''SELECT COUNT(language) AS count_languages,
COUNT(country) AS count_countries
FROM films;'''
sqldf(count_lang_country,env=None)

Unnamed: 0,count_languages,count_countries
0,4955,4966


- Tres Bien! Looking at the differences between the count of unique values, total values, and all records can provide useful insights into your data.

# SELECT DISTINCT
Often query results will include many duplicate values. You can use the DISTINCT keyword to select the unique values from a field.\
This might be useful if, for example, you're interested in knowing which languages are represented in the films table. See if you can find out what countries are represented in this table with the following exercises.

In [19]:
unique_countries = '''SELECT DISTINCT country
FROM films;'''
sqldf(unique_countries,env=None)

Unnamed: 0,country
0,USA
1,Germany
2,Japan
3,Denmark
4,UK
...,...
60,Kenya
61,Slovenia
62,Pakistan
63,Chile


In [20]:
count_distinct_countries = '''SELECT COUNT(DISTINCT country) AS count_distinct_countries
FROM films;'''
sqldf(count_distinct_countries,env=None)

Unnamed: 0,count_distinct_countries
0,64


- Congratulations! Using DISTINCT is a great tool to see the unique values of a dataset. This table has 64 unique countries.

# Query execution

FROM SELECT LIMIT;
- Congratulations! This is the correct order of execution. It makes sense that SQL needs to know where to SELECT data FROM before it can LIMIT the results.

# Debugging errors
Debugging is an essential skill for all coders, and it comes from making many mistakes and learning from them.\
In this exercise, you'll be given some buggy code that you'll need to fix.

In [21]:
# Debug this code
films_certification = '''SELECT certification
FROM films
LIMIT 5;'''
sqldf(films_certification,env=None)

Unnamed: 0,certification
0,Not Rated
1,
2,Not Rated
3,Not Rated
4,Not Rated


In [22]:
# Debug this code
film_id_imdb_score_num_votes = '''SELECT film_id,imdb_score,num_votes
FROM reviews;'''
sqldf(film_id_imdb_score_num_votes,env=None)

Unnamed: 0,film_id,imdb_score,num_votes
0,3405,6.4,149998
1,478,3.2,8465
2,74,7.6,7071
3,1254,8.0,241030
4,740,6.4,64742
...,...,...,...
4962,4801,7.0,75
4963,4264,7.0,181472
4964,4356,6.2,29738
4965,430,5.9,29591


In [23]:
#Debug this code
count_birthdays = '''SELECT COUNT(birthdate) AS count_birthdays
FROM people;'''
sqldf(count_birthdays,env=None)

Unnamed: 0,count_birthdays
0,6152


- Excellent extermination of those bugs! This is an important skill that will come in very handy.

# SQL style

# Formatting
Readable code is highly valued in the coding community and professional settings. Without proper formatting, code and results can be difficult to interpret. You'll often be working with other people that need to understand your code or be able to explain your results, so having a solid formatting habit is essential.\
In this exercise, you'll correct poorly written code to better adhere to SQL style standards.

In [24]:
roles = pd.read_csv('roles_copy.csv', index_col=0)

In [25]:
roles_sql = '''SELECT * 
FROM roles;'''
sqldf(roles_sql,env=None)

Unnamed: 0,id,film_id,person_id,role
0,2,1,4843,actor
1,3,1,5050,actor
2,4,1,8175,actor
3,5,2,3000,director
4,6,2,4019,actor
...,...,...,...,...
19785,19787,4966,6623,actor
19786,19788,4967,3240,actor
19787,19789,4967,4524,actor
19788,19790,4967,7886,actor


In [26]:
person_id_role = '''SELECT person_id, role
FROM roles;'''
sqldf(person_id_role,env=None)

Unnamed: 0,person_id,role
0,4843,actor
1,5050,actor
2,8175,actor
3,3000,director
4,4019,actor
...,...,...
19785,6623,actor
19786,3240,actor
19787,4524,actor
19788,7886,actor


# Non-standard fields
You may occasionally receive a dataset with poorly named fields. Ideally, you would fix these, but you can work around it with some added punctuation in this instance.\
A sample query and schema have been provided; imagine you need to be able to run it with a non-standard field name. Select the multiple-choice answer that would correctly fill in the blank to return both a film's id and its number of Facebook likes for all reviews:

In [27]:
facebook_likes = '''SELECT film_id, "facebook likes"
FROM reviews;'''
sqldf(facebook_likes,env=None)

Unnamed: 0,film_id,"""facebook likes"""
0,3405,facebook likes
1,478,facebook likes
2,74,facebook likes
3,1254,facebook likes
4,740,facebook likes
...,...,...
4962,4801,facebook likes
4963,4264,facebook likes
4964,4356,facebook likes
4965,430,facebook likes


- Correct! Using double quotes around a non-standard name allows us to run the SQL query.