# SQL : IMDB Project

In [15]:
import pandas as pd
import sqlite3

In [16]:
db = "/data/notebook_files/movies.sqlite"

connection = sqlite3.connect(db)

In [17]:
query = """
SELECT name FROM sqlite_master WHERE type='table'
;
"""

df = pd.read_sql_query(query, connection)
df

Unnamed: 0,name
0,movies
1,sqlite_sequence
2,directors


**Question 1** : Find top 10 directors with the most number of movies

In [18]:
query = """
SELECT
    d.name,
    COUNT(*) total_movies
FROM
    movies m
JOIN
    directors d
ON 
    m.director_id = d.id
GROUP BY
    1
ORDER BY
    2 DESC
LIMIT 
    10

;
"""

df = pd.read_sql_query(query, connection)
df

Unnamed: 0,name,total_movies
0,Steven Spielberg,27
1,Woody Allen,21
2,Martin Scorsese,20
3,Clint Eastwood,20
4,Spike Lee,16
5,Robert Rodriguez,16
6,Ridley Scott,16
7,Steven Soderbergh,15
8,Renny Harlin,15
9,Tim Burton,14


**Question 2** : How many male and female directors ?

In [19]:
query = """
SELECT
    CASE
        WHEN gender = 1 THEN "Male"
        ELSE "Female"
    END sex,
    COUNT(*) total
FROM 
    directors
GROUP BY
1
          

;
"""

df = pd.read_sql_query(query, connection)
df

Unnamed: 0,sex,total
0,Female,2199
1,Male,150


**Question 3** : How many movies from Steven ?

In [20]:
query = """
SELECT
    d.name,
    COUNT(*) total_movies
FROM
    movies m
JOIN
    directors d
ON 
    m.director_id = d.id
WHERE
    d.name LIKE "Steven%"
GROUP BY
    1
ORDER BY 
    2 DESC
         

;
"""

df = pd.read_sql_query(query, connection)
df

Unnamed: 0,name,total_movies
0,Steven Spielberg,27
1,Steven Soderbergh,15
2,Steven Brill,4
3,Steven Zaillian,2
4,Steven Shainberg,2
5,Steven Quale,2
6,Steven Seagal,1
7,Steven R. Monroe,1
8,Steven E. de Souza,1


**Question 4** : Are there any movies segmetation ?

In [21]:
query = """
SELECT
    CASE
        WHEN budget >= 100000000 THEN "High budget"
        WHEN budget BETWEEN 10000000 AND 99999999 THEN "Upper medium budget"
        WHEN budget BETWEEN 1000000 AND 9999999 THEN "Medium budget"
        ELSE "Low budget"
    END budget_type,
    COUNT(*) total_movies
FROM movies
GROUP by 1
ORDER BY 2

;
"""

df = pd.read_sql_query(query, connection)
df

Unnamed: 0,budget_type,total_movies
0,High budget,320
1,Medium budget,820
2,Low budget,1199
3,Upper medium budget,2434


**Question 5** : What is the highest voted movies of each director ?

In [22]:
query = """
WITH sub1 AS (
SELECT
    d.name,
    m.title,
    m.vote_average score,
    DENSE_RANK() OVER(PARTITION BY d.id ORDER BY m.vote_average DESC) rank
FROM
    movies m
JOIN
    directors d
ON
    m.director_id = d.id)

SELECT
    name,
    title,
    score
FROM
    sub1
WHERE 
    rank = 1
;
"""

df = pd.read_sql_query(query, connection)
df

Unnamed: 0,name,title,score
0,James Cameron,Terminator 2: Judgment Day,7.7
1,James Cameron,Aliens,7.7
2,Gore Verbinski,Pirates of the Caribbean: The Curse of the Bla...,7.5
3,Sam Mendes,American Beauty,7.9
4,Christopher Nolan,The Dark Knight,8.2
...,...,...,...
2404,Shane Carruth,Primer,6.9
2405,Neill Dela Llana,Cavite,7.5
2406,Scott Smith,"Signed, Sealed, Delivered",7.0
2407,Daniel Hsia,Shanghai Calling,5.7


**Question 6** : Who has voted score more than average and directed more than 5 movies ?

In [23]:
query = """
SELECT
    d.name,
    ROUND(AVG(m.vote_average),2) as vote_score,
    COUNT(*)
FROM
    movies m
JOIN 
    directors d
ON
    m.director_id = d.id
GROUP BY
    1
HAVING 
    COUNT(*) >= 5 AND
    AVG(m.vote_average) >= (SELECT AVG(vote_average) FROM movies )
ORDER BY
    3 DESC



;
"""

df = pd.read_sql_query(query, connection)
df

Unnamed: 0,name,vote_score,COUNT(*)
0,Steven Spielberg,6.97,27
1,Woody Allen,6.69,21
2,Martin Scorsese,7.30,20
3,Clint Eastwood,6.87,20
4,Spike Lee,6.46,16
...,...,...,...
145,Bill Condon,6.36,5
146,Andrew Niccol,6.62,5
147,Andrew Adamson,6.62,5
148,Alexander Payne,6.88,5


**Question 7** : Who directed a movie for 3 years in a row?

In [24]:
query = """
WITH sub1 AS (SELECT
    d.name,
    m.title,
    CAST(STRFTIME("%Y", m.release_date) AS INT) year
FROM   
    movies m
JOIN 
    directors d
ON
    m.director_id = d.id),

sub2 AS(SELECT
    *,
    LEAD(year) OVER(PARTITION BY name ORDER BY year) year_1,
    LEAD(year,2) OVER(PARTITION BY name ORDER BY year) year_2
FROM
    sub1)

SELECT 
   DISTINCT name
FROM
    sub2
WHERE 
    year_2 = year_1+1 AND
    year_1 = year +1
  





;
"""

df = pd.read_sql_query(query, connection)
df

Unnamed: 0,name
0,Adam Shankman
1,Antoine Fuqua
2,Barry Levinson
3,Brett Ratner
4,Brian Robbins
5,Clint Eastwood
6,Darren Lynn Bousman
7,Francis Lawrence
8,Gore Verbinski
9,Jason Friedberg


**Question 8** : What are the highest popular movies in each year

In [30]:

query = """
WITH sub1 AS (
SELECT
    STRFTIME("%Y", release_date) as year,
    title,
    popularity,
    DENSE_RANK() OVER(PARTITION BY STRFTIME("%Y", release_date) ORDER BY popularity DESC) as rank
FROM
    movies)

SELECT 
    year,
    title,
    popularity,
    rank
FROM 
    sub1
WHERE
    rank = 1
;
"""


df = pd.read_sql_query(query, connection)
df

Unnamed: 0,year,title,popularity,rank
0,1916,Intolerance,3,1
1,1925,The Big Parade,0,1
2,1927,Metropolis,32,1
3,1929,Pandora's Box,1,1
4,1930,Hell's Angels,8,1
...,...,...,...,...
87,2013,Frozen,165,1
88,2014,Interstellar,724,1
89,2015,Minions,875,1
90,2016,Deadpool,514,1
