**Average Rating over time using startYear**

| **Version** | **Execution Time** | **Optimization Strategies Utilized** |
|----------|----------|----------|
| Naive | 334 seconds | Naive solution, utilized group by and order by with the AVG function on (averageRating) |
| First Update | 3.438 seconds | First optimization, utilized WHERE statements to filter null data, genres, and years |
| Optimized | 3.329 seconds (no difference) | Final optimized version, indexes were added on (genreID, tconst), (startYear, averageRating), and (genreName) |

Naive 

In [None]:
%%sql

-- Baseline for the Naive timing row. Mean rating for every
-- (startYear, genreName) pair, with no filtering at all.
-- The aggregate is left unaliased so the result header stays as it was.
SELECT
    tf.startYear,
    avg(averageRating),
    gd.genreName
FROM title_ft AS tf
INNER JOIN title_genre_bridge AS tgb
    ON tgb.tconst = tf.tconst
INNER JOIN genre_dt AS gd
    ON gd.genreID = tgb.genreID
GROUP BY
    tf.startYear,
    gd.genreName
ORDER BY
    tf.startYear ASC,
    gd.genreName;

First Optimization

In [None]:
%%sql

-- Mean rating per start year for the Action genre only.
-- Keeps rows whose startYear lies strictly between 1999 and 2020 and
-- discards rows where startYear or averageRating is NULL.
SELECT
    tf.startYear,
    AVG(tf.averageRating) AS avg_rating,
    gd.genreName
FROM title_ft AS tf
INNER JOIN title_genre_bridge AS tgb
    ON tgb.tconst = tf.tconst
INNER JOIN genre_dt AS gd
    ON gd.genreID = tgb.genreID
WHERE gd.genreName = 'Action'
  AND tf.startYear > 1999
  AND tf.startYear < 2020
  AND tf.startYear IS NOT NULL
  AND tf.averageRating IS NOT NULL
GROUP BY
    tf.startYear,
    gd.genreName
ORDER BY
    tf.startYear,
    gd.genreName;

Final Optimization With Indexes

In [None]:
%%sql 

-- Step 1. Secondary indexes covering the join and filter columns used below.
CREATE INDEX idx_bridge_genreid_tconst
    ON title_genre_bridge (genreID, tconst);

CREATE INDEX idx_tf_startYear_avgRating
    ON title_ft (startYear, averageRating);

CREATE INDEX idx_genre_genrename
    ON genre_dt (genreName);

-- Step 2. Same filtered query as the First Optimization cell, re-timed with
-- the indexes in place. Action genre, startYear strictly between 1999 and 2020.
SELECT
    tf.startYear,
    AVG(tf.averageRating) AS avg_rating,
    gd.genreName
FROM title_ft AS tf
INNER JOIN title_genre_bridge AS tgb
    ON tgb.tconst = tf.tconst
INNER JOIN genre_dt AS gd
    ON gd.genreID = tgb.genreID
WHERE gd.genreName = 'Action'
  AND tf.startYear > 1999
  AND tf.startYear < 2020
  AND tf.startYear IS NOT NULL
  AND tf.averageRating IS NOT NULL
GROUP BY
    tf.startYear,
    gd.genreName
ORDER BY
    tf.startYear,
    gd.genreName;

=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=





**Average numVotes over time**

| **Version** | **Execution Time** | **Optimization Strategies Utilized** |
|----------|----------|----------|
| Naive | 340 seconds | Naive solution, utilized group by and order by with the AVG function on (numVotes) |
| First Update | 3.485 seconds | First optimization, utilized WHERE statements to filter null data, genres, and years |
| Optimized | 2.765 seconds (minor difference) | Final optimized version, indexes were added on (genreID, tconst), (startYear, numVotes), and (genreName) |

Naive

In [None]:
%%sql

-- Baseline for the Naive timing row. Mean vote count for every
-- (startYear, genreName) pair, with no filtering.
-- The aggregate is left unaliased so the result header stays as it was.
SELECT
    tf.startYear,
    avg(numVotes),
    gd.genreName
FROM title_ft AS tf
INNER JOIN title_genre_bridge AS tgb
    ON tgb.tconst = tf.tconst
INNER JOIN genre_dt AS gd
    ON gd.genreID = tgb.genreID
GROUP BY
    tf.startYear,
    gd.genreName
ORDER BY
    tf.startYear ASC,
    gd.genreName;

First Optimization

In [None]:
%%sql

-- Mean vote count per start year for the Fantasy genre only.
-- Keeps rows whose startYear lies strictly between 2002 and 2022.
SELECT
    tf.startYear,
    AVG(tf.numVotes) AS avg_votes,
    gd.genreName
FROM title_genre_bridge AS tgb
INNER JOIN title_ft AS tf
    ON tf.tconst = tgb.tconst
INNER JOIN genre_dt AS gd
    ON gd.genreID = tgb.genreID
WHERE gd.genreName = 'Fantasy'
  AND tf.startYear > 2002
  AND tf.startYear < 2022
  AND tf.startYear IS NOT NULL
GROUP BY
    tf.startYear,
    gd.genreName
ORDER BY
    tf.startYear,
    gd.genreName;

Final Optimization With Indexes

In [None]:
%%sql

-- Step 1. Secondary indexes covering the join and filter columns used below.
-- NOTE(review): idx_genre_genrename and idx_bridge_genreid_tconst are also
-- created by the average-rating cell earlier in this notebook. MySQL-style
-- engines reject a second index with an existing name on the same table, so
-- this cell presumably assumes those indexes were dropped first - confirm.
CREATE INDEX idx_genre_genrename
    ON genre_dt (genreName);

CREATE INDEX idx_bridge_genreid_tconst
    ON title_genre_bridge (genreID, tconst);

CREATE INDEX idx_tf_startYear_numVotes
    ON title_ft (startYear, numVotes);

-- Step 2. Same filtered query as the First Optimization cell, re-timed with
-- the indexes in place. Fantasy genre, startYear strictly between 2002 and 2022.
SELECT
    tf.startYear,
    AVG(tf.numVotes) AS avg_votes,
    gd.genreName
FROM title_genre_bridge AS tgb
INNER JOIN title_ft AS tf
    ON tf.tconst = tgb.tconst
INNER JOIN genre_dt AS gd
    ON gd.genreID = tgb.genreID
WHERE gd.genreName = 'Fantasy'
  AND tf.startYear > 2002
  AND tf.startYear < 2022
  AND tf.startYear IS NOT NULL
GROUP BY
    tf.startYear,
    gd.genreName
ORDER BY
    tf.startYear,
    gd.genreName;


=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=





**Proportion share of all genres over decade periods**

| **Version** | **Execution Time** | **Optimization Strategies Utilized** |
|----------|----------|----------|
| Naive | 347 seconds | Naive solution, utilized group by and order by with the FLOOR and COUNT functions on (startYear) and (*) respectively|
| First Update | 8.935 seconds | First optimization, utilized WHERE statements to filter null data, genres, and years |
| Optimized | 6.578 seconds (minor difference) | Final optimized version, indexes were added on (genreID, tconst), (startYear), and (genreName) |

Naive

In [None]:
%%sql


-- Baseline for the Naive timing row. For every decade and genre, the number
-- of title-genre rows (co), the number of titles in that decade (tot) and
-- their ratio (prop). Nothing is filtered.
SELECT
    per_decade.decade,
    per_genre.genreName,
    per_genre.co,
    per_decade.tot,
    (per_genre.co / per_decade.tot) AS prop
FROM (
    -- total per decade
    SELECT
        (FLOOR(startYear / 10) * 10) AS decade,
        COUNT(*) AS tot
    FROM imdb.title_ft
    GROUP BY decade
    ORDER BY decade
) AS per_decade
INNER JOIN (
    -- total per decade per genre
    SELECT
        (FLOOR(tf.startYear / 10) * 10) AS decade,
        COUNT(*) AS co,
        gd.genreName
    FROM imdb.title_ft AS tf
    INNER JOIN title_genre_bridge AS tgb
        ON tgb.tconst = tf.tconst
    INNER JOIN genre_dt AS gd
        ON gd.genreID = tgb.genreID
    GROUP BY decade, genreName
    ORDER BY decade
) AS per_genre
    ON per_decade.decade = per_genre.decade
ORDER BY per_decade.decade;

First Optimization

In [None]:
%%sql

-- Share of Action titles per decade: co (Action title-genre rows) divided by
-- tot (all titles), restricted to startYear strictly between 1999 and 2021.
--
-- The same start-year window is applied to BOTH derived tables. Previously
-- only the denominator was windowed, so the 2020 bucket divided Action titles
-- from every 2020s start year by the titles of start year 2020 alone.
-- The range predicates already reject NULL startYear, so no separate
-- IS NOT NULL test is needed. ORDER BY inside the derived tables was dropped
-- because only the outer ORDER BY determines the result order.
-- NOTE(review): with this window the 2020 bucket holds just one start year
-- (assuming integer years) - confirm that a partial decade is intended.
SELECT
    s.decade,
    d.genreName,
    d.co,
    s.tot,
    (d.co / s.tot) AS prop
FROM (
    -- total titles per decade
    SELECT
        (FLOOR(startYear / 10) * 10) AS decade,
        COUNT(*) AS tot
    FROM imdb.title_ft
    WHERE startYear > 1999
      AND startYear < 2021
    GROUP BY decade
) AS s
INNER JOIN (
    -- Action titles per decade, same start-year window as the totals
    SELECT
        (FLOOR(tf.startYear / 10) * 10) AS decade,
        COUNT(*) AS co,
        gd.genreName
    FROM imdb.title_ft AS tf
    INNER JOIN title_genre_bridge AS tgb
        ON tgb.tconst = tf.tconst
    INNER JOIN genre_dt AS gd
        ON gd.genreID = tgb.genreID
    WHERE gd.genreName = 'Action'
      AND tf.startYear > 1999
      AND tf.startYear < 2021
    GROUP BY decade, gd.genreName
) AS d
    ON s.decade = d.decade
ORDER BY s.decade;

Final Optimization With Indexes

In [None]:
%%sql

-- Step 1. Secondary indexes covering the join and filter columns used below.
-- NOTE(review): idx_genre_genrename and idx_bridge_genreid_tconst are also
-- created by earlier cells in this notebook. MySQL-style engines reject a
-- second index with an existing name on the same table, so this cell
-- presumably assumes those indexes were dropped first - confirm.
ALTER TABLE genre_dt
  ADD INDEX idx_genre_genrename (genreName);

ALTER TABLE title_genre_bridge
  ADD INDEX idx_bridge_genreid_tconst (genreID, tconst);

ALTER TABLE imdb.title_ft
  ADD INDEX idx_tf_startYear (startYear);

-- Step 2. Share of Action titles per decade: co (Action title-genre rows)
-- divided by tot (all titles), restricted to startYear strictly between
-- 1999 and 2021.
--
-- The same start-year window is applied to BOTH derived tables. Previously
-- only the denominator was windowed, so the 2020 bucket divided Action titles
-- from every 2020s start year by the titles of start year 2020 alone.
-- The range predicates already reject NULL startYear, so no separate
-- IS NOT NULL test is needed. ORDER BY inside the derived tables was dropped
-- because only the outer ORDER BY determines the result order.
SELECT
    s.decade,
    d.genreName,
    d.co,
    s.tot,
    (d.co / s.tot) AS prop
FROM (
    -- total titles per decade
    SELECT
        (FLOOR(startYear / 10) * 10) AS decade,
        COUNT(*) AS tot
    FROM imdb.title_ft
    WHERE startYear > 1999
      AND startYear < 2021
    GROUP BY decade
) AS s
INNER JOIN (
    -- Action titles per decade, same start-year window as the totals
    SELECT
        (FLOOR(tf.startYear / 10) * 10) AS decade,
        COUNT(*) AS co,
        gd.genreName
    FROM imdb.title_ft AS tf
    INNER JOIN title_genre_bridge AS tgb
        ON tgb.tconst = tf.tconst
    INNER JOIN genre_dt AS gd
        ON gd.genreID = tgb.genreID
    WHERE gd.genreName = 'Action'
      AND tf.startYear > 1999
      AND tf.startYear < 2021
    GROUP BY decade, gd.genreName
) AS d
    ON s.decade = d.decade
ORDER BY s.decade;

=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=





**Top movies of time period**

| **Version** | **Execution Time** | **Optimization Strategies Utilized** |
|----------|----------|----------|
| Naive | 912 seconds | Naive solution, utilized GROUP BY and ORDER BY with the MAX function on (averageRating) inside a subquery |
| First Update | 23.281 seconds | First optimization, utilized PARTITION BY on (startYear) and ROW_NUMBER to get the highest rated movie. WHERE statements to filter out null data and number of votes |
| Optimized | 23.484 seconds (no difference) | Final optimized version, indexes were added on (tconst, genreID) and (startYear, averageRating DESC, numVotes) |

Naive

In [None]:
%%sql

-- Baseline for the Naive timing row. Returns every title whose rating equals
-- the maximum rating found for its start year, one row per genre of the title.
SELECT
    tf.startYear,
    tf.primaryTitle,
    tf.averageRating AS highest,
    gd.genreName
FROM title_genre_bridge AS tgb
INNER JOIN title_ft AS tf
    ON tf.tconst = tgb.tconst
INNER JOIN genre_dt AS gd
    ON gd.genreID = tgb.genreID
WHERE (tf.startYear, tf.averageRating) IN (
    -- best rating seen in each start year
    SELECT
        yearly.startYear,
        MAX(yearly.averageRating)
    FROM title_ft AS yearly
    GROUP BY yearly.startYear
)
ORDER BY tf.startYear;

First Optimization

In [None]:
%%sql

-- Highest-rated title per start year among titles with more than 1000 votes,
-- returned once per genre of the winning title.
--
-- Changes from the earlier form of this query:
--   * ROW_NUMBER now has tie-breakers (more votes first, then tconst), so the
--     winner is repeatable when several titles share the top rating.
--   * Rows with a NULL startYear are excluded. They used to form their own
--     partition and produce a winner for an unknown year, which the naive
--     IN-based query never returns.
--   * The final ORDER BY also sorts by genreName so row order is stable.
WITH ranked_titles AS (
  SELECT
    t.tconst,
    t.startYear,
    t.primaryTitle,
    t.averageRating,
    ROW_NUMBER() OVER (
      PARTITION BY t.startYear
      ORDER BY t.averageRating DESC, t.numVotes DESC, t.tconst
    ) AS rnk
  FROM title_ft t
  WHERE t.startYear IS NOT NULL
    AND t.averageRating IS NOT NULL
    AND t.numVotes > 1000
)
SELECT
  rt.startYear,
  rt.primaryTitle,
  rt.averageRating AS highest,
  gd.genreName
FROM ranked_titles rt
JOIN title_genre_bridge tgb ON rt.tconst = tgb.tconst
JOIN genre_dt gd ON tgb.genreID = gd.genreID
WHERE rt.rnk = 1
ORDER BY rt.startYear, gd.genreName;

Final Optimization With Indexes

In [None]:
%%sql

-- Step 1. Secondary indexes for the ranking CTE and the genre join below.
CREATE INDEX idx_title_ft_startyear_rating_votes
ON title_ft (startYear, averageRating DESC, numVotes);

CREATE INDEX idx_tgb_tconst_genreid
ON title_genre_bridge (tconst, genreID);

-- Step 2. Highest-rated title per start year among titles with more than
-- 1000 votes, returned once per genre of the winning title.
--
-- Changes from the earlier form of this query:
--   * ROW_NUMBER now has tie-breakers (more votes first, then tconst), so the
--     winner is repeatable when several titles share the top rating.
--   * Rows with a NULL startYear are excluded. They used to form their own
--     partition and produce a winner for an unknown year, which the naive
--     IN-based query never returns.
--   * The final ORDER BY also sorts by genreName so row order is stable.
WITH ranked_titles AS (
  SELECT
    t.tconst,
    t.startYear,
    t.primaryTitle,
    t.averageRating,
    ROW_NUMBER() OVER (
      PARTITION BY t.startYear
      ORDER BY t.averageRating DESC, t.numVotes DESC, t.tconst
    ) AS rnk
  FROM title_ft t
  WHERE t.startYear IS NOT NULL
    AND t.averageRating IS NOT NULL
    AND t.numVotes > 1000
)
SELECT
  rt.startYear,
  rt.primaryTitle,
  rt.averageRating AS highest,
  gd.genreName
FROM ranked_titles rt
JOIN title_genre_bridge tgb ON rt.tconst = tgb.tconst
JOIN genre_dt gd ON tgb.genreID = gd.genreID
WHERE rt.rnk = 1
ORDER BY rt.startYear, gd.genreName;

=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=





**Average runtimeMinutes over startYear**

| **Version** | **Execution Time** | **Optimization Strategies Utilized** |
|----------|----------|----------|
| Naive | 330 seconds | Naive solution, utilized GROUP BY and ORDER BY with the AVG function on (runtimeMinutes) |
| First Update | 9.297 seconds | First optimization, utilized WHERE statements to filter out null data, genres, and years|
| Optimized | 3.047 seconds (major difference) | Final optimized version, indexes were added on (tconst, genreID), (startYear, runtimeMinutes, tconst), and (genreName, genreID) |

Naive

In [None]:
%%sql

-- Baseline for the Naive timing row. Mean runtime for every
-- (startYear, genreName) pair, with no filtering.
-- A leftover EXPLAIN prefix was removed so the cell executes the query, as
-- the other naive cells in this notebook do, instead of printing its plan.
-- The aggregate is left unaliased so the result header stays as it was.
SELECT startYear, avg(runtimeMinutes), genreName
FROM title_ft
JOIN title_genre_bridge ON title_ft.tconst = title_genre_bridge.tconst
JOIN genre_dt ON title_genre_bridge.genreID = genre_dt.genreID
GROUP BY startYear, genreName
ORDER BY startYear ASC, genreName;

First Optimization

In [None]:
%%sql

-- Mean runtime per start year for the Action genre only.
-- Keeps rows whose startYear lies strictly between 2000 and 2020 and
-- discards rows where startYear or runtimeMinutes is NULL.
SELECT
    tf.startYear,
    AVG(tf.runtimeMinutes) AS avg_runtime,
    gd.genreName
FROM title_genre_bridge AS tgb
INNER JOIN title_ft AS tf
    ON tf.tconst = tgb.tconst
INNER JOIN genre_dt AS gd
    ON gd.genreID = tgb.genreID
WHERE gd.genreName = 'Action'
  AND tf.startYear > 2000
  AND tf.startYear < 2020
  AND tf.startYear IS NOT NULL
  AND tf.runtimeMinutes IS NOT NULL
GROUP BY
    tf.startYear,
    gd.genreName
ORDER BY
    tf.startYear,
    gd.genreName;

Final Optimization With Indexes

In [None]:
%%sql

-- Step 1. Secondary indexes covering the join and filter columns used below.
-- NOTE(review): idx_tgb_tconst_genreID differs only in letter case from
-- idx_tgb_tconst_genreid, which the top-movies cell earlier in this notebook
-- creates on the same table. MySQL compares index names case-insensitively,
-- so this likely collides unless that index was dropped first - confirm.
ALTER TABLE title_ft
  ADD INDEX idx_title_ft_startYear_runtime_tconst (startYear, runtimeMinutes, tconst);

ALTER TABLE title_genre_bridge
  ADD INDEX idx_tgb_tconst_genreID (tconst, genreID);

ALTER TABLE genre_dt
  ADD INDEX idx_genreName_genreID (genreName, genreID);

-- Step 2. Same filtered query as the First Optimization cell, re-timed with
-- the indexes in place. Action genre, startYear strictly between 2000 and 2020.
SELECT
    tf.startYear,
    AVG(tf.runtimeMinutes) AS avg_runtime,
    gd.genreName
FROM title_genre_bridge AS tgb
INNER JOIN title_ft AS tf
    ON tf.tconst = tgb.tconst
INNER JOIN genre_dt AS gd
    ON gd.genreID = tgb.genreID
WHERE gd.genreName = 'Action'
  AND tf.startYear > 2000
  AND tf.startYear < 2020
  AND tf.startYear IS NOT NULL
  AND tf.runtimeMinutes IS NOT NULL
GROUP BY
    tf.startYear,
    gd.genreName
ORDER BY
    tf.startYear,
    gd.genreName;

=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=





**Filter by type**

| **Version** | **Execution Time** | **Optimization Strategies Utilized** |
|----------|----------|----------|
| Naive | 279 seconds | Naive solution, utilized GROUP BY and ORDER BY with the COUNT function on (typeID)|
| First Update | 9.297 seconds | First optimization, utilized WHERE statements to filter out null data, genres, and types |
| Optimized | 3.0219 seconds (major difference) | Final optimized version, indexes were added on (tconst, genreID) and (typeID, tconst) |

Naive

In [None]:
%%sql

-- Baseline for the Naive timing row. Number of non-NULL typeID values for
-- every (typeID, genreName) pair, with no filtering.
-- The aggregate is left unaliased so the result header stays as it was.
SELECT
    tf.typeID,
    count(typeID),
    gd.genreName
FROM title_ft AS tf
INNER JOIN title_genre_bridge AS tgb
    ON tgb.tconst = tf.tconst
INNER JOIN genre_dt AS gd
    ON gd.genreID = tgb.genreID
GROUP BY
    tf.typeID,
    gd.genreName
ORDER BY
    tf.typeID ASC,
    gd.genreName;

First Optimization

In [None]:
%%sql

-- Row count for a single title type within the Action genre.
-- NOTE(review): typeID is compared with a quoted literal. If the column is
-- numeric, an unquoted literal would avoid an implicit conversion - confirm
-- the column type. The IS NOT NULL test is already implied by the equality.
SELECT
    tf.typeID,
    COUNT(*) AS cnt,
    gd.genreName
FROM title_genre_bridge AS tgb
INNER JOIN title_ft AS tf
    ON tf.tconst = tgb.tconst
INNER JOIN genre_dt AS gd
    ON gd.genreID = tgb.genreID
WHERE gd.genreName = 'Action'
  AND tf.typeID = '1'
  AND tf.typeID IS NOT NULL
GROUP BY
    tf.typeID,
    gd.genreName
ORDER BY
    tf.typeID,
    gd.genreName;


Final Optimization With Indexes

In [None]:
%%sql

-- Step 1. Secondary indexes covering the join and filter columns used below.
-- NOTE(review): idx_tgb_tconst_genreid is also created on the same table by
-- the top-movies cell earlier in this notebook. MySQL-style engines reject a
-- second index with an existing name, so this cell presumably assumes that
-- index was dropped first - confirm.
ALTER TABLE title_ft
  ADD INDEX idx_title_ft_typeid_tconst (typeID, tconst);

ALTER TABLE title_genre_bridge
  ADD INDEX idx_tgb_tconst_genreid (tconst, genreID);

-- Step 2. Same filtered query as the First Optimization cell, re-timed with
-- the indexes in place. Row count for a single title type within Action.
SELECT
    tf.typeID,
    COUNT(*) AS cnt,
    gd.genreName
FROM title_genre_bridge AS tgb
INNER JOIN title_ft AS tf
    ON tf.tconst = tgb.tconst
INNER JOIN genre_dt AS gd
    ON gd.genreID = tgb.genreID
WHERE gd.genreName = 'Action'
  AND tf.typeID = '1'
  AND tf.typeID IS NOT NULL
GROUP BY
    tf.typeID,
    gd.genreName
ORDER BY
    tf.typeID,
    gd.genreName;