## EDA

In [35]:
import duckdb
from pathlib import Path

duckdb_path = "data/sakila.duckdb"
Path(duckdb_path).unlink(missing_ok=True)

with duckdb.connect(duckdb_path) as conn, open("sql/load_sakila.sql") as ingest_script:
    conn.sql(ingest_script.read())

    description = conn.sql("DESC;").df()
    films = conn.sql("FROM film;").df()

films.head(3)

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2021-03-06 15:52:00
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2021-03-06 15:52:00
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,1,,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",2021-03-06 15:52:00


In [79]:
description["column_names"][7]

array(['film_id', 'title', 'description', 'release_year', 'language_id',
       'original_language_id', 'rental_duration', 'rental_rate', 'length',
       'replacement_cost', 'rating', 'special_features', 'last_update'],
      dtype=object)

### Create a function to connect to sakila.duckdb and test it with assignment A - show title and length for movies longer than 3 hours:

In [37]:
def query_sakila(query: str, duckdb_path = "data/sakila.duckdb"):
    with duckdb.connect(duckdb_path) as conn:
        df = conn.sql(query=query).df()

    return df

query_sakila("""
            SELECT title, length 
            FROM film
            WHERE length > 180;
            """).head()

Unnamed: 0,title,length
0,ANALYZE HOOSIERS,181
1,BAKED CLEOPATRA,182
2,CATCH AMISTAD,183
3,CHICAGO NORTH,185
4,CONSPIRACY SPIRIT,184


### Which movies have the word "love" in its title? 

In [49]:
query_sakila("""SELECT title, rating, length, description
             FROM film
             WHERE regexp_matches(title, '(?i)(^|[^A-Za-z])LOVE([^A-Za-z]|$)');
             """).head()

Unnamed: 0,title,rating,length,description
0,GRAFFITI LOVE,PG,117,A Unbelieveable Epistle of a Sumo Wrestler And...
1,IDAHO LOVE,PG-13,172,A Fast-Paced Drama of a Student And a Crocodil...
2,INDIAN LOVE,NC-17,135,A Insightful Saga of a Mad Scientist And a Mad...
3,LAWRENCE LOVE,NC-17,175,A Fanciful Yarn of a Database Administrator An...
4,LOVE SUICIDES,R,181,A Brilliant Panorama of a Hunter And a Explore...


### Calculate the following statistics on movie length:
- shortest
- average
- median
- longest

In [69]:
query_sakila("""SELECT 
             ROUND(MIN(length), 0):: INTEGER as shortest_movie_min,
             ROUND(AVG(length), 0):: INTEGER AS average_movie_length_min,
             ROUND(MEDIAN(length), 0):: INTEGER AS median_movie_length_min,
             ROUND(MAX(length), 0):: INTEGER AS longest_movie_min
             FROM film
             """)

Unnamed: 0,shortest_movie_min,average_movie_length_min,median_movie_length_min,longest_movie_min
0,46,115,114,185


### Show the 10 most expensive movies to rent per day

In [98]:
query_sakila("""SELECT title, ROUND(rental_duration / rental_rate, 2) AS rent_per_day
             FROM film
             ORDER BY rent_per_day DESC;
             """).head(10)

Unnamed: 0,title,rent_per_day
0,GANDHI KWAI,7.07
1,SPIRIT FLINTSTONES,7.07
2,STORY SIDE,7.07
3,SHEPHERD MIDSUMMER,7.07
4,CRUELTY UNFORGIVEN,7.07
5,PRIMARY GLASS,7.07
6,UNFORGIVEN ZOOLANDER,7.07
7,CURTAIN VIDEOTAPE,7.07
8,NOTORIOUS REUNION,7.07
9,HOLLYWOOD ANONYMOUS,7.07


### Show the 10 actors who has played in most movies

In [126]:
query_sakila("""SELECT 
             a.first_name || ' ' || a.last_name AS actor,
             COUNT(DISTINCT f.film_id) AS number_of_movies
             FROM film f
                LEFT JOIN film_actor fa ON f.film_id = fa.film_id
                LEFT JOIN actor a ON a.actor_id = fa.actor_id
             GROUP BY actor
             ORDER BY number_of_movies DESC
            ;
             """).head(10)

Unnamed: 0,actor,number_of_movies
0,SUSAN DAVIS,54
1,GINA DEGENERES,42
2,WALTER TORN,41
3,MARY KEITEL,40
4,MATTHEW CARREY,39
5,SANDRA KILMER,37
6,SCARLETT DAMON,36
7,VIVIEN BASINGER,35
8,ANGELA WITHERSPOON,35
9,HENRY BERRY,35
