In [11]:
# %pip install duckdb
# %pip install polars

In [12]:
import duckdb
import polars as pl


In [13]:
# --- 1. Definir URLs dos Parquets no DagsHub ---
base_url = "https://dagshub.com/Matheuskcode/Big-Data-Found/raw/main/data"

files = {
    "ratings": f"{base_url}/ratings.parquet",
    "movies": f"{base_url}/movies.parquet",
    "tags": f"{base_url}/tags.parquet"
}

In [14]:
# --- 2. Conectar ao DuckDB ---
con = duckdb.connect()

In [15]:
# --- 3. Criar views diretamente a partir dos Parquets remotos ---
con.execute(f"CREATE OR REPLACE VIEW ratings AS SELECT * FROM parquet_scan('{files['ratings']}')")
con.execute(f"CREATE OR REPLACE VIEW movies  AS SELECT * FROM parquet_scan('{files['movies']}')")
con.execute(f"CREATE OR REPLACE VIEW tags    AS SELECT * FROM parquet_scan('{files['tags']}')")

<_duckdb.DuckDBPyConnection at 0x1fe27ee85f0>

In [16]:
# --- 4. Análises Exploratórias ---

# Tamanho das tabelas
print('Tamanhos:')
display(con.execute("SELECT COUNT(*) AS n_ratings FROM ratings").df())
display(con.execute("SELECT COUNT(*) AS n_movies FROM movies").df())
display(con.execute("SELECT COUNT(*) AS n_tags FROM tags").df())



Tamanhos:


Unnamed: 0,n_ratings
0,10000054


Unnamed: 0,n_movies
0,10681


Unnamed: 0,n_tags
0,95580


In [17]:
# Distribuição de notas
print("\nDistribuição de notas:")
display(con.execute("""
    SELECT rating, COUNT(*) as freq
    FROM ratings
    GROUP BY rating
    ORDER BY rating
""").df())


Distribuição de notas:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,rating,freq
0,0.5,94988
1,1.0,384180
2,1.5,118278
3,2.0,790306
4,2.5,370178
5,3.0,2356676
6,3.5,879764
7,4.0,2875850
8,4.5,585022
9,5.0,1544812


In [18]:
# Top 10 filmes mais avaliados
print("\nTop 10 filmes mais avaliados:")
display(con.execute("""
    SELECT m.title, COUNT(r.rating) as n_ratings, AVG(r.rating) as avg_rating
    FROM ratings r
    JOIN movies m ON r.movieId = m.movieId
    GROUP BY m.title
    ORDER BY n_ratings DESC
    LIMIT 10
""").df())



Top 10 filmes mais avaliados:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,title,n_ratings,avg_rating
0,Pulp Fiction (1994),34864,4.157426
1,Forrest Gump (1994),34457,4.013582
2,"Silence of the Lambs, The (1991)",33668,4.2042
3,Jurassic Park (1993),32631,3.661564
4,"Shawshank Redemption, The (1994)",31126,4.457238
5,Braveheart (1995),29154,4.08239
6,"Fugitive, The (1993)",28951,4.006925
7,Terminator 2: Judgment Day (1991),28948,3.927698
8,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,28566,4.220209
9,Apollo 13 (1995),27035,3.88735


In [19]:
# Top 10 filmes com melhor média (mínimo 100 avaliações)
print("\nTop 10 filmes com melhor média (>=100 avaliações):")
display(con.execute("""
    SELECT m.title, COUNT(r.rating) as n_ratings, AVG(r.rating) as avg_rating
    FROM ratings r
    JOIN movies m ON r.movieId = m.movieId
    GROUP BY m.title
    HAVING COUNT(r.rating) >= 100
    ORDER BY avg_rating DESC
    LIMIT 10
""").df())


Top 10 filmes com melhor média (>=100 avaliações):


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,title,n_ratings,avg_rating
0,"Shawshank Redemption, The (1994)",31126,4.457238
1,"Godfather, The (1972)",19814,4.415085
2,"Usual Suspects, The (1995)",24037,4.367142
3,Schindler's List (1993),25777,4.363483
4,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),3255,4.321966
5,Casablanca (1942),12507,4.319741
6,Rear Window (1954),8825,4.316544
7,Double Indemnity (1944),2403,4.315439
8,Seven Samurai (Shichinin no samurai) (1954),5751,4.314119
9,"Third Man, The (1949)",3265,4.313629


In [20]:
# Gêneros mais populares
print("\nPopularidade por gênero:")
display(con.execute("""
    SELECT genre, COUNT(*) as n_movies
    FROM (
        SELECT movieId, UNNEST(STRING_SPLIT(genres, '|')) as genre
        FROM movies
    )
    GROUP BY genre
    ORDER BY n_movies DESC
""").df())


Popularidade por gênero:


Unnamed: 0,genre,n_movies
0,Drama,5339
1,Comedy,3703
2,Thriller,1706
3,Romance,1685
4,Action,1473
5,Crime,1118
6,Adventure,1025
7,Horror,1013
8,Sci-Fi,754
9,Fantasy,543


In [21]:
# Usuários mais ativos
print("\nTop 10 usuários mais ativos:")
display(con.execute("""
    SELECT userId, COUNT(*) as n_ratings, AVG(rating) as avg_rating
    FROM ratings
    GROUP BY userId
    ORDER BY n_ratings DESC
    LIMIT 10
""").df())


Top 10 usuários mais ativos:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,userId,n_ratings,avg_rating
0,59269,7359,3.266544
1,67385,7047,3.196608
2,14463,5169,2.404914
3,68259,4483,3.570154
4,27468,4449,3.831423
5,19635,4165,3.497359
6,3817,4165,3.111645
7,63134,3755,3.271904
8,58357,3697,3.003652
9,27584,3479,3.003449


In [22]:
# Tags mais frequentes
print("\nTop 10 tags mais usadas:")
display(con.execute("""
    SELECT LOWER(tag) as tag, COUNT(*) as freq
    FROM tags
    GROUP BY LOWER(tag)
    ORDER BY freq DESC
    LIMIT 10
""").df())


Top 10 tags mais usadas:


Unnamed: 0,tag,freq
0,classic,718
1,tumey's dvds,641
2,based on a book,555
3,r,518
4,less than 300 ratings,505
5,comedy,484
6,action,479
7,nudity (topless),466
8,70mm,464
9,dvd,433
