In [None]:
import duckdb
import pandas as pd

# --- 1. Source file URLs on DagsHub ---
base_url = "https://dagshub.com/Matheuskcode/Big-Data-Found/raw/main/data"

files = {
    "ratings": f"{base_url}/ratings.dat",
    "movies": f"{base_url}/movies.dat",
    "tags": f"{base_url}/tags.dat"
}

# Column name -> DuckDB type for every file, listed in the order the fields
# appear on each '::'-delimited line.
schemas = {
    "ratings": {"userId": "BIGINT", "movieId": "BIGINT", "rating": "DOUBLE", "timestamp": "BIGINT"},
    "movies": {"movieId": "BIGINT", "title": "VARCHAR", "genres": "VARCHAR"},
    "tags": {"userId": "BIGINT", "movieId": "BIGINT", "tag": "VARCHAR", "timestamp": "BIGINT"},
}

# --- 2. Connect to DuckDB ---
con = duckdb.connect()


# --- 3. Read the .dat files straight from their URLs ---
def read_dat(name):
    """Load one remote ``.dat`` file into a pandas DataFrame.

    Args:
        name: Key shared by ``files`` and ``schemas``
            ("ratings", "movies" or "tags").

    Returns:
        pandas.DataFrame holding the columns declared in ``schemas[name]``.

    Raises:
        KeyError: If ``name`` is not a key of ``files`` / ``schemas``.
    """
    # Render the schema as the struct literal DuckDB expects for `columns`.
    column_spec = ", ".join(
        f"'{column}': '{dtype}'" for column, dtype in schemas[name].items()
    )
    # quote='' switches off quote-character handling for these files.
    # NOTE(review): ignore_errors=true makes DuckDB skip lines it cannot parse
    # instead of raising, so malformed rows vanish silently -- compare the
    # resulting row counts with the source files if completeness matters.
    query = f"""
        SELECT * FROM read_csv_auto('{files[name]}',
            delim='::',
            columns={{{column_spec}}},
            ignore_errors=true,
            quote=''
        )
    """
    return con.execute(query).df()


ratings = read_dat("ratings")
movies = read_dat("movies")
tags = read_dat("tags")
# --- 4. Save as Parquet ---
# Two stray lines (a bare `0` and an indented `n`) used to sit above this
# section; the indented one made the whole cell fail to compile with an
# IndentationError on a fresh run, so they were removed.
# index=False keeps the pandas index out of the written files.
ratings.to_parquet("ratings.parquet", index=False)
movies.to_parquet("movies.parquet", index=False)
tags.to_parquet("tags.parquet", index=False)

# --- 5. Exploratory analysis ---
# Report the (rows, columns) shape of each loaded table.
for shape_label, frame in (
    ("Ratings shape:", ratings),
    ("Movies shape:", movies),
    ("Tags shape:", tags),
):
    print(shape_label, frame.shape)

# How many ratings were given at each rating value, ordered by rating value.
print("\nDistribuição de notas:")
rating_distribution = ratings['rating'].value_counts().sort_index()
print(rating_distribution)

# The five movies with the most ratings, joined to their titles.
print("\nTop 5 filmes mais avaliados:")
ratings_per_movie = ratings.groupby("movieId").size()
most_rated = ratings_per_movie.sort_values(ascending=False).head(5)
# reset_index() on the unnamed count Series labels the count column with the
# integer 0, which is why 0 is selected below.
top_movies = pd.merge(most_rated.reset_index(), movies, on="movieId")
print(top_movies[['movieId','title',0]])

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Ratings shape: (10000054, 4)
Movies shape: (10681, 3)
Tags shape: (95580, 4)

Distribuição de notas:
rating
0.5      94988
1.0     384180
1.5     118278
2.0     790306
2.5     370178
3.0    2356676
3.5     879764
4.0    2875850
4.5     585022
5.0    1544812
Name: count, dtype: int64

Top 5 filmes mais avaliados:
   movieId                             title      0
0      296               Pulp Fiction (1994)  34864
1      356               Forrest Gump (1994)  34457
2      593  Silence of the Lambs, The (1991)  33668
3      480              Jurassic Park (1993)  32631
4      318  Shawshank Redemption, The (1994)  31126


In [1]:
# Run from inside the local repository folder.
# Each command uploads one local Parquet file to the data/ directory of the
# Matheuskcode/Big-Data-Found repository on DagsHub.
# NOTE(review): --update presumably replaces a file that already exists at the
# target path -- confirm against the dagshub CLI help.
!dagshub upload Matheuskcode/Big-Data-Found \
  "ratings.parquet" data/ratings.parquet --update

!dagshub upload Matheuskcode/Big-Data-Found \
  "movies.parquet" data/movies.parquet --update

!dagshub upload Matheuskcode/Big-Data-Found \
  "tags.parquet" data/tags.parquet --update

Accessing as Matheuskcode
Uploading files (1) to "Matheuskcode/Big-Data-Found"...
Upload finished successfully!
Accessing as Matheuskcode
Uploading files (1) to "Matheuskcode/Big-Data-Found"...
Upload finished successfully!
Accessing as Matheuskcode
Uploading files (1) to "Matheuskcode/Big-Data-Found"...
Upload finished successfully!


In [1]:
# Commit every change in the local repository and push it to GitHub
# (remote `origin`, branch `main`).
# NOTE(review): the repository path is an absolute, machine-specific location;
# this cell will not work unchanged on another machine or account.
# The steps are chained with &&, so each one runs only if the previous one
# succeeded. `git commit` exits non-zero when there is nothing to commit, which
# stops the chain before `git push` -- any already-committed but unpushed work
# is then not pushed by this cell.
!cd "C:/Users/mathe/OneDrive/Documentos/GitHub/Data-Found-Project" && \
git add . && \
git commit -m "Atualiza repositório online" && \
git push origin main

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
