In [11]:
!pip install pandas pyarrow fastparquet



In [12]:
import pandas as pd
import pyarrow
from pathlib import Path

In [13]:
def normalizarFechas(series):
    dt = pd.to_datetime(series, errors="coerce")
    return dt.dt.strftime("%Y-%m-%d").fillna("")

def convertirANumerico(series):
    return pd.to_numeric(series, errors="coerce")

In [14]:
INPUT_CSV  = Path("../../Evaluaciones/EV02/movies_metadata.csv")  # ruta al CSV original
OUTPUT_CSV = Path("./movies_metadata_final.csv")  # salida UTF-8
OUTPUT_PARQUET = Path("./movies_metadata_final.parquet")  # para Athena

In [15]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_colwidth', 70)   

In [16]:
df = pd.read_csv(INPUT_CSV, low_memory=False, encoding="utf-8", dtype=str)
df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's bi...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-10-30,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an enchanted board game that...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film',...","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-12-15,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413


In [17]:
df["production_companies"]

0                               [{'name': 'Pixar Animation Studios', 'id': 3}]
1        [{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film',...
2        [{'name': 'Warner Bros.', 'id': 6194}, {'name': 'Lancaster Gate', ...
3              [{'name': 'Twentieth Century Fox Film Corporation', 'id': 306}]
4        [{'name': 'Sandollar Productions', 'id': 5842}, {'name': 'Touchsto...
                                         ...                                  
45461                                                                       []
45462                                   [{'name': 'Sine Olivia', 'id': 19653}]
45463                        [{'name': 'American World Pictures', 'id': 6165}]
45464                                     [{'name': 'Yermoliev', 'id': 88753}]
45465                                                                       []
Name: production_companies, Length: 45466, dtype: object

In [18]:
columnasUtiles = [
    "id",                    
    "original_title",        
    "release_date",          
    "budget",                
    "revenue",               
    "popularity",            
    "vote_average",
    "vote_count",          
    "production_companies",  
    "genres"                 
]

In [19]:
df_trucado = df[columnasUtiles].copy()
columnasDescartadas = [c for c in df.columns if c not in columnasUtiles]
print("Columnas descartadas:", len(columnasDescartadas))
print("Columnas finales:", len(df_trucado.columns))

Columnas descartadas: 14
Columnas finales: 10


In [20]:

df_trucado["release_date"] = pd.to_datetime(df_trucado["release_date"], errors="coerce")
df_trucado["release_date"] = df_trucado["release_date"].dt.strftime("%Y-%m-%d").fillna("")

for col in ["budget", "revenue", "popularity", "vote_average"]:
    df_trucado[col] = pd.to_numeric(df_trucado[col], errors="coerce")

df_trucado["id"] = df_trucado["id"].astype(str).str.strip()

df_trucado.head(3)

Unnamed: 0,id,original_title,release_date,budget,revenue,popularity,vote_average,vote_count,production_companies,genres
0,862,Toy Story,1995-10-30,30000000.0,373554033.0,21.946943,7.7,5415,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'..."
1,8844,Jumanji,1995-12-15,65000000.0,262797249.0,17.015539,6.9,2413,"[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film',...","[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {..."
2,15602,Grumpier Old Men,1995-12-22,0.0,0.0,11.7129,6.5,92,"[{'name': 'Warner Bros.', 'id': 6194}, {'name': 'Lancaster Gate', ...","[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]"


In [21]:
print("Filas:", len(df_trucado))
print("Tipos:")
print(df_trucado.dtypes)

print("\nNulos por columna:")
print(df_trucado.isna().sum())


Filas: 45466
Tipos:
id                       object
original_title           object
release_date             object
budget                  float64
revenue                 float64
popularity              float64
vote_average            float64
vote_count               object
production_companies     object
genres                   object
dtype: object

Nulos por columna:
id                      0
original_title          0
release_date            0
budget                  3
revenue                 6
popularity              6
vote_average            6
vote_count              6
production_companies    3
genres                  0
dtype: int64


In [23]:
# Guardar CSV
df_trucado.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print("CSV guardado en:", OUTPUT_CSV.resolve())

# Guardar Parquet (sin encoding ya que Parquet maneja la codificación internamente)
df_trucado.to_parquet(OUTPUT_PARQUET, index=False, engine='fastparquet')  
print("Parquet guardado en:", OUTPUT_PARQUET.resolve())

CSV guardado en: /home/matti/Documents/UDD/ADB/Entregas/MattiasMorales_EV02/movies_metadata_final.csv
Parquet guardado en: /home/matti/Documents/UDD/ADB/Entregas/MattiasMorales_EV02/movies_metadata_final.parquet
