In [None]:
import pandas as pd

# Open the data-dictionary Excel workbook.
# NOTE(review): this is an absolute path under /mnt/data; it looks environment-specific —
# confirm the file exists wherever this notebook is executed.
diccionario_path = "/mnt/data/Diccionario de Datos - PIMLOps.xlsx"
# Keep the opened workbook in `xls`; the following cell reads the "movies" sheet from it.
xls = pd.ExcelFile(diccionario_path)

# Show the names of the sheets available in the workbook (rendered as the cell output)
xls.sheet_names

In [None]:
# Read the "movies" sheet of the data dictionary from the already-open workbook
df_diccionario_movies = xls.parse(sheet_name="movies")

# Preview the first rows to inspect how the dictionary is laid out
df_diccionario_movies.head()


In [None]:
import numpy as np

# Build a small example dataset that mirrors the structure of the movies table
data_movies = {
    "belongs_to_collection": [
        {"id": 10194, "name": "Toy Story Collection"},
        None,
        {"id": 87118, "name": "Batman Collection"},
    ],
    "budget": [30000000, 150000000, np.nan],  # contains a NaN to exercise the replacement step
    "genres": [
        [{"id": 16, "name": "Animation"}, {"id": 35, "name": "Comedy"}],
        [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}],
        [{"id": 878, "name": "Science Fiction"}],
    ],
    "homepage": ["http://toystory.disney.com/toy-story", None, None],
    "id": [1, 2, 3],
    "imdb_id": ["tt0114709", "tt0468569", "tt0848228"],
    "original_title": ["Toy Story", "The Dark Knight", "The Avengers"],
    "overview": [
        "A cowboy doll is profoundly threatened when a new spaceman figure supplants him as top toy in a boy's room.",
        "When the menace known as the Joker emerges from his mysterious past, he wreaks havoc and chaos.",
        "Earth's mightiest heroes must come together and learn to fight as a team.",
    ],
    "popularity": [21.946943, 35.000123, 50.256789],
    "poster_path": [
        "/rhIRbceoE9lR4veEXuwCC2wARtG.jpg",
        "/qJ2tW6WMUDux911r6m7haRef0WH.jpg",
        "/cezWGskPY5x7GaglTTRN4Fugfb8.jpg",
    ],
    "production_companies": [
        [{"id": 3, "name": "Pixar Animation Studios"}],
        [{"id": 429, "name": "Warner Bros. Pictures"}],
        [{"id": 420, "name": "Marvel Studios"}],
    ],
    "release_date": ["1995-11-22", np.nan, "2012-04-25"],  # contains a NaN to exercise the row removal
    "revenue": [373554033, 1004558444, np.nan],  # contains a NaN to exercise the replacement step
    "runtime": [81, 152, 143],
    "status": ["Released", "Released", "Released"],
    "tagline": ["", "Why So Serious?", "Some assembly required."],
    "title": ["Toy Story", "The Dark Knight", "The Avengers"],
    "video": [False, False, False],
    "vote_average": [8.3, 8.4, 7.7],
    "vote_count": [5415, 16789, 23456],
}

# Build the DataFrame
df_movies = pd.DataFrame(data_movies)

# Apply the requested transformations
# 1. Flatten some nested fields: keep only the collection name and join the
#    production company names into one comma-separated string.
df_movies["belongs_to_collection"] = df_movies["belongs_to_collection"].apply(
    lambda x: x["name"] if isinstance(x, dict) else None
)
df_movies["production_companies"] = df_movies["production_companies"].apply(
    lambda x: ", ".join(d["name"] for d in x) if isinstance(x, list) else None
)

# 2. Replace NaN in revenue and budget with 0
df_movies["revenue"] = df_movies["revenue"].fillna(0)
df_movies["budget"] = df_movies["budget"].fillna(0)

# 3. Drop rows whose release_date is missing.
#    Dates are parsed strictly as YYYY-MM-DD; anything that does not parse becomes NaT
#    and is dropped as well, instead of yielding a bogus year or a ValueError later on.
#    The explicit .copy() makes the filtered frame independent, so the column
#    assignments below cannot trigger SettingWithCopyWarning.
parsed_dates = pd.to_datetime(df_movies["release_date"], format="%Y-%m-%d", errors="coerce")
df_movies = df_movies[parsed_dates.notna()].copy()

# 4. Create the release_year column from the parsed dates (release_date itself is left as-is)
df_movies["release_year"] = parsed_dates.loc[df_movies.index].dt.year.astype(int)

# 5. Compute the return on investment (revenue / budget), vectorised.
#    Non-positive budgets are masked to NaN so the division yields NaN there,
#    which is then reported as 0 — the same rule as "0 when budget is not > 0".
valid_budget = df_movies["budget"].where(df_movies["budget"] > 0)
df_movies["return"] = df_movies["revenue"].div(valid_budget).fillna(0)

# 6. Drop the columns that are not used
df_movies = df_movies.drop(columns=["video", "imdb_id", "original_title", "poster_path", "homepage"])

# Show the transformed dataset
df_movies


In [None]:
import pandas as pd

# Load the movies dataset into `df`, the frame explored by the cells that follow.
# NOTE(review): the path is relative, so the file must sit in the kernel's working directory — confirm.
df = pd.read_csv("movies_dataset.csv")

# Preview the first rows
df.head()


In [None]:
# Print a concise summary of `df`: columns, non-null counts and dtypes
df.info()


In [None]:
# Summary statistics for the numeric columns.
# A notebook cell only renders its last expression, so the first summary used to be
# computed and silently discarded. Each table is now passed to display() (provided
# by the IPython kernel) so both are shown.
display(df.describe())

# Summary statistics for the object (string-like) columns
display(df.describe(include="object"))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of the revenue column (30 bins) with a KDE curve overlaid;
# the title is set on the Axes returned by seaborn.
revenue_ax = sns.histplot(df["revenue"], bins=30, kde=True)
revenue_ax.set_title("Distribución de Revenue")
plt.show()



In [None]:
# Count how many movies fall under each status and draw the counts as a bar chart
status_counts = df["status"].value_counts()
status_ax = status_counts.plot.bar()
status_ax.set_title("Cantidad de Películas por Estado")
plt.show()


In [None]:
from collections import Counter
import ast

# Turn each non-null `genres` string into the Python list of dicts it encodes
genres = df["genres"].dropna().apply(ast.literal_eval)

# Flatten every per-movie list into one list of genre names, then tally them
genre_list = [
    entry["name"]
    for movie_genres in genres
    for entry in movie_genres
]
genre_counts = Counter(genre_list)

# One bar per genre, in the order the genres were first encountered
genre_ax = sns.barplot(
    x=list(genre_counts),
    y=list(genre_counts.values()),
)
plt.xticks(rotation=90)  # vertical labels so the genre names do not overlap
genre_ax.set_title("Películas por Género")
plt.show()
