### Desanidar columna "production_companies"

In [52]:
# Importar librerías
import pandas as pd
import ast
import numpy as np

In [53]:
# Lee el archivo CSV y lo almacena en un DataFrame de Pandas
df = pd.read_csv("../../production.csv")
df.head()

Unnamed: 0,id,production_companies,production_countries
0,862,"[{""name"": ""Pixar Animation Studios"", ""id"": 3}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o..."
1,8844,"[{""name"": ""TriStar Pictures"", ""id"": 559}, {""na...","[{""iso_3166_1"": ""US"", ""name"": ""United States o..."
2,15602,"[{""name"": ""Warner Bros."", ""id"": 6194}, {""name""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o..."
3,31357,"[{""name"": ""Twentieth Century Fox Film Corporat...","[{""iso_3166_1"": ""US"", ""name"": ""United States o..."
4,11862,"[{""name"": ""Sandollar Productions"", ""id"": 5842}...","[{""iso_3166_1"": ""US"", ""name"": ""United States o..."


In [54]:
# Verificar el tipo de datos de cada columna
tipos_de_datos = df.dtypes

# Mostrar los tipos de datos de cada columna
print(tipos_de_datos)

id                       int64
production_companies    object
production_countries    object
dtype: object


In [55]:
# Verificar el tipo de datos de la columna "id"
is_int64 = df['id'].dtype == np.int64

if is_int64:
    print("Todos los valores de la columna 'id' son int64")
else:
    print("Algunos valores de la columna 'id' no son int64")

Todos los valores de la columna 'id' son int64


In [56]:
is_int64 = df['id'].apply(lambda x: isinstance(x, np.int64)).all()

if is_int64:
    print("Todos los valores de la columna 'id' son int64")
else:
    print("Algunos valores de la columna 'id' no son int64")

Algunos valores de la columna 'id' no son int64


In [57]:
# Convertir la columna 'id' a tipo numérico int64 y eliminar filas no convertibles
df['id'] = pd.to_numeric(df['id'], errors='coerce', downcast='integer')
df = df.dropna(subset=['id'])

# Verificar si todas las filas de la columna 'id' son de tipo int64
if df['id'].dtype == np.int64:
    print("Todos los valores de la columna 'id' son int64")
else:
    print("Algunas filas fueron eliminadas y los valores restantes en la columna 'id' son int64")

Algunas filas fueron eliminadas y los valores restantes en la columna 'id' son int64


In [58]:
# Guardar df en nueva variable
new_df = df[['id', 'production_companies']].copy()

In [59]:
# Mostrar las primeras filas del DataFrame
new_df.head()

Unnamed: 0,id,production_companies
0,862,"[{""name"": ""Pixar Animation Studios"", ""id"": 3}]"
1,8844,"[{""name"": ""TriStar Pictures"", ""id"": 559}, {""na..."
2,15602,"[{""name"": ""Warner Bros."", ""id"": 6194}, {""name""..."
3,31357,"[{""name"": ""Twentieth Century Fox Film Corporat..."
4,11862,"[{""name"": ""Sandollar Productions"", ""id"": 5842}..."


In [60]:
# Función para convertir cadenas en listas de diccionarios
def parse_companies(x):
    """
    Convierte una cadena en una lista de diccionarios utilizando ast.literal_eval().

    Args:
        x (str): Cadena a convertir.

    Returns:
        list: Lista de diccionarios resultante.

    """
    try:
        return ast.literal_eval(x)
    except (SyntaxError, ValueError):
        return []

# Convertir las cadenas de texto en la columna "production_companies" en listas de diccionarios
df['production_companies'] = df['production_companies'].apply(parse_companies)

# Crear columnas 'id', 'pc_id' y 'pc_name' en new_df
new_df['id'] = df['id']
new_df['pc_id'] = df['production_companies'].apply(lambda x: ', '.join(str(pc['id']) for pc in x))
new_df['pc_name'] = df['production_companies'].apply(lambda x: ', '.join(pc['name'] for pc in x))

In [61]:
# Mostrar las primeras filas del DataFrame
new_df.head()

Unnamed: 0,id,production_companies,pc_id,pc_name
0,862,"[{""name"": ""Pixar Animation Studios"", ""id"": 3}]",3,Pixar Animation Studios
1,8844,"[{""name"": ""TriStar Pictures"", ""id"": 559}, {""na...","559, 2550, 10201","TriStar Pictures, Teitler Film, Interscope Com..."
2,15602,"[{""name"": ""Warner Bros."", ""id"": 6194}, {""name""...","6194, 19464","Warner Bros., Lancaster Gate"
3,31357,"[{""name"": ""Twentieth Century Fox Film Corporat...",306,Twentieth Century Fox Film Corporation
4,11862,"[{""name"": ""Sandollar Productions"", ""id"": 5842}...","5842, 9195","Sandollar Productions, Touchstone Pictures"


In [62]:
# Eliminar columna production_companies
new_df = new_df.drop('production_companies', axis=1)

In [63]:
# Mostrar las primeras filas del DataFrame
new_df.head()

Unnamed: 0,id,pc_id,pc_name
0,862,3,Pixar Animation Studios
1,8844,"559, 2550, 10201","TriStar Pictures, Teitler Film, Interscope Com..."
2,15602,"6194, 19464","Warner Bros., Lancaster Gate"
3,31357,306,Twentieth Century Fox Film Corporation
4,11862,"5842, 9195","Sandollar Productions, Touchstone Pictures"


In [64]:
# Mostrar dimenciones del df
new_df.shape

(45433, 3)

In [66]:
# # Guardar el DataFrame en un nuevo archivo CSV llamado "production_companies.csv"
# new_df.to_csv('production_companies_result.csv', index=False)