# PROYECTO INDIVIDUAL Nº1 (MLOps)

## Importar librerías

In [1]:
import pandas as pd
import ast

## Carga de datos

In [2]:
# Cargar los datos del archivo CSV
#df = pd.read_csv('D:\Soy_HENRY\PI_ML_OPS\Fuente_datos\movies_dataset.csv', low_memory=False)
df = pd.read_csv('Fuente_datos/movies_dataset.csv', low_memory=False)
#df.info()

## Transformaciones

### - Desanidar columnas

In [3]:
# Desanidar la columna belongs_to_collection
def extract_belongs(x):
    if pd.notnull(x):
        try:
            data = ast.literal_eval(x)
            if isinstance(data, dict):
                return [data['name']]
            elif isinstance(data, list):
                return [i['name'] for i in data]
        except (ValueError, TypeError):
            pass
    return []

# Desanidar la columna genres
def extract_genres(x):
    try:
        data = ast.literal_eval(x)
        return [i['name'] for i in data]
    except (ValueError, TypeError):
        return []

# Desanidar la columna production_companies, production_countries y spoken_languages
def extract_list_values(x):
    try:
        data = ast.literal_eval(x)
        return [i['name'] for i in data]
    except (ValueError, TypeError):
        return []

# Aplicar las funciones de extracción a las columnas correspondientes
df['belongs_to_collection'] = df['belongs_to_collection'].apply(extract_belongs)
df['genres'] = df['genres'].apply(extract_genres)
df['production_companies'] = df['production_companies'].apply(extract_list_values)
df['production_countries'] = df['production_countries'].apply(extract_list_values)
df['spoken_languages'] = df['spoken_languages'].apply(extract_list_values)

# Mostrar el resultado
#df.head(2)

### - Los valores nulos: 'revenue' y 'budget', deben ser rellenados por el número 0.

In [4]:
# Rellenar los valores nulos en las columnas "revenue" y "budget" con 0
df['revenue'] = df['revenue'].fillna(0)
df['budget'] = df['budget'].fillna(0)

# Mostrar el resultado
#df.head(2)
#df[df['revenue'] == 0].head(2)
#df[df['budget'] == str(0)].head(2)

### - Los valores nulos: 'release_date' deben eliminarse.

In [5]:
# Eliminar los valores nulos del campo "release_date"
df = df.dropna(subset=['release_date'])
df.shape

(45379, 24)

### - Formato fechas: AAAA-mm-dd y crear la columna 'release_year'

In [6]:
# Convertir la columna "release_date" al tipo de dato de fecha y formatearla
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').dt.strftime('%Y-%m-%d')

# Eliminar las filas con valores nulos en la columna "release_date"
#df = df.dropna(subset=['release_date'])

# Crear la columna "release_year" extrayendo el año de la fecha de estreno
df['release_year'] = pd.to_datetime(df['release_date']).dt.year

# Mostrar el resultado
df['release_date'].head(2)

0    1995-10-30
1    1995-12-15
Name: release_date, dtype: object

### - Crear la columna: 'return'

In [7]:
# Convertir los valores de la columna "budget" a tipo numérico
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')

# Crear la columna "return" con el cálculo del retorno de inversión
df['return'] = df['revenue'] / df['budget']
df['return'] = df['return'].fillna(0)

# Mostrar el resultado
df['return'].head(2)

0    12.451801
1     4.043035
Name: return, dtype: float64

### - Eliminar las columnas que no serán utilizadas

In [8]:
# Lista de columnas a eliminar
columnas_eliminar = ['video', 'imdb_id', 'adult', 'original_title', 'vote_count', 'poster_path', 'homepage']

# Eliminar las columnas
df = df.drop(columnas_eliminar, axis=1)

# Mostrar el resultado
df.head(2)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,release_year,return
0,[Toy Story Collection],30000000.0,"[Animation, Comedy, Family]",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,[Pixar Animation Studios],[United States of America],1995-10-30,373554033.0,81.0,[English],Released,,Toy Story,7.7,1995.0,12.451801
1,[],65000000.0,"[Adventure, Fantasy, Family]",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995-12-15,262797249.0,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,1995.0,4.043035


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45379 entries, 0 to 45465
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  45379 non-null  object 
 1   budget                 45376 non-null  float64
 2   genres                 45379 non-null  object 
 3   id                     45379 non-null  object 
 4   original_language      45368 non-null  object 
 5   overview               44438 non-null  object 
 6   popularity             45377 non-null  object 
 7   production_companies   45379 non-null  object 
 8   production_countries   45379 non-null  object 
 9   release_date           45376 non-null  object 
 10  revenue                45379 non-null  float64
 11  runtime                45130 non-null  float64
 12  spoken_languages       45379 non-null  object 
 13  status                 45296 non-null  object 
 14  tagline                20398 non-null  object 
 15  title  

In [10]:
df.to_csv('df_modificada.csv', index=False)