In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from wordcloud import WordCloud

from funciones_varias import cargaCsvToDataFrame


# Load the three EDA frames through the project helper, passing "datasets_eda"
# as the second argument each time.
dfMovies, dfCrew, dfCast = (
    cargaCsvToDataFrame(nombre, "datasets_eda")
    for nombre in ("dfMovies", "dfCrew", "dfCast")
)

In [None]:
# Print the column list with non-null counts and dtypes of the movies frame.
dfMovies.info()

In [None]:
# Summary statistics for the movies frame.
dfMovies.describe()


# Observations from the summary:
# - columns where 50% or more of the values are 0: budget, franquicia, revenue, tagline, return
# - most frequent genre: Drama; roughly 25% of the movies are Drama
# - id 141971 appears 3 times
# - the status column does not look very useful: more than 95% of its values are "Released"
# - most frequent language: English (no surprise); roughly 70% of the movies are in English
# - the overview column could help the categorisation model a lot: it has many unique values rich in keywords

# For the machine learning model I consider these columns unnecessary: tagline, id, status

In [None]:
# Count rows that are exact duplicates of an earlier row.
dfMovies.duplicated().sum()

In [None]:
dfMovies["status"].value_counts() # probably not very useful for the model: this column has very low variability

In [None]:
# Number of null values per column.
dfMovies.isnull().sum()

In [None]:
# Current dtype of every column, checked before casting the numeric ones.
dfMovies.dtypes

In [None]:
# popularity contains one stray text value ('Beware Of Frost Bites'); remove that
# row so the column can be analysed as a number.
filasPopularidadInvalida = dfMovies.index[dfMovies["popularity"] == "Beware Of Frost Bites"]
dfMovies.drop(filasPopularidadInvalida, inplace=True)

# Empty strings become 0 so the float casts below do not fail on them.
dfMovies.replace("", 0, inplace=True)

# Cast each numeric column to float, one at a time and in this order.
for columna in ("budget", "popularity", "revenue", "runtime", "vote_average", "return"):
    dfMovies[columna] = dfMovies[columna].astype(float)



In [None]:
# Boxen plot of the movies frame to eyeball spread and outliers per column.
fig, ax = plt.subplots(figsize=(15, 6))
sns.boxenplot(data=dfMovies, ax=ax)
ax.tick_params(axis="x", labelrotation=45)
plt.show()

# revenue apparently has quite a few outliers; it still has to be checked whether
# they are data errors or simply reflect how much more successful some movies are.

In [None]:

# Summary statistics for revenue, to follow up on the outliers seen in the plot.
dfMovies["revenue"].describe()

In [None]:
# Observations on movie_genre:
# - most movies are Drama or Comedy
# - Foreign is the genre with the fewest movies
# - three production companies ended up in the genre column (one movie each);
#   they are not valid genres, so those rows are dropped before modelling

generosInvalidos = ["Aniplex", "Odyssey Media", "Carousel Productions"]
dfMovies.drop(dfMovies.index[dfMovies["movie_genre"].isin(generosInvalidos)], inplace=True)

# Relabel the 0 placeholder as "Genero Desconocido".
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column selection: that form acts on a temporary Series, triggers a pandas
# warning and does not update dfMovies when Copy-on-Write is enabled.
dfMovies["movie_genre"] = dfMovies["movie_genre"].replace(0, "Genero Desconocido")

plt.figure(figsize=(15, 6))
plt.xticks(rotation=45)
sns.histplot(dfMovies["movie_genre"], bins=20)
plt.show()







In [None]:
# Word cloud for the movie_title column.
# Most repeated words: love, day, man, girl

# Replace the 0 placeholder with "Valor Desconocido" so every title is text.
# Assigned back rather than replace(..., inplace=True) on the column selection,
# which acts on a temporary Series and is ignored under pandas Copy-on-Write.
dfMovies["movie_title"] = dfMovies["movie_title"].replace(0, "Valor Desconocido")

# astype(str) keeps the join from raising TypeError if any non-string value remains.
titleStrings = " ".join(dfMovies["movie_title"].astype(str))

wordCloud = WordCloud(width=800, height=400, background_color='white').generate(titleStrings)

plt.figure(figsize=(10, 5))
plt.imshow(wordCloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Word cloud for the overview column.
# Most repeated words: life, find, love, one

# Replace the 0 placeholder with "Valor Desconocido" so every overview is text.
# Assigned back rather than replace(..., inplace=True) on the column selection,
# which acts on a temporary Series and is ignored under pandas Copy-on-Write.
dfMovies["overview"] = dfMovies["overview"].replace(0, "Valor Desconocido")

# The variable name is kept as-is because later cells may read it; here it holds
# the overview text. astype(str) keeps the join from raising TypeError if any
# non-string value remains.
titleStrings = " ".join(dfMovies["overview"].astype(str))

wordCloud = WordCloud(width=800, height=400, background_color='white').generate(titleStrings)

plt.figure(figsize=(10, 5))
plt.imshow(wordCloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Pairwise correlation between the numeric movie metrics, shown as a heatmap.
correlation_columns = [
    "revenue",
    "budget",
    "vote_count",
    "vote_average",
    "runtime",
    "return",
    "popularity",
]
dfCorrelation = dfMovies[correlation_columns]
correlation_matrix = dfCorrelation.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


# Reading of the matrix (if interpreted correctly): the higher the vote_count,
# the higher the revenue of a movie tends to be, and vice versa.
# budget and revenue also show a relatively high correlation of 0.77.

In [98]:
# BUILD THE DATAFRAME FOR MACHINE LEARNING

# Crew entries whose job is "Director", reduced to (id, director).
# rename() gives the same columns as copying "name" into "director" and then
# dropping "name", without assigning into a filtered selection (which can raise
# SettingWithCopyWarning).
dfCrewFiltrado = dfCrew.loc[dfCrew["job"] == "Director", ["id", "name"]].rename(
    columns={"name": "director"}
)

# Cast entries whose order equals the string "0.0".
# NOTE(review): presumably the top-billed actor, with "order" loaded as text - confirm.
# "character" is selected before "name" so the column order (id, character, actor)
# stays the same as before in the exported file.
dfCastFiltrado = dfCast.loc[dfCast["order"] == "0.0", ["id", "character", "name"]].rename(
    columns={"name": "actor"}
)

# Join both frames on id and fill the cast columns of unmatched ids with a
# placeholder string. fillna is applied to the frame with a per-column mapping
# and the result is kept, instead of an in-place fillna on a column selection
# (a temporary Series that pandas Copy-on-Write would not write back).
dfCastCrew = dfCrewFiltrado.merge(dfCastFiltrado, on="id", how="left").fillna(
    {"actor": "Valor Desconocido", "character": "Valor Desconocido"}
)

# Final join.
# NOTE(review): this is a left join, so a movie whose id has no row in dfCastCrew
# keeps NaN in director/character/actor, and an id with several matching rows
# yields several output rows - confirm both are intended.
dfMoviesFinal = dfMovies.merge(dfCastCrew, on="id", how="left")

# Export the frame used by the sixth function.
# NOTE(review): absolute, machine-specific path; the DataFrame index is written
# as an extra first column because index=False is not passed.
dfMoviesFinal.to_csv(r"C:\Users\\Administrator\\Desktop\\dfMoviesFinal.csv")


Unnamed: 0,Unnamed: 1,movie_title,budget,movie_genre,franquicia,id,original_language,overview,popularity,production_company,...,runtime,language,status,tagline,vote_average,vote_count,return,director,character,actor
0,0,Toy Story,30000000.0,Animation,Toy Story Collection,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,Pixar Animation Studios,...,81.0,English,Released,0,7.7,5415,12.45,John Lasseter,Woody (voice),Tom Hanks
1,1,Jumanji,65000000.0,Adventure,0,8844,en,When siblings Judy and Peter discover an encha...,17.015539,TriStar Pictures,...,104.0,English,Released,Roll the dice and unleash the excitement!,6.9,2413,4.04,Joe Johnston,Alan Parrish,Robin Williams
2,2,Grumpier Old Men,0.0,Romance,Grumpy Old Men Collection,15602,en,A family wedding reignites the ancient feud be...,11.712900,Warner Bros.,...,101.0,English,Released,Still Yelling. Still Fighting. Still Ready for...,6.5,92,0.00,Howard Deutch,Max Goldman,Walter Matthau
3,3,Waiting to Exhale,16000000.0,Comedy,0,31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,Twentieth Century Fox Film Corporation,...,127.0,English,Released,Friends are the people who let you be yourself...,6.1,34,5.09,Forest Whitaker,Savannah 'Vannah' Jackson,Whitney Houston
4,4,Father of the Bride Part II,0.0,Comedy,Father of the Bride Collection,11862,en,Just when George Banks has recovered from his ...,8.387519,Sandollar Productions,...,106.0,English,Released,Just When His World Is Back To Normal... He's ...,5.7,173,0.00,Charles Shyer,George Banks,Steve Martin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50068,45460,Robin Hood,0.0,Genero Desconocido,0,30840,en,"Yet another version of the classic epic, with ...",5.683753,0,...,104.0,0,Released,0,5.7,26,0.00,John Irvin,Sir Robert Hode,Patrick Bergin
50069,45462,Century of Birthing,0.0,Genero Desconocido,0,111109,tl,An artist struggles to finish his work while a...,0.178241,0,...,360.0,0,Released,0,9.0,3,0.00,Lav Diaz,Sister Angela,Angel Aquino
50070,45463,Betrayal,0.0,Genero Desconocido,0,67758,en,"When one of her hits goes wrong, a professiona...",0.903007,0,...,90.0,0,Released,A deadly game of wits.,3.8,6,0.00,Mark L. Lester,Emily Shaw,Erika Eleniak
50071,45464,Satan Triumphant,0.0,Genero Desconocido,0,227506,en,"In a small town live two brothers, one a minis...",0.003503,0,...,87.0,0,Released,0,0.0,0,0.00,Yakov Protazanov,,Iwan Mosschuchin
