In [594]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import FeatureHasher
from funciones_varias import cargaCsvToDataFrame
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

dfMoviesFinal = cargaCsvToDataFrame("dfMoviesFinal", "datasets_eda")

In [595]:
def hashEncoding(mainDF, column, numBins):
    lista= list(mainDF[column].unique())
    data={column:[[i]for i in lista]}

    df = pd.DataFrame(data)

    hasher = FeatureHasher(n_features=numBins, input_type='string')

    hashed_features = hasher.transform(df[column])

    hashed_df = pd.DataFrame(hashed_features.toarray(), columns=[f"hashed_{i}" for i in range(numBins)])
    
    return hashed_df

In [596]:
#considero que para el modelo de clasificacion no necesito estas columnas
dfMoviesFinal = dfMoviesFinal.drop(columns=["tagline", "id", "status", "", "budget", 
                                            "release_date","return", "franquicia",
                                            "original_language", "production_country", "language"])

In [597]:
#dropeo estas rows porque considero que tienen generos de pelicula que no me sirven para el modelo, son datos erroneos

dfMoviesFinal.drop(dfMoviesFinal[dfMoviesFinal["movie_genre"]== "Aniplex"].index, inplace=True)
dfMoviesFinal.drop(dfMoviesFinal[dfMoviesFinal["movie_genre"]== "Odyssey Media"].index, inplace=True)
dfMoviesFinal.drop(dfMoviesFinal[dfMoviesFinal["movie_genre"]== "Carousel Productions"].index, inplace=True)

In [598]:
#considero que para poder continuar con el modelo de clasificacion, debo poner el tipo de dato que corresponde a cada columna

strings= ["movie_title", "movie_genre", "overview",
            "director", "character", "actor"]
integers=["release_year", "vote_count"]
floats=["runtime", "vote_average", "popularity", "revenue"]

#dropeo valor raro porque sino no puedo ponerle el data type que corresponde
dfMoviesFinal.drop(dfMoviesFinal[dfMoviesFinal['popularity'] == 'Beware Of Frost Bites'].index, inplace=True)

for i in strings:
    dfMoviesFinal[i] = dfMoviesFinal[i].astype("string")
for i in integers:
    dfMoviesFinal[i] = dfMoviesFinal[i].astype("int")
for i in floats:
    dfMoviesFinal[i].replace("", 0, inplace=True) #hay strings vacios en vote_count por eso los lleno con 0
    dfMoviesFinal[i] = dfMoviesFinal[i].astype("float")

In [599]:
#APLICO HASH ENCODING A VARIAS COLUMNAS

hashed_actor =hashEncoding(dfMoviesFinal, "actor", 10)
#concateno y dropeo la columna
dfMoviesFinal = pd.concat([dfMoviesFinal, hashed_actor], axis=1)
dfMoviesFinal.drop(columns=["actor"], inplace=True)

hashed_director =hashEncoding(dfMoviesFinal, "director", 10)
#concateno y dropeo la columna
dfMoviesFinal = pd.concat([dfMoviesFinal, hashed_director], axis=1)
dfMoviesFinal.drop(columns=["director"], inplace=True)

hashed_company =hashEncoding(dfMoviesFinal, "production_company", 10)
#concateno y dropeo la columna
dfMoviesFinal = pd.concat([dfMoviesFinal, hashed_company], axis=1)
dfMoviesFinal.drop(columns=["production_company"], inplace=True)

hashed_character =hashEncoding(dfMoviesFinal, "character", 10)
#concateno y dropeo la columna
dfMoviesFinal = pd.concat([dfMoviesFinal, hashed_character], axis=1)
dfMoviesFinal.drop(columns=["character"], inplace=True)

dfMoviesFinal=dfMoviesFinal.fillna(0)



In [600]:
#uso feature extraction para trabajar con los resultados del EDA sobre wordclouds

#nube de palabras para la columna title
#palabras mas repetidas: love, day, man, girl

#keywords para que arme las columnas
keywordsTitle = ["day", "man", "girl"]

#origen del texto
text = dfMoviesFinal["movie_title"]

#inicializo la vectorizacion
vectorizer = CountVectorizer(vocabulary=keywordsTitle, lowercase=True)

#cuenta cuantas veces una keyword esta en un movie title
feature = vectorizer.transform(dfMoviesFinal["movie_title"])

#crea df que almacena cuantas veces se cuenta en el titulo de cada pelicula cada feature
feature_dfTitles = pd.DataFrame(feature.toarray(), columns=vectorizer.get_feature_names_out())

#concateno y dropeo la columna movie_title
dfMoviesFinal = pd.concat([feature_dfTitles, dfMoviesFinal], axis=1)
dfMoviesFinal.drop(columns=["movie_title"], inplace=True)


In [601]:
#nube de palabras para la columna overview
#palabras mas repetidas: life, find, love, one

#keywords para que arme las columnas
keywordsOverview = ["life", "find", "love", "one"]

#origen del texto
text = dfMoviesFinal["overview"]

#inicializo la vectorizacion
vectorizer = CountVectorizer(vocabulary=keywordsOverview, lowercase=True)

#cuenta cuantas veces una keyword esta en un movie title
feature = vectorizer.transform(dfMoviesFinal["overview"])

#crea df que almacena cuantas veces se cuenta en el titulo de cada pelicula cada feature
feature_dfOverview = pd.DataFrame(feature.toarray(), columns=vectorizer.get_feature_names_out())

#concateno y dropeo la columna movie_title
dfMoviesFinal = pd.concat([feature_dfOverview, dfMoviesFinal], axis=1)
dfMoviesFinal.drop(columns=["overview"], inplace=True)
dfMoviesFinal

Unnamed: 0,life,find,love,one,day,man,girl,movie_genre,popularity,release_year,...,hashed_0,hashed_1,hashed_2,hashed_3,hashed_4,hashed_5,hashed_6,hashed_7,hashed_8,hashed_9
0,0,0,0,0,0,0,0,Animation,21.946943,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,1,0,0,0,0,0,Adventure,17.015539,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,Romance,11.712900,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
3,0,1,0,0,0,0,0,Comedy,3.859495,1995,...,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,Comedy,8.387519,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50068,0,0,0,0,0,0,0,Genero Desconocido,5.683753,1991,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50069,0,0,0,0,0,0,0,Genero Desconocido,0.178241,2011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50070,0,0,0,1,0,0,0,Genero Desconocido,0.903007,2003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50071,1,0,0,3,0,0,0,Genero Desconocido,0.003503,1917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [602]:


dfMoviesFinal.columns
#strings=production_company

Index(['life', 'find', 'love', 'one', 'day', 'man', 'girl', 'movie_genre',
       'popularity', 'release_year', 'revenue', 'runtime', 'vote_average',
       'vote_count', 'hashed_0', 'hashed_1', 'hashed_2', 'hashed_3',
       'hashed_4', 'hashed_5', 'hashed_6', 'hashed_7', 'hashed_8', 'hashed_9',
       'hashed_0', 'hashed_1', 'hashed_2', 'hashed_3', 'hashed_4', 'hashed_5',
       'hashed_6', 'hashed_7', 'hashed_8', 'hashed_9', 'hashed_0', 'hashed_1',
       'hashed_2', 'hashed_3', 'hashed_4', 'hashed_5', 'hashed_6', 'hashed_7',
       'hashed_8', 'hashed_9', 'hashed_0', 'hashed_1', 'hashed_2', 'hashed_3',
       'hashed_4', 'hashed_5', 'hashed_6', 'hashed_7', 'hashed_8', 'hashed_9'],
      dtype='object')

In [603]:
#COMIENZO CON EL MODELO DE MACHINE LEARNING

#asigno X y Y
y= dfMoviesFinal["movie_genre"]
X = dfMoviesFinal.drop(columns=["movie_genre"])

#hago los splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


#PRUEBO ESCALAR MIS COLUMNAS
columnas_num = ["popularity", "runtime", "vote_average", "revenue"]
scaler = StandardScaler()

X_train[columnas_num] = scaler.fit_transform(X_train[columnas_num])
X_test[columnas_num] = scaler.transform(X_test[columnas_num])


#creo el modelo
tree = DecisionTreeClassifier(max_depth=5)

#entreno el modelo
tree.fit(X_train, y_train)

#predicciones
y_pred = tree.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.25661507738392414
