In [12]:
# libraries
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [13]:
# Load the raw IMDb dataset from a CSV path relative to the working directory.
df = pd.read_csv(filepath_or_buffer="./dataframes/desafio_indicium_imdb.csv")

In [14]:
# Dataset preprocessing: convert text-formatted numeric columns to floats.
# Casting to str first keeps this cell idempotent: on a re-run the columns are
# already float, and using `.str` on a float column raises AttributeError.
# regex=False makes the literal (non-regex) replacement explicit, because the
# default value of `regex` differs between pandas versions.
df['Gross'] = df['Gross'].astype(str).str.replace(',', '', regex=False).astype(float)
df['Runtime'] = df['Runtime'].astype(str).str.replace(' min', '', regex=False).astype(float)
# Year values that cannot be parsed become NaN (errors='coerce') instead of raising.
df['Released_Year'] = pd.to_numeric(df['Released_Year'], errors='coerce')

In [15]:
# Target (y) is the IMDb rating; the feature matrix (X) holds the columns
# the model is allowed to see.
y = df['IMDB_Rating']
feature_columns = [
    'Released_Year',
    'Certificate',
    'Runtime',
    'Genre',
    'Meta_score',
    'No_of_Votes',
    'Gross',
]
X = df[feature_columns]

In [16]:
# Column groups, split by how each one is prepared for the model.
categorical = ['Certificate', 'Genre']
numeric = ['Released_Year', 'Runtime', 'Meta_score', 'No_of_Votes', 'Gross']

# Categorical branch: replace NaN with the most frequent category, then
# one-hot encode (handle_unknown='ignore' is set on the encoder).
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: replace NaN with the column median.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_steps, categorical),
    ('num', numeric_steps, numeric),
])

In [17]:
# Model: the preprocessing stage followed by a random-forest regressor
# (200 trees, fixed seed).
forest = RandomForestRegressor(n_estimators=200, random_state=42)
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', forest)])

In [18]:
# Train/test split: hold out 20% of the rows with a fixed seed, then fit the
# whole pipeline on the training portion only.
splits = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = splits
model.fit(X_train, y_train)

In [19]:
# Evaluation: root mean squared error of the predictions on the held-out test set.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.2f}")

RMSE: 0.20


In [20]:
# Single example movie to score, built as a one-row DataFrame whose columns
# carry the same names as the training features.
movie_features = dict(
    Released_Year=1994,
    Certificate='A',
    Runtime=142,
    Genre='Drama',
    Meta_score=20,
    No_of_Votes=110000,
    Gross=90000000,
)
novo_filme = pd.DataFrame([movie_features])

In [21]:
# Prediction: score the example movie and print the first (only) predicted rating.
nota_prevista = model.predict(novo_filme)
mensagem = f"Nota prevista do IMDb: {nota_prevista[0]:.2f}"
print(mensagem)

Nota prevista do IMDb: 7.75


In [22]:
# Save the fitted pipeline (preprocessing + regressor) to disk.
# joblib is imported in the notebook's top import cell; the duplicate
# mean_squared_error import that used to live here was unused and is removed.
# joblib.dump is the last expression, so the list of written file names is
# shown as the cell output.
joblib.dump(model, "modelo_imdb.pkl")

['modelo_imdb.pkl']