In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pickle

# Load the dataset, then drop rows that are missing any modelling column and
# exact duplicate rows. Chained (non in-place) calls rebuild `df` from the
# file on every run of the cell instead of mutating an existing frame.
df = (
    pd.read_csv('desafio_indicium_imdb.csv')
    .dropna(subset=['IMDB_Rating', 'Genre', 'Director', 'Gross', 'No_of_Votes', 'Meta_score'])
    .drop_duplicates()
)

# 1. Select the input variables (features) and the target
features = ['Genre', 'Director', 'Gross', 'No_of_Votes', 'Meta_score']
target = 'IMDB_Rating'


def build_features(raw):
    """Return a copy of ``raw[features]`` with the engineered columns added.

    The ColumnTransformer defined further down selects ``Gross_log`` and
    ``Genre_main``, which do not exist in the raw data. Keeping their
    construction in one function lets new data be prepared exactly like the
    training data before it is handed to the fitted pipeline.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame containing at least the columns listed in ``features``, with no
        missing values in ``Gross`` or ``Genre``.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the ``features`` columns (``Gross``
        converted to float) plus ``Gross_log`` and ``Genre_main``.

    Raises
    ------
    ValueError
        If a ``Gross`` value cannot be parsed as a number once commas are
        removed.
    """
    # 2. Pre-processing and variable transformation
    engineered = raw.loc[:, features].copy()

    # Gross may hold text with thousands separators (e.g. "28,341,469").
    # Casting to str first lets already-numeric entries take the same path.
    engineered['Gross'] = (
        engineered['Gross']
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
    # log1p compresses the long right tail and is defined at zero.
    engineered['Gross_log'] = np.log1p(engineered['Gross'])

    # Genre is a comma-separated list; keep only the first genre listed.
    engineered['Genre_main'] = engineered['Genre'].str.split(',').str[0].str.strip()
    return engineered


# build_features works on its own copy, so `df` itself is left untouched.
X = build_features(df)
y = df.loc[:, target].copy()

# Column-wise preprocessing: one-hot encode the categorical columns (categories
# unseen during fit are ignored at transform time) and standardise the numeric
# ones. Columns of X that are not listed here are not passed to the model.
categorical_columns = ['Genre_main', 'Director']
numeric_columns = ['Gross_log', 'No_of_Votes', 'Meta_score']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('num', StandardScaler(), numeric_columns),
    ]
)

# 3. Split the data into train and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Build and train the regression model. The preprocessor is a pipeline
# step, so it is fitted on the training split only.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ]
)
model_pipeline.fit(X_train, y_train)

# 5. Predict on the held-out split and evaluate the model
y_pred = model_pipeline.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Erro Quadrático Médio (MSE): {mse:.4f}")
print(f"Coeficiente de Determinação (R²): {r2:.4f}")

Erro Quadrático Médio (MSE): 0.0462
Coeficiente de Determinação (R²): 0.4347


In [2]:
# Persist the fitted pipeline (preprocessing + regressor) to disk.
# Pickle files can execute arbitrary code when loaded, so only load this
# artefact from a trusted location.
with open('modelo_preditivo_imdb.pkl', 'wb') as model_file:
    pickle.dump(model_pipeline, model_file)