# PProductions — EDA e Modelagem (IMDb)

Este notebook executa:
1. **EDA** com variáveis numéricas, categóricas e de texto.
2. **Insights** sobre fatores de faturamento (Gross).
3. **Modelagem** da **nota IMDb** (regressão) e do **faturamento** (regressão).
4. **Salvamento do modelo** em arquivo `.pkl`.

In [None]:

import pandas as pd, numpy as np, matplotlib.pyplot as plt
from pathlib import Path
# Default size for every matplotlib figure created from here on.
plt.rcParams.update({'figure.figsize': (8, 5)})

# Relative path of the input CSV; adjust if the file lives elsewhere.
DATA_PATH = Path('../data/imdb.csv')
assert DATA_PATH.exists(), f"Arquivo não encontrado: {DATA_PATH}. Coloque o seu csv em data/imdb.csv"

# df_raw keeps the untouched load; the later cells add columns to the df copy.
df_raw = pd.read_csv(DATA_PATH)
df = df_raw.copy()
print(df.shape)
df.head(3)


In [None]:

# Mapeamento tolerante de colunas
def find_col(df, candidates):
    """Return the label of the column in ``df`` that best matches ``candidates``.

    Matching ignores case and surrounding whitespace and runs in two passes:

    1. exact match against each candidate;
    2. substring match (the candidate appears inside the column name).

    Within each pass the candidates are tried in the order given, so the
    first entry of ``candidates`` has the highest priority regardless of the
    order of the columns in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched. Labels are compared through
        ``str(label)``, so non-string labels are tolerated.
    candidates : iterable of str
        Accepted spellings for the wanted column, most preferred first.
        Blank candidates are ignored.

    Returns
    -------
    The original column label (not normalised), or ``None`` if nothing matches.
    """
    # Normalise once, but keep the original label so the caller can index df with it.
    columns = [(label, str(label).strip().lower()) for label in df.columns]
    # A blank candidate would substring-match every column, so drop it.
    wanted = [w for w in (str(cand).strip().lower() for cand in candidates) if w]

    # Pass 1: exact match, in candidate priority order.
    for w in wanted:
        for label, normalised in columns:
            if normalised == w:
                return label

    # Pass 2: substring match, again in candidate priority order.
    for w in wanted:
        for label, normalised in columns:
            if w in normalised:
                return label

    return None

# Logical field name -> header spellings accepted for it in the CSV.
field_candidates = {
    'title': ['Series_Title','Title','Movie','SeriesTitle'],
    'year': ['Released_Year','Year','Release_Year'],
    'cert': ['Certificate','Rating','MPAA'],
    'runtime': ['Runtime','Runtime_Minutes'],
    'genre': ['Genre','Genres'],
    'overview': ['Overview','Description','Plot'],
    'metascore': ['Meta_score','Metascore'],
    'director': ['Director'],
    'star1': ['Star1','Actor1','Lead1'],
    'star2': ['Star2','Actor2','Lead2'],
    'star3': ['Star3','Actor3','Lead3'],
    'star4': ['Star4','Actor4','Lead4'],
    'votes': ['No_of_Votes','Votes','NumVotes'],
    'gross': ['Gross','BoxOffice','Revenue','Worldwide_Gross'],
    'imdb': ['IMDB_Rating','IMDb','Rating_IMDb'],
}

# Resolve every logical field to the actual column name (None when not found).
col_map = {
    field: find_col(df, names)
    for field, names in field_candidates.items()
}
col_map


In [None]:

# Limpeza 
runtime_col = col_map['runtime']
if runtime_col:
    # Keep only the first run of digits of each value (presumably the column
    # looks like "142 min" - TODO confirm); values without digits become NaN.
    runtime_digits = df[runtime_col].astype(str).str.extract(r'(\d+)')[0]
    df['runtime_min'] = pd.to_numeric(runtime_digits, errors='coerce')
else:
    # No runtime column detected: keep the derived column, filled with NaN.
    df['runtime_min'] = np.nan

def parse_money(x):
    """Convert a currency-formatted value to ``float``.

    Thousands separators (``,``) and the currency symbols ``$``, ``£`` and
    ``€`` are removed before the conversion.

    Parameters
    ----------
    x : scalar
        Raw cell value (string, number or missing).

    Returns
    -------
    float
        The parsed amount, or ``np.nan`` when ``x`` is missing or the cleaned
        text is not a valid number.
    """
    if pd.isna(x):
        return np.nan

    cleaned = str(x)
    for symbol in (',', '$', '£', '€'):
        cleaned = cleaned.replace(symbol, '')

    try:
        return float(cleaned)
    except ValueError:
        # Only a failed conversion maps to NaN; a bare `except` would also
        # swallow KeyboardInterrupt / SystemExit.
        return np.nan

# Gross goes through parse_money; a missing source column yields an all-NaN column.
gross_col = col_map['gross']
df['gross_num'] = df[gross_col].apply(parse_money) if gross_col else np.nan

# The remaining fields are coerced directly; unparsable values become NaN.
for field_key, derived_name in [
    ('year', 'year_num'),
    ('votes', 'votes_num'),
    ('metascore', 'metascore_num'),
    ('imdb', 'imdb_num'),
]:
    source_col = col_map[field_key]
    if source_col:
        df[derived_name] = pd.to_numeric(df[source_col], errors='coerce')
    else:
        df[derived_name] = np.nan

# Summary statistics of every derived numeric column.
df[['runtime_min','gross_num','year_num','votes_num','metascore_num','imdb_num']].describe()


In [None]:

# Distribuições
# One 30-bin histogram per derived numeric column, each in its own figure.
for feature in ['imdb_num','gross_num','votes_num','metascore_num','runtime_min']:
    if feature not in df:
        continue
    df[feature].plot(kind='hist', bins=30, alpha=0.7, title=f'Distribuição: {feature}')
    plt.xlabel(feature)
    plt.show()


In [None]:

# Correlação (Spearman)
# Pairwise Spearman (rank) correlation between the derived numeric columns.
numeric_features = ['imdb_num','gross_num','votes_num','metascore_num','runtime_min','year_num']
corr = df.loc[:, numeric_features].corr(method='spearman')
corr


In [None]:

# Gêneros
if col_map['genre']:
    # Genre cells are comma-separated lists. Remove the whitespace around the
    # commas BEFORE building the indicator columns, so that "Drama" and
    # " Drama" land in the same column. Stripping the column names afterwards
    # would only produce duplicate labels whose counts stay split.
    genre_labels = (
        df[col_map['genre']]
        .dropna()
        .astype(str)
        .str.strip()
        .str.replace(r'\s*,\s*', ',', regex=True)
    )
    genres = genre_labels.str.get_dummies(sep=',')

    # Number of titles carrying each genre, 15 most frequent first.
    top_genres = genres.sum().sort_values(ascending=False).head(15)
    top_genres.plot(kind='bar', title='Top gêneros'); plt.ylabel('Contagem'); plt.show()


In [None]:

# Texto: Overview
from wordcloud import WordCloud, STOPWORDS
if col_map['overview']:
    overview = df[col_map['overview']]

    # Word cloud built from every non-missing overview joined into one text.
    text_all = ' '.join(overview.dropna().astype(str).tolist())
    wc = WordCloud(stopwords=STOPWORDS, width=1000, height=500).generate(text_all)
    plt.imshow(wc); plt.axis('off'); plt.title('Nuvem de palavras — Overview'); plt.show()

    # Keep missing overviews as NaN: astype(str) alone would turn them into the
    # literal text "nan" and count them as 3 characters / 1 word.
    overview_text = overview.astype(str).where(overview.notna())
    df['overview_len'] = overview_text.str.len()
    df['overview_words'] = overview_text.str.split().str.len()

    # A bare expression nested inside `if` is not echoed by the notebook, so
    # the summary is displayed explicitly (`display` is provided by IPython).
    display(df[['overview_len','overview_words']].describe())


In [None]:

# Hipóteses rápidas
import pandas as pd
# Spearman correlation of each numeric feature against the IMDb rating.
hyp = {
    feature: df[[feature, 'imdb_num']].corr(method='spearman').iloc[0, 1]
    for feature in ['votes_num','metascore_num','runtime_min','year_num']
    if feature in df
}
pd.Series(hyp, name='Spearman vs IMDb').sort_values(ascending=False)


In [None]:

# Modelagem IMDb
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Local import: only the numeric branch of the preprocessing below needs it.
from sklearn.impute import SimpleImputer

target = 'imdb_num'
# The derived column always exists (possibly all-NaN), so also require actual values.
assert target in df and df[target].notna().any(), "Coluna de alvo IMDb não detectada; ajuste col_map ou nome da coluna no dataset."

# Categorical features: the detected source columns, without duplicates
# (two logical fields may have resolved to the same column).
cat_cols = list(dict.fromkeys(
    c for c in [col_map['cert'], col_map['genre'], col_map['director'],
                col_map['star1'], col_map['star2'], col_map['star3'], col_map['star4']]
    if c
))
# Numeric features: skip derived columns that carry no information (all NaN).
num_cols = [
    c for c in ['runtime_min','votes_num','metascore_num','year_num','gross_num']
    if c in df and df[c].notna().any()
]

# Rows without a target value can be used neither for fitting nor for scoring.
model_df = df.dropna(subset=[target])
X = model_df[cat_cols + num_cols].copy()
y = model_df[target].copy()

preprocess = ColumnTransformer([
    # Median-impute missing numerics so estimators that do not accept NaN can fit.
    ('num', Pipeline([
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler(with_mean=False)),
    ]), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', min_frequency=10), cat_cols),
], remainder='drop')

pipe = Pipeline([('prep', preprocess), ('rf', RandomForestRegressor(n_estimators=400, random_state=42))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# sqrt(MSE) instead of `squared=False`: that keyword was deprecated and later
# removed from mean_squared_error, while this form works on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
mae = mean_absolute_error(y_test, pred)
print({'RMSE': rmse, 'MAE': mae})


In [None]:

# Salvar .pkl
import joblib, os
# Persist the IMDb pipeline fitted above; the output folder is created on demand.
path = '../models/model_imdb.pkl'
os.makedirs(os.path.dirname(path), exist_ok=True)
joblib.dump(pipe, path)
path


In [None]:

# Fatores de faturamento (Gross) — 
from sklearn.inspection import permutation_importance
from sklearn.ensemble import GradientBoostingRegressor

if 'gross_num' in df and df['gross_num'].notna().sum() > 50:
    # Local import: only the numeric branch of the preprocessing below needs it.
    from sklearn.impute import SimpleImputer

    # Only rows with a known gross can serve as supervised examples.
    gross_df = df[df['gross_num'].notna()]

    # The target must not be one of the predictors (num_cols contains
    # 'gross_num'), otherwise the model just copies the answer. Columns that
    # are entirely NaN in this subset are skipped as well.
    num_cols2 = [c for c in num_cols if c != 'gross_num' and gross_df[c].notna().any()]
    cat_cols2 = list(dict.fromkeys(cat_cols))  # guard against repeated columns

    y2 = gross_df['gross_num']
    X2 = gross_df[cat_cols2 + num_cols2].copy()
    X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

    pre2 = ColumnTransformer([
        # GradientBoostingRegressor does not accept NaN, so impute numerics first.
        ('num', Pipeline([
            ('impute', SimpleImputer(strategy='median')),
            ('scale', StandardScaler(with_mean=False)),
        ]), num_cols2),
        ('cat', OneHotEncoder(handle_unknown='ignore', min_frequency=10), cat_cols2),
    ], remainder='drop')

    model_rev = Pipeline([('prep', pre2), ('gbr', GradientBoostingRegressor(random_state=42))])
    model_rev.fit(X2_train, y2_train)
    y2_pred = model_rev.predict(X2_test)
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    print('RMSE Gross:', float(np.sqrt(mean_squared_error(y2_test, y2_pred))))

    # Permutation importance shuffles the pipeline's INPUT columns, so the
    # scores line up with X2_test.columns, not with the one-hot encoded names.
    pi = permutation_importance(model_rev, X2_test, y2_test, n_repeats=5, random_state=42)
    importances = pd.Series(pi.importances_mean, index=X2_test.columns)
    # A bare expression nested inside `if` is not echoed by the notebook, so
    # the ranking is displayed explicitly (`display` is provided by IPython).
    display(importances.sort_values(ascending=False).head(20))
