# **Parte 1**

### **DataSet**

In [None]:
import pandas as pd

# Load the IMDB movies dataset used throughout Part 1.
df = pd.read_csv("imdb_movies.csv")
# Bare last expression: the notebook renders the DataFrame as the cell output.
df

### **Tratamento de Dados**

**Valores em Falta e Escolha de Variáveis**

In [None]:
# df.info() showed two columns with missing values: 'genre' and 'crew'.

# Keep the movies that have no genre by giving them an explicit 'Unknown' label.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which can operate on a temporary
# object and leave `df` untouched (and is rejected under Copy-on-Write).
df['genre'] = df['genre'].fillna('Unknown')

# Remove the columns that are not used by the rest of the analysis.
# errors='ignore' lets the cell be re-run after the columns are already gone.
df = df.drop(columns=['crew', 'overview', 'orig_title'], errors='ignore')

df.info()


### **Análise Exploratória de Dados**

**Estatísticas Descritivas**

In [None]:
# Descriptive statistics of the cleaned DataFrame, rendered as the cell output.
df.describe()

**Visualização de Dados**

In [None]:
import matplotlib.pyplot as plt

def plot_histogram(data, column, title, xlabel, color, bins=20):
    """Draw the histogram of one column on its own 12x6 figure and show it.

    Parameters:
        data: DataFrame holding the column to plot.
        column: name of the column whose values are binned.
        title: figure title.
        xlabel: label of the x axis (the y axis is always 'Frequência').
        color: bar colour passed to ``plt.hist``.
        bins: number of histogram bins (20 by default).
    """
    plt.figure(figsize=(12, 6))
    plt.hist(data[column], bins=bins, color=color, alpha=0.7)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequência')
    plt.grid(True)
    plt.show()


# One entry per figure: (column, title, x-axis label, colour).
# Order: score, budget, revenue.
HISTOGRAM_SPECS = [
    ('score', 'Distribuição da Pontuação dos Filmes', 'Pontuação', 'blue'),
    ('budget_x', 'Distribuição do Orçamento dos Filmes', 'Orçamento', 'green'),
    ('revenue', 'Distribuição da Receita dos Filmes', 'Receita', 'red'),
]

for histogram_spec in HISTOGRAM_SPECS:
    plot_histogram(df, *histogram_spec)


**Identificar Outliers**

In [None]:
def plot_boxplot(data, column, title, xlabel):
    """Draw a horizontal box plot of one column on its own 8x4 figure and show it.

    Parameters:
        data: DataFrame holding the column to plot.
        column: name of the column to summarise.
        title: figure title.
        xlabel: label of the x axis.
    """
    plt.figure(figsize=(8, 4))
    plt.boxplot(data[column], vert=False)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.show()


# One entry per figure: (column, title, x-axis label).
# Order: score, budget, revenue.
BOXPLOT_SPECS = [
    ('score', 'Box Plot da Pontuação dos Filmes', 'Pontuação'),
    ('budget_x', 'Box Plot do Orçamento dos Filmes', 'Orçamento'),
    ('revenue', 'Box Plot da Receita dos Filmes', 'Receita'),
]

for boxplot_spec in BOXPLOT_SPECS:
    plot_boxplot(df, *boxplot_spec)


**Análise de Correlação**

In [None]:
import seaborn as sns

# Replace each categorical variable by its integer category code so that it
# can take part in the correlation matrix.
# NOTE(review): the codes are labels, not magnitudes, so correlations that
# involve these columns should be read with care.
categorical_vars = ['genre', 'status', 'orig_lang', 'country']
for var in categorical_vars:
    df[var] = pd.Categorical(df[var]).codes

# Turn the release date into a YYYYMMDD integer.
# The guard tests for "not numeric yet" instead of `dtype == 'object'`, so the
# conversion also runs when pandas stores the text column with its dedicated
# string dtype or when the column is already datetime, and is skipped when the
# cell is re-run after the conversion.
if not pd.api.types.is_numeric_dtype(df['date_x']):
    df['date_x'] = pd.to_datetime(df['date_x']).dt.strftime('%Y%m%d').astype(int)

val_matrix = df[['date_x', 'score', 'genre', 'status', 'orig_lang', 'budget_x', 'revenue', 'country']]

# Pairwise correlation between the selected variables.
correlation_matrix = val_matrix.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Matriz de Correlação entre as Variáveis Selecionadas')
plt.show()


### **Rede Bayesiana**

**Construção e Visualização do Modelo/Rede**

In [None]:
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
import networkx as nx

# Network structure: every pair is a directed edge (parent, child).
network_edges = [
    ('budget_x', 'revenue'),
    ('budget_x', 'date_x'),
    ('status', 'score'),
    ('country', 'revenue'),
    ('orig_lang', 'revenue'),
    ('budget_x', 'orig_lang'),
    ('country', 'genre'),
]
model = BayesianNetwork(network_edges)

# ---- Root nodes (no parents): plain prior distributions ----

# 'budget_x': three assumed budget categories (low, medium, high).
cpd_budget_x = TabularCPD(
    variable='budget_x',
    variable_card=3,
    values=[[0.3], [0.4], [0.3]],
)

# 'status': three assumed states (in development, released, cancelled).
cpd_status = TabularCPD(
    variable='status',
    variable_card=3,
    values=[[0.2], [0.7], [0.1]],
)

# 'country': three states, assumed independent of the other variables.
cpd_country = TabularCPD(
    variable='country',
    variable_card=3,
    values=[[0.4], [0.4], [0.2]],
)

# ---- Child nodes: one column per combination of parent states ----

# 'revenue' given 'budget_x', 'orig_lang' and 'country' (3 * 2 * 3 = 18 columns).
# Each row below is one revenue state, written as three groups of six columns.
cpd_revenue = TabularCPD(
    variable='revenue',
    variable_card=3,
    values=[
        [0.1, 0.2, 0.3, 0.2, 0.3, 0.4,
         0.2, 0.3, 0.4, 0.1, 0.2, 0.3,
         0.2, 0.3, 0.4, 0.2, 0.3, 0.4],
        [0.3, 0.2, 0.1, 0.4, 0.2, 0.1,
         0.4, 0.2, 0.1, 0.3, 0.2, 0.1,
         0.4, 0.2, 0.1, 0.4, 0.2, 0.1],
        [0.6, 0.6, 0.6, 0.4, 0.5, 0.5,
         0.4, 0.5, 0.5, 0.6, 0.6, 0.6,
         0.4, 0.5, 0.5, 0.4, 0.5, 0.5],
    ],
    evidence=['budget_x', 'orig_lang', 'country'],
    evidence_card=[3, 2, 3],
)

# 'date_x' given 'budget_x'.
cpd_date_x = TabularCPD(
    variable='date_x',
    variable_card=3,
    values=[
        [0.4, 0.2, 0.1],
        [0.3, 0.5, 0.3],
        [0.3, 0.3, 0.6],
    ],
    evidence=['budget_x'],
    evidence_card=[3],
)

# 'score' given 'status'.
cpd_score = TabularCPD(
    variable='score',
    variable_card=3,
    values=[
        [0.5, 0.3, 0.2],
        [0.2, 0.5, 0.3],
        [0.3, 0.2, 0.5],
    ],
    evidence=['status'],
    evidence_card=[3],
)

# 'genre' given 'country'.
cpd_genre = TabularCPD(
    variable='genre',
    variable_card=3,
    values=[
        [0.5, 0.2, 0.3],
        [0.3, 0.5, 0.2],
        [0.2, 0.3, 0.5],
    ],
    evidence=['country'],
    evidence_card=[3],
)

# 'orig_lang' (two states) given 'budget_x'.
cpd_orig_lang = TabularCPD(
    variable='orig_lang',
    variable_card=2,
    values=[
        [0.7, 0.3, 0.1],
        [0.3, 0.7, 0.9],
    ],
    evidence=['budget_x'],
    evidence_card=[3],
)

# Attach every table to the network, then let pgmpy validate the result.
all_cpds = [
    cpd_budget_x, cpd_revenue, cpd_date_x, cpd_score,
    cpd_status, cpd_orig_lang, cpd_country, cpd_genre,
]
model.add_cpds(*all_cpds)

print("Model:", model.check_model())

# Plain directed graph with the same edges as the model, used only for drawing.
G = nx.DiGraph(list(model.edges()))

# Hand-picked (x, y) coordinates for each node.
pos = {
    'score': (-2, 1),
    'status': (-1, 2),
    'date_x': (0, 0),
    'budget_x': (1, 1),
    'orig_lang': (2, 1),
    'revenue': (2, 2),
    'country': (3, 1),
    'genre': (4, 0),
}

draw_options = dict(
    with_labels=True,
    node_size=2000,
    node_color='lightblue',
    font_size=12,
    arrowstyle='-|>',
    arrowsize=20,
)
nx.draw(G, pos, **draw_options)
plt.title('Rede Bayesiana das Relações do Filme')
plt.show()

**Inferências Condicionais**

In [None]:
from pgmpy.inference import VariableElimination

infer = VariableElimination(model)

# Single evidence: distribution of 'revenue' when the budget is medium (state 1).
medium_budget = {'budget_x': 1}
result = infer.query(variables=['revenue'], evidence=medium_budget)

# Several pieces of evidence at once.
combined_evidence = {'budget_x': 2, 'orig_lang': 0, 'country': 2}
result_multi_evidence = infer.query(variables=['revenue'], evidence=combined_evidence)

# The multi-evidence table is printed first, then the single-evidence one.
for table in (result_multi_evidence, result):
    print(table)

In [None]:
# Revenue distribution for every (budget, country) combination.
# Budget states: 0 = low, 1 = medium, 2 = high; three countries are assumed.
scenarios = [(b, c) for b in range(3) for c in range(3)]
for budget, country in scenarios:
    evidence = {'budget_x': budget, 'country': country}
    result = infer.query(variables=['revenue'], evidence=evidence)
    print(f"Orçamento: {budget}, País: {country}, Distribuição de Receita: {result}")

# Effect of the movie status on the score.
# Status states: 0 = in development, 1 = released, 2 = cancelled.
for status in (0, 1, 2):
    result = infer.query(variables=['score'], evidence={'status': status})
    print(f"Status: {status}, Distribuição de Pontuação: {result}")


# **Parte 2**

### **Redução dos Posters**

In [None]:
"""
Script to resize images in a directory.
IMPORTANT NOTE: it was executed locally in vscode, not here.
"""

from PIL import Image
import os
import pandas as pd

def resize_images(source_directory, target_directory, target_size=(128, 128)):
    """Resize every ``.jpg`` file of a folder and save the copies in another folder.

    Parameters:
        source_directory: folder that is scanned (non-recursively) for files
            whose name ends with ``.jpg``.
        target_directory: folder that receives the resized copies, saved under
            the same file name. It is created when it does not exist.
        target_size: size passed to ``Image.resize`` (128x128 by default).

    Returns:
        None. One line is printed per processed file; a file that cannot be
        processed is reported and skipped so the remaining files still run.
    """
    # Without this, a missing target folder makes every save() fail and each
    # file gets reported as a "corrupted image".
    os.makedirs(target_directory, exist_ok=True)

    for filename in os.listdir(source_directory):
        if not filename.endswith('.jpg'):
            continue
        image_path = os.path.join(source_directory, filename)
        save_path = os.path.join(target_directory, filename)
        try:
            # The context manager releases the file handle as soon as the
            # resized copy has been written.
            with Image.open(image_path) as image:
                image.resize(target_size).save(save_path)
            print(f"Resized {filename} to {target_size}, saved in {target_directory}")
        except Exception as e:
            # Deliberately broad: one bad file must not stop the batch.
            print(f'Corrupted image. Failed to resize {filename}: {e}')

# source_image_directory = 'archive/covers'
# target_image_directory = 'archive2/resized_covers'

# Resize images to the directory to (128,128)
# resize_images(source_image_directory, target_image_directory)

### **Merge dos DataSet**

**Pré-Processamento de Dados**

In [None]:
df_ratings = pd.read_csv('ratings.csv')
df_movies = pd.read_csv("movies.csv")

# Keep only the ratings that are 4 or higher.
df_high_rating = df_ratings[df_ratings['rating'] >= 4]

# Attach those ratings to the movie metadata through the shared 'movieId' key.
df_merge = pd.merge(df_movies, df_high_rating, on='movieId')

# One row per movie (the first match is kept). The explicit copy makes
# df_main an independent DataFrame, so the columns added to it in later cells
# do not raise SettingWithCopyWarning.
df_main = df_merge.drop_duplicates(subset='movieId', keep='first').copy()

df_main.info()

**Redimensionamento das Imagens**

In [None]:
from PIL import Image
import os

# Folder that holds the resized posters. The location is specific to one
# machine, so it can be overridden with the POSTER_DIR environment variable;
# when the variable is not set the original path is used.
image_folder = os.environ.get(
    'POSTER_DIR',
    '/Users/guilhermecardoso/Desktop/Universidade/3 Ano/2 Semestre/Aprendizagem Probabilística e Reconhecimento de Padrões/Projeto/archive2/resized_covers',
)

# Size every poster is resized to before being stored in the DataFrame.
desired_size = (128, 128)

def resize_image(image_path):
    """Open one image file and return it resized to ``desired_size``.

    Parameters:
        image_path: path of the image file to load.

    Returns:
        The resized image, or None when the file is missing or cannot be read
        as an image.
    """
    try:
        with Image.open(image_path) as img:
            return img.resize(desired_size)
    except OSError:
        # OSError includes FileNotFoundError (no poster for this movie) as well
        # as the errors Pillow raises for unidentifiable or truncated files, so
        # a single bad poster no longer aborts the whole column-wise apply.
        return None

# Expected poster file of each movie: the movieId zero-padded to 7 digits.
poster_paths = df_main['movieId'].map(
    lambda movie_id: os.path.join(image_folder, f"{movie_id:07d}.jpg")
)
df_main['image_path'] = poster_paths

# Load and resize every poster; the result (None when unavailable) is stored
# in the new 'resized_image' column.
df_main['resized_image'] = df_main['image_path'].map(resize_image)



**Remover Filmes Sem Imagem**

In [None]:
# Keep only the movies for which a poster was actually loaded.
has_poster = df_main['resized_image'].notna()
df_main = df_main[has_poster]
#df_main

**Verificar Imagem**

In [None]:
import matplotlib.pyplot as plt

specific_movie_id = 235  # example id

# Row(s) of the chosen movie; the poster is taken from the first match, if any.
image_row = df_main[df_main['movieId'] == specific_movie_id]
poster = None if image_row.empty else image_row.iloc[0]['resized_image']

if poster is None:
    print(f"Nenhuma imagem processada disponível para movieId {specific_movie_id}")
else:
    plt.imshow(poster)
    plt.title(f"Imagem Redimensionada para movieId {specific_movie_id}")
    plt.axis('off')
    plt.show()


**Normalização dos Píxeis**

In [None]:
import numpy as np
from PIL import Image

#Converter imagens para RGB e normalizar os pixels
def normalize_image(img):
    """Return the pixels of ``img`` as an array scaled by 1/255.

    An image whose ``mode`` is not 'RGB' is converted to RGB first.

    Parameters:
        img: image object exposing ``mode`` and ``convert`` (a PIL image).

    Returns:
        The array ``np.array(img) / 255.0``.
    """
    rgb_img = img if img.mode == 'RGB' else img.convert('RGB')
    return np.array(rgb_img) / 255.0

# Normalised pixel array of every poster, stored in a new column.
df_main['normalized_image'] = df_main['resized_image'].apply(normalize_image)

# Sanity check: all arrays must share a single shape.
distinct_shapes = {pixels.shape for pixels in df_main['normalized_image']}
if len(distinct_shapes) <= 1:
    print("Todas as imagens estão padronizadas em tamanho e formato.")
else:
    print("Existem imagens de tamanhos diferentes.")


**Preparação dos Dados**

In [None]:
from sklearn.model_selection import train_test_split

# Make sure every entry is an ndarray: np.asanyarray returns arrays unchanged
# and converts anything else.
df_main['normalized_image'] = df_main['normalized_image'].apply(
    lambda img: np.asanyarray(img)
)

# Stack the per-movie arrays into a single array with one image per row.
images_array = np.stack(df_main['normalized_image'].to_numpy())

# 80/20 split with a fixed seed.
X_train, X_test = train_test_split(
    images_array,
    test_size=0.2,
    random_state=42,
)

print("Dimensões do Conjunto de Treinamento:", X_train.shape)
print("Dimensões do Conjunto de Teste:", X_test.shape)


**Modelo VAE**

In [None]:
from tensorflow.keras.layers import Input, Dense, Lambda, Flatten, Reshape, Conv2D, Conv2DTranspose
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

def sampling(args):
    """Draw a latent sample as ``mean + exp(0.5 * log_var) * noise``.

    Parameters:
        args: pair ``(z_mean, z_log_var)`` with the mean and the log-variance
            of the latent distribution.

    Returns:
        The sampled latent tensor, with standard-normal noise of shape
        (batch, latent dimension).
    """
    mean, log_var = args
    noise_shape = (K.shape(mean)[0], K.int_shape(mean)[1])
    noise = K.random_normal(shape=noise_shape)
    std = K.exp(0.5 * log_var)
    return mean + std * noise

input_shape = (128, 128, 3)
latent_dim = 32

# ---- Encoder: image -> (z_mean, z_log_var, sampled z) ----
inputs = Input(shape=input_shape, name='encoder_input')
enc = Conv2D(filters=32, kernel_size=(3, 3), activation='relu', padding='same')(inputs)
# The 2x2 stride halves the spatial resolution.
enc = Conv2D(filters=64, kernel_size=(3, 3), activation='relu', strides=(2, 2), padding='same')(enc)
enc = Flatten()(enc)  # feature maps -> vector
z_mean = Dense(units=latent_dim, name='z_mean')(enc)
z_log_var = Dense(units=latent_dim, name='z_log_var')(enc)
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])
encoder = Model(inputs=inputs, outputs=[z_mean, z_log_var, z], name='encoder')

# ---- Decoder: latent vector -> image ----
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
dec = Dense(units=64 * 64, activation='relu')(latent_inputs)
dec = Reshape(target_shape=(64, 64, 1))(dec)  # 64x64 map with a single channel
# The 2x2 stride doubles the spatial resolution.
dec = Conv2DTranspose(filters=64, kernel_size=(3, 3), activation='relu', strides=(2, 2), padding='same')(dec)
dec = Conv2DTranspose(filters=32, kernel_size=(3, 3), activation='relu', padding='same')(dec)
# Sigmoid keeps the output pixels in [0, 1].
outputs = Conv2D(filters=3, kernel_size=(3, 3), activation='sigmoid', padding='same')(dec)
decoder = Model(inputs=latent_inputs, outputs=outputs, name='decoder')

class VAE(Model):
    """Variational autoencoder that chains an encoder and a decoder model.

    The training loss is registered inside ``call`` through ``add_loss``.
    """

    def __init__(self, encoder, decoder, **kwargs):
        """Store the two sub-models; extra keyword arguments go to ``Model``."""
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs, training=False):
        """Reconstruct ``inputs`` and register the VAE loss.

        Returns:
            The decoder output for the latent sample drawn by the encoder.
        """
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)

        # Reconstruction term: mean squared error between input and output,
        # scaled by the image height times width.
        image_area = input_shape[0] * input_shape[1]
        reconstruction_loss = K.mean(K.square(inputs - reconstructed)) * image_area

        # KL divergence between the approximate latent distribution and the
        # standard normal, summed over the latent dimensions.
        kl_terms = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
        kl_loss = -0.5 * K.sum(kl_terms, axis=-1)

        self.add_loss(K.mean(reconstruction_loss + kl_loss))
        return reconstructed

# No loss is passed to compile(): the model registers its own loss in call().
vae = VAE(encoder, decoder)
vae.compile(optimizer='adam')

# The images are both input and target, so only x is given.
# 20% of the training images are held out for validation.
history = vae.fit(
    x=X_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
)



In [None]:

import matplotlib.pyplot as plt

# Reconstruct the first ten test posters.
test_images = X_test[:10]
reconstructed_images = vae.predict(test_images)

# Top row: originals; bottom row: reconstructions.
fig, axes = plt.subplots(2, 10, figsize=(20, 4))
rows = [(test_images, "Original"), (reconstructed_images, "Reconstruída")]
for i in range(10):
    for row_index, (batch, label) in enumerate(rows):
        ax = axes[row_index, i]
        ax.imshow(batch[i], cmap='gray')
        ax.axis('off')
        ax.set_title(label)

plt.show()


**Modelo cVAE**

In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.layers import Input, Dense, Lambda, Flatten, Reshape, Concatenate, Conv2D, Conv2DTranspose
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam

def load_and_process_images(image_folder, df_main, image_size=(128, 128)):
    """Load the posters listed in ``df_main`` together with their genre labels.

    For every row, the file ``<movieId zero-padded to 7 digits>.jpg`` is looked
    up in ``image_folder``. Posters that exist are resized to ``image_size``;
    those that do not decode to a 3-channel array are skipped, along with
    their genres.

    Parameters:
        image_folder: folder that contains the poster files.
        df_main: DataFrame with a 'movieId' column and a 'genres' column whose
            values are '|'-separated genre names. It is not modified.
        image_size: size passed to ``Image.resize`` (128x128 by default).

    Returns:
        A tuple ``(images, genres_encoded, classes)``: the pixel arrays divided
        by 255, the multi-hot genre matrix produced by ``MultiLabelBinarizer``
        and the genre names matching its columns.
    """
    # Pillow sizes are (width, height) while the decoded array is
    # (height, width, channels); deriving the expected shape from image_size
    # keeps the filter correct for sizes other than 128x128.
    expected_shape = (image_size[1], image_size[0], 3)

    # Missing ids become 0. A separate Series is used so the caller's
    # DataFrame is left untouched.
    movie_ids = df_main['movieId'].fillna(0).astype(int)

    images = []
    valid_genres = []
    for movie_id, genres in zip(movie_ids, df_main['genres']):
        image_path = os.path.join(image_folder, f"{int(movie_id):07d}.jpg")
        if not os.path.exists(image_path):
            continue
        with Image.open(image_path) as img:
            pixels = np.array(img.resize(image_size))
        if pixels.shape != expected_shape:
            # Not a 3-channel image (e.g. grayscale or with alpha): skipped.
            continue
        images.append(pixels)
        # Rows without a textual genre list fall back to a single 'Unknown' label.
        valid_genres.append(genres.split('|') if isinstance(genres, str) else ['Unknown'])

    images = np.array(images) / 255.0
    mlb = MultiLabelBinarizer()
    genres_encoded = mlb.fit_transform(valid_genres)  # one binary column per genre
    return images, genres_encoded, mlb.classes_

# Posters and their binarised genres, restricted to movies with a usable image.
dataset = load_and_process_images(image_folder, df_main)
images, genres_encoded, genre_names = dataset

# Same 80/20 split (fixed seed) applied to images and genre vectors together.
split = train_test_split(
    images,
    genres_encoded,
    test_size=0.2,
    random_state=42,
)
images_train, images_test, genres_train, genres_test = split

input_shape = (128, 128, 3)
latent_dim = 32
num_genres = len(genre_names)  # width of the genre condition vector

def sampling(args):
    """Draw a latent sample as ``mean + exp(0.5 * log_var) * noise``.

    Parameters:
        args: pair ``(z_mean, z_log_var)``.

    Returns:
        The sampled latent tensor; the noise is standard normal with the two
        leading dimensions of ``z_mean``.
    """
    mean, log_var = args
    batch_size = K.shape(mean)[0]
    latent_size = K.shape(mean)[1]
    noise = K.random_normal(shape=(batch_size, latent_size))
    std = K.exp(0.5 * log_var)
    return mean + std * noise

# ---- Conditional encoder: (image, genre vector) -> (z_mean, z_log_var, z) ----
inputs = Input(shape=input_shape)
genre_inputs = Input(shape=(num_genres,))
enc = Conv2D(filters=32, kernel_size=(3, 3), activation='relu', padding='same')(inputs)
enc = Conv2D(filters=64, kernel_size=(3, 3), activation='relu', strides=(2, 2), padding='same')(enc)
enc = Flatten()(enc)
# The genre vector is appended to the flattened image features.
enc = Concatenate()([enc, genre_inputs])
z_mean = Dense(units=latent_dim)(enc)
z_log_var = Dense(units=latent_dim)(enc)
z = Lambda(sampling)([z_mean, z_log_var])

encoder = Model(inputs=[inputs, genre_inputs], outputs=[z_mean, z_log_var, z])

# ---- Conditional decoder: (latent vector, genre vector) -> image ----
latent_inputs = Input(shape=(latent_dim,))
decoder_input = Concatenate()([latent_inputs, genre_inputs])
dec = Dense(units=64 * 64, activation='relu')(decoder_input)
dec = Reshape(target_shape=(64, 64, 1))(dec)
dec = Conv2DTranspose(filters=64, kernel_size=(3, 3), activation='relu', strides=(2, 2), padding='same')(dec)
dec = Conv2DTranspose(filters=32, kernel_size=(3, 3), activation='relu', padding='same')(dec)
outputs = Conv2D(filters=3, kernel_size=(3, 3), activation='sigmoid', padding='same')(dec)

decoder = Model(inputs=[latent_inputs, genre_inputs], outputs=outputs)

class VAE(Model):
    """Conditional VAE: encoder and decoder both receive the genre vector.

    The training loss is registered inside ``call`` through ``add_loss``.
    """

    def __init__(self, encoder, decoder, **kwargs):
        """Store the two sub-models; extra keyword arguments go to ``Model``."""
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs):
        """Reconstruct the images of ``inputs`` and register the cVAE loss.

        Parameters:
            inputs: pair ``(images, genres)``.

        Returns:
            The decoder output for the sampled latent vector and the genres.
        """
        images, genres = inputs
        z_mean, z_log_var, z = self.encoder([images, genres])
        reconstructed = self.decoder([z, genres])

        # Reconstruction term: mean squared error between input and output.
        reconstruction_loss = K.mean(K.square(images - reconstructed))
        # Scale it by the image area, as the unconditional VAE of this
        # notebook does. Unscaled, the mean error stays below 1 while the KL
        # term is a sum over all latent dimensions, so the KL term would
        # dominate training and the decoder would learn to ignore z.
        reconstruction_loss *= input_shape[0] * input_shape[1]

        # KL divergence to the standard normal, summed over latent dimensions.
        kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)

        self.add_loss(K.mean(reconstruction_loss + kl_loss))
        return reconstructed

# No loss is passed to compile(): the model registers its own loss in call().
cvae = VAE(encoder, decoder)
optimizer = Adam(0.001)
cvae.compile(optimizer=optimizer)

# Inputs are the (images, genre vectors) pair; no separate targets are given.
cvae.fit(
    x=[images_train, genres_train],
    epochs=80,
    batch_size=32,
)

# Genre condition with only the 'Action' entry switched on.
action_index = np.flatnonzero(genre_names == 'Action')[0]
action_vector = np.zeros((1, num_genres))
action_vector[0, action_index] = 1

# Decode one random latent vector under that condition.
z_sample = np.random.normal(size=(1, latent_dim))
generated_image = decoder.predict([z_sample, action_vector])

plt.imshow(generated_image[0])
plt.axis('off')
plt.show()
