In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt
import seaborn as sns
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import warnings
import ast
import re
from collections import Counter, defaultdict
from itertools import combinations

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(42)
np.random.seed(42)
warnings.filterwarnings('ignore')

print(f"Using device: {device}")
print(f"CUDA available: {torch.cuda.is_available()}")

  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,


Using device: cuda
CUDA available: True


In [2]:
data_path = "data/movies/"

movies_df = pd.read_csv(data_path + "movies_metadata.csv", low_memory=False)
credits_df = pd.read_csv(data_path + "credits.csv")
ratings_df = pd.read_csv(data_path + "ratings.csv")

print("=== Movies Metadata ===")
print(f"Shape: {movies_df.shape}")
print(f"Columns: {list(movies_df.columns)}")
print("\nSample:")
print(movies_df.head(2))

print("\n=== Credits ===")
print(f"Shape: {credits_df.shape}")
print(f"Columns: {list(credits_df.columns)}")
print("\nSample:")
print(credits_df.head(2))

print("\n=== Ratings ===")
print(f"Shape: {ratings_df.shape}")
print(f"Columns: {list(ratings_df.columns)}")
print("\nSample:")
print(ratings_df.head(2))

=== Movies Metadata ===
Shape: (45466, 24)
Columns: ['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count']

Sample:
   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   

                               homepage    id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story   862  tt0114709                en   
1                                   NaN  8844  tt011349

In [3]:
def safe_parse_json_field(field):
    try:
        if pd.isna(field):
            return []
        return ast.literal_eval(field)
    except:
        return []

def extract_names(json_list, key='name'):
    if not json_list:
        return []
    return [item.get(key, '') for item in json_list if isinstance(item, dict)]

def extract_directors(crew_list):
    if not crew_list:
        return []
    return [item.get('name', '') for item in crew_list
            if isinstance(item, dict) and item.get('job') == 'Director']

print("Processing genres...")
movies_df['genres_parsed'] = movies_df['genres'].apply(safe_parse_json_field)
movies_df['genres_list'] = movies_df['genres_parsed'].apply(lambda x: extract_names(x))
movies_df['genres_str'] = movies_df['genres_list'].apply(lambda x: ' '.join(x))

print("Processing credits...")
credits_df['cast_parsed'] = credits_df['cast'].apply(safe_parse_json_field)
credits_df['crew_parsed'] = credits_df['crew'].apply(safe_parse_json_field)
credits_df['actors_list'] = credits_df['cast_parsed'].apply(lambda x: extract_names(x))
credits_df['directors_list'] = credits_df['crew_parsed'].apply(extract_directors)
credits_df['actors_str'] = credits_df['actors_list'].apply(lambda x: ' '.join(x[:5]))
credits_df['directors_str'] = credits_df['directors_list'].apply(lambda x: ' '.join(x))

print("Fixing id types...")
movies_df['id'] = pd.to_numeric(movies_df['id'], errors='coerce')
credits_df['id'] = pd.to_numeric(credits_df['id'], errors='coerce')
movies_df = movies_df.dropna(subset=['id'])
credits_df = credits_df.dropna(subset=['id'])
movies_df['id'] = movies_df['id'].astype(int)
credits_df['id'] = credits_df['id'].astype(int)

print("Merging datasets...")
df = movies_df.merge(credits_df[['id', 'actors_str', 'directors_str', 'actors_list', 'directors_list']],
                     on='id', how='inner')

print("Cleaning data...")
df = df.dropna(subset=['overview', 'release_date', 'genres_str'])
df = df[df['overview'].str.len() > 20]
df = df[df['genres_str'].str.len() > 0]
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df = df.dropna(subset=['release_date'])
df['year'] = df['release_date'].dt.year
df = df[(df['year'] >= 1990) & (df['year'] <= 2023)]

print("Creating temporal splits...")
train_df = df[df['year'] < 2010].copy()
val_df = df[(df['year'] >= 2010) & (df['year'] < 2015)].copy()
test_df = df[df['year'] >= 2015].copy()

print("Creating ground truth from vote_average...")
df['is_high_rated'] = (df['vote_average'] >= 6.0) & (df['vote_count'] >= 10)

print(f"Final dataset shape: {df.shape}")
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
print(f"High rated movies: {df['is_high_rated'].sum()}")
print(f"\nSample processed data:")
print(df[['title', 'year', 'genres_str', 'actors_str', 'directors_str', 'is_high_rated']].head(3))

Processing genres...
Processing credits...
Fixing id types...
Merging datasets...
Cleaning data...
Creating temporal splits...
Creating ground truth from vote_average...
Final dataset shape: (27610, 33)
Train: 15576, Val: 8171, Test: 3863
High rated movies: 9536

Sample processed data:
              title  year                genres_str  \
0         Toy Story  1995   Animation Comedy Family   
1           Jumanji  1995  Adventure Fantasy Family   
2  Grumpier Old Men  1995            Romance Comedy   

                                          actors_str  directors_str  \
0  Tom Hanks Tim Allen Don Rickles Jim Varney Wal...  John Lasseter   
1  Robin Williams Jonathan Hyde Kirsten Dunst Bra...   Joe Johnston   
2  Walter Matthau Jack Lemmon Ann-Margret Sophia ...  Howard Deutch   

   is_high_rated  
0           True  
1           True  
2           True  


In [4]:
WORD2VEC_DIM = 100
TOP_ACTORS = 500
TOP_DIRECTORS = 200

def preprocess_text(text):
    return re.sub(r'[^a-zA-Z\s]', '', str(text).lower()).split()

print("Preparing corpus for Word2Vec...")
corpus = []
for _, row in train_df.iterrows():
    overview_words = preprocess_text(row['overview'])
    genre_words = preprocess_text(row['genres_str'])
    actor_words = preprocess_text(row['actors_str'])
    director_words = preprocess_text(row['directors_str'])
    corpus.append(overview_words + genre_words + actor_words + director_words)

print("Training Word2Vec model...")
w2v_model = Word2Vec(sentences=corpus, vector_size=WORD2VEC_DIM, window=5,
                     min_count=2, workers=4, epochs=10, seed=42)

def get_word2vec_vector(text, model):
    words = preprocess_text(text)
    vectors = [model.wv[word] for word in words if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

print("Creating overview vectors...")
df['overview_vector'] = df['overview'].apply(lambda x: get_word2vec_vector(x, w2v_model))

print("Creating genre vectors...")
df['genre_vector'] = df['genres_str'].apply(lambda x: get_word2vec_vector(x, w2v_model))

print("Finding top actors and directors...")
all_actors = []
all_directors = []
for _, row in train_df.iterrows():
    all_actors.extend(row['actors_list'][:5])
    all_directors.extend(row['directors_list'])

actor_counts = Counter(all_actors)
director_counts = Counter(all_directors)
top_actors = [actor for actor, _ in actor_counts.most_common(TOP_ACTORS)]
top_directors = [director for director, _ in director_counts.most_common(TOP_DIRECTORS)]

print(f"Top actors: {len(top_actors)}, Top directors: {len(top_directors)}")

def create_person_vector(person_list, top_persons):
    vector = np.zeros(len(top_persons))
    for person in person_list:
        if person in top_persons:
            idx = top_persons.index(person)
            vector[idx] = 1
    return vector

print("Creating actor and director vectors...")
df['actor_vector'] = df['actors_list'].apply(lambda x: create_person_vector(x[:5], top_actors))
df['director_vector'] = df['directors_list'].apply(lambda x: create_person_vector(x, top_directors))

print("Creating temporal context features...")
year_scaler = MinMaxScaler()
df['year_normalized'] = year_scaler.fit_transform(df[['year']])
df['era'] = pd.cut(df['year'], bins=[1990, 2000, 2010, 2017, 2025],
                   labels=['90s', '2000s', '2010s', '2020s'])
era_encoder = LabelEncoder()
df['era_encoded'] = era_encoder.fit_transform(df['era'])

print("Final feature dimensions:")
print(f"Overview: {WORD2VEC_DIM}")
print(f"Genre: {WORD2VEC_DIM}")
print(f"Actor: {len(top_actors)}")
print(f"Director: {len(top_directors)}")
print(f"Total feature dimension: {WORD2VEC_DIM * 2 + len(top_actors) + len(top_directors)}")

print("\nSample vectors:")
sample_idx = 0
print(f"Movie: {df.iloc[sample_idx]['title']}")
print(f"Overview vector shape: {df.iloc[sample_idx]['overview_vector'].shape}")
print(f"Genre vector shape: {df.iloc[sample_idx]['genre_vector'].shape}")
print(f"Actor vector shape: {df.iloc[sample_idx]['actor_vector'].shape}")
print(f"Director vector shape: {df.iloc[sample_idx]['director_vector'].shape}")

Preparing corpus for Word2Vec...
Training Word2Vec model...
Creating overview vectors...
Creating genre vectors...
Finding top actors and directors...
Top actors: 500, Top directors: 200
Creating actor and director vectors...
Creating temporal context features...
Final feature dimensions:
Overview: 100
Genre: 100
Actor: 500
Director: 200
Total feature dimension: 900

Sample vectors:
Movie: Toy Story
Overview vector shape: (100,)
Genre vector shape: (100,)
Actor vector shape: (500,)
Director vector shape: (200,)


In [5]:
class AttentionNetwork(nn.Module):
    def __init__(self, context_dim=6, hidden_dims=[128, 64, 32], dropout=0.3):
        super(AttentionNetwork, self).__init__()

        layers = []
        input_dim = context_dim

        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(input_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            input_dim = hidden_dim

        layers.append(nn.Linear(input_dim, 4))
        layers.append(nn.Softmax(dim=1))

        self.network = nn.Sequential(*layers)

    def forward(self, context):
        return self.network(context)

def compute_context_vector(movie1_idx, movie2_idx, df):
    m1 = df.iloc[movie1_idx]
    m2 = df.iloc[movie2_idx]

    genre_sim = cosine_similarity([m1['genre_vector']], [m2['genre_vector']])[0][0]

    year_diff = abs(m1['year'] - m2['year']) / 50.0
    year_diff = min(year_diff, 1.0)

    era_same = float(m1['era_encoded'] == m2['era_encoded'])

    rating_diff = abs(m1['vote_average'] - m2['vote_average']) / 10.0

    pop1 = pd.to_numeric(m1['popularity'], errors='coerce')
    pop2 = pd.to_numeric(m2['popularity'], errors='coerce')
    if pd.isna(pop1): pop1 = 1.0
    if pd.isna(pop2): pop2 = 1.0
    popularity_sim = 1.0 - abs(pop1 - pop2) / max(pop1, pop2, 1.0)
    popularity_sim = max(popularity_sim, 0.0)

    runtime1 = pd.to_numeric(m1.get('runtime', 100), errors='coerce')
    runtime2 = pd.to_numeric(m2.get('runtime', 100), errors='coerce')
    if pd.isna(runtime1): runtime1 = 100.0
    if pd.isna(runtime2): runtime2 = 100.0
    runtime_sim = 1.0 - abs(runtime1 - runtime2) / 200.0
    runtime_sim = max(runtime_sim, 0.0)

    context = np.array([genre_sim, year_diff, era_same, rating_diff, popularity_sim, runtime_sim])
    return context

def compute_feature_similarities(movie1_idx, movie2_idx, df):
    m1 = df.iloc[movie1_idx]
    m2 = df.iloc[movie2_idx]

    overview_sim = cosine_similarity([m1['overview_vector']], [m2['overview_vector']])[0][0]
    genre_sim = cosine_similarity([m1['genre_vector']], [m2['genre_vector']])[0][0]

    actor_sim = cosine_similarity([m1['actor_vector']], [m2['actor_vector']])[0][0]
    director_sim = cosine_similarity([m1['director_vector']], [m2['director_vector']])[0][0]

    return np.array([overview_sim, genre_sim, actor_sim, director_sim])

def compute_attention_similarity(movie1_idx, movie2_idx, df, attention_model):
    context = compute_context_vector(movie1_idx, movie2_idx, df)
    context_tensor = torch.FloatTensor(context).unsqueeze(0).to(device)

    attention_model.eval()
    with torch.no_grad():
        attention_weights = attention_model(context_tensor).cpu().numpy()[0]

    feature_sims = compute_feature_similarities(movie1_idx, movie2_idx, df)

    final_similarity = np.dot(attention_weights, feature_sims)

    return final_similarity, attention_weights

print("Initializing attention model...")
attention_model = AttentionNetwork(context_dim=6, hidden_dims=[128, 64, 32], dropout=0.3)
attention_model = attention_model.to(device)

print(f"Model architecture:")
print(attention_model)

print(f"Model parameters: {sum(p.numel() for p in attention_model.parameters())}")

print("\nTesting attention mechanism...")
test_sim, test_weights = compute_attention_similarity(0, 1, df, attention_model)
print(f"Test similarity: {test_sim:.4f}")
print(f"Test attention weights: {test_weights}")
print(f"Weight names: ['overview', 'genre', 'actor', 'director']")

print("\nContext vector example:")
test_context = compute_context_vector(0, 1, df)
print(f"Context features: {test_context}")
print(f"Context names: ['genre_sim', 'year_diff', 'era_same', 'rating_diff', 'popularity_sim', 'runtime_sim']")

Initializing attention model...
Model architecture:
AttentionNetwork(
  (network): Sequential(
    (0): Linear(in_features=6, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=64, out_features=32, bias=True)
    (9): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.3, inplace=False)
    (12): Linear(in_features=32, out_features=4, bias=True)
    (13): Softmax(dim=1)
  )
)
Model parameters: 11812

Testing attention mechanism...
Test similarity: 0.4620
Test attention weights: [0.25838992 0.24908654 0.26302537 0.22949815]
Weight names: ['overview', 'genre', 'actor', 'di

In [6]:
VALIDATION_PAIRS = 1000

print("Generating validation pairs...")
val_indices = val_df.index.tolist()
np.random.shuffle(val_indices)

validation_pairs = []
ground_truth = []

pairs_generated = 0
attempts = 0
max_attempts = VALIDATION_PAIRS * 10

while pairs_generated < VALIDATION_PAIRS and attempts < max_attempts:
    idx1 = np.random.choice(val_indices)
    idx2 = np.random.choice(val_indices)

    if idx1 != idx2:
        movie1_idx = df.index.get_loc(idx1)
        movie2_idx = df.index.get_loc(idx2)

        m1 = df.loc[idx1]
        m2 = df.loc[idx2]

        if m1['is_high_rated'] and m2['is_high_rated']:
            similarity_score = 1.0
        elif not m1['is_high_rated'] and not m2['is_high_rated']:
            genre_overlap = len(set(m1['genres_list']) & set(m2['genres_list'])) > 0
            similarity_score = 0.5 if genre_overlap else 0.0
        else:
            similarity_score = 0.0

        validation_pairs.append((movie1_idx, movie2_idx))
        ground_truth.append(similarity_score)
        pairs_generated += 1

    attempts += 1

print(f"Generated {len(validation_pairs)} validation pairs")
print(f"Ground truth distribution:")
print(f"High similarity (1.0): {ground_truth.count(1.0)}")
print(f"Medium similarity (0.5): {ground_truth.count(0.5)}")
print(f"Low similarity (0.0): {ground_truth.count(0.0)}")

print("\nSample pairs:")
for i in range(3):
    m1_idx, m2_idx = validation_pairs[i]
    m1_title = df.iloc[m1_idx]['title']
    m2_title = df.iloc[m2_idx]['title']
    gt = ground_truth[i]
    print(f"Pair {i+1}: '{m1_title}' vs '{m2_title}' - GT: {gt}")

Generating validation pairs...
Generated 1000 validation pairs
Ground truth distribution:
High similarity (1.0): 107
Medium similarity (0.5): 149
Low similarity (0.0): 744

Sample pairs:
Pair 1: 'Last Hijack' vs 'Somewhere Between' - GT: 0.5
Pair 2: 'Win/Win' vs 'Honeymoon' - GT: 0.5
Pair 3: 'Eco-Pirate: The Story of Paul Watson' vs 'The Staircase II: The Last Chance' - GT: 0.5


In [7]:
def compute_metrics(predicted_similarities, ground_truth, k=5):
    paired_data = list(zip(predicted_similarities, ground_truth))
    paired_data.sort(key=lambda x: x[0], reverse=True)

    precision_at_k = sum(1 for i, (_, gt) in enumerate(paired_data[:k]) if gt >= 0.5) / k

    ap_scores = []
    for threshold in [0.5, 1.0]:
        relevant_items = [i for i, gt in enumerate(ground_truth) if gt >= threshold]
        if not relevant_items:
            continue

        precision_scores = []
        for i, (_, gt) in enumerate(paired_data):
            if gt >= threshold:
                precision_at_i = sum(1 for j in range(i+1) if paired_data[j][1] >= threshold) / (i+1)
                precision_scores.append(precision_at_i)

        if precision_scores:
            ap_scores.append(np.mean(precision_scores))

    map_score = np.mean(ap_scores) if ap_scores else 0.0

    diversity_score = np.std(predicted_similarities) if len(predicted_similarities) > 1 else 0.0

    return precision_at_k, map_score, diversity_score

def evaluate_attention_model(attention_model, validation_pairs, ground_truth):
    attention_model.eval()
    predicted_similarities = []

    for movie1_idx, movie2_idx in validation_pairs:
        try:
            sim_score, _ = compute_attention_similarity(movie1_idx, movie2_idx, df, attention_model)
            predicted_similarities.append(float(sim_score))
        except:
            predicted_similarities.append(0.0)

    precision_k, map_score, diversity = compute_metrics(predicted_similarities, ground_truth)

    combined_score = 0.6 * map_score + 0.3 * precision_k + 0.1 * diversity

    return combined_score, precision_k, map_score, diversity

print("Testing objective function with current model...")
current_score, prec, map_val, div = evaluate_attention_model(attention_model, validation_pairs, ground_truth)

print(f"Current model performance:")
print(f"Combined Score: {current_score:.4f}")
print(f"Precision@5: {prec:.4f}")
print(f"MAP: {map_val:.4f}")
print(f"Diversity: {div:.4f}")

print(f"\nObjective function weights: 0.6*MAP + 0.3*Precision@5 + 0.1*Diversity")
print("Ready for Bayesian Optimization...")

Testing objective function with current model...
Current model performance:
Combined Score: 0.4378
Precision@5: 0.8000
MAP: 0.3177
Diversity: 0.0716

Objective function weights: 0.6*MAP + 0.3*Precision@5 + 0.1*Diversity
Ready for Bayesian Optimization...


In [8]:
from skopt.space import Real, Integer

parameter_space = [
    Real(0.001, 0.1, name='learning_rate', prior='log-uniform'),
    Real(0.1, 0.5, name='dropout_rate'),
    Integer(32, 256, name='hidden_dim1'),
    Integer(16, 128, name='hidden_dim2'),
    Integer(8, 64, name='hidden_dim3'),
    Real(0.4, 0.8, name='map_weight'),
    Real(0.1, 0.4, name='precision_weight'),
    Real(0.05, 0.2, name='diversity_weight'),
    Integer(5, 50, name='training_epochs')
]

def create_optimized_model(params):
    if isinstance(params[0], list):
        params = [p[0] for p in params]

    lr, dropout, h1, h2, h3, map_w, prec_w, div_w, epochs = params

    model = AttentionNetwork(context_dim=6,
                           hidden_dims=[int(h1), int(h2), int(h3)],
                           dropout=float(dropout))
    model = model.to(device)

    config = {
        'learning_rate': float(lr),
        'map_weight': float(map_w),
        'precision_weight': float(prec_w),
        'diversity_weight': float(div_w),
        'training_epochs': int(epochs)
    }

    return model, config

test_model, test_config = create_optimized_model([param.rvs() for param in parameter_space])
print(f"Parameter space ready. Test model: {sum(p.numel() for p in test_model.parameters())} params")

Parameter space ready. Test model: 9337 params


In [9]:
def train_attention_model(model, config):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])

    for epoch in range(config['training_epochs']):
        total_loss = 0
        batch_pairs = np.random.choice(len(validation_pairs), size=min(50, len(validation_pairs)), replace=False)

        for idx in batch_pairs:
            movie1_idx, movie2_idx = validation_pairs[idx]
            target_sim = torch.FloatTensor([ground_truth[idx]]).to(device)

            try:
                context = compute_context_vector(movie1_idx, movie2_idx, df)
                context_tensor = torch.FloatTensor(context).unsqueeze(0).to(device)

                attention_weights = model(context_tensor)
                feature_sims = compute_feature_similarities(movie1_idx, movie2_idx, df)
                feature_sims_tensor = torch.FloatTensor(feature_sims).unsqueeze(0).to(device)

                predicted_sim = torch.sum(attention_weights * feature_sims_tensor, dim=1)
                loss = nn.MSELoss()(predicted_sim, target_sim)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
            except:
                continue

    model.eval()

@use_named_args(parameter_space)
def objective(**params):
    try:
        param_values = [params[param.name] for param in parameter_space]
        model, config = create_optimized_model(param_values)

        train_attention_model(model, config)

        predicted_similarities = []
        for movie1_idx, movie2_idx in validation_pairs:
            try:
                sim_score, _ = compute_attention_similarity(movie1_idx, movie2_idx, df, model)
                predicted_similarities.append(float(sim_score))
            except:
                predicted_similarities.append(0.0)

        paired_data = list(zip(predicted_similarities, ground_truth))
        paired_data.sort(key=lambda x: x[0], reverse=True)

        precision_at_5 = sum(1 for i, (_, gt) in enumerate(paired_data[:5]) if gt >= 0.5) / 5

        relevant_indices = [i for i, gt in enumerate(ground_truth) if gt >= 0.5]
        if relevant_indices:
            ap_scores = []
            for i, (_, gt) in enumerate(paired_data):
                if gt >= 0.5:
                    precision_at_i = sum(1 for j in range(i+1) if paired_data[j][1] >= 0.5) / (i+1)
                    ap_scores.append(precision_at_i)
            map_score = np.mean(ap_scores) if ap_scores else 0.0
        else:
            map_score = 0.0

        diversity = np.std(predicted_similarities) if len(predicted_similarities) > 1 else 0.0

        combined_score = (config['map_weight'] * map_score +
                         config['precision_weight'] * precision_at_5 +
                         config['diversity_weight'] * diversity)

        return -combined_score

    except Exception as e:
        print(f"Error in objective: {e}")
        return -0.1

print("Starting Bayesian Optimization with full training...")
print(f"Baseline score: {current_score:.4f}")

result = gp_minimize(objective, parameter_space, n_calls=15,
                    random_state=42, acq_func='EI', verbose=True)

print(f"\nOptimization completed!")
print(f"Best score: {-result.fun:.4f}")
print(f"Improvement: {-result.fun - current_score:.4f}")

best_params = result.x
best_model, best_config = create_optimized_model(best_params)
train_attention_model(best_model, best_config)

print(f"\nBest parameters:")
for i, param in enumerate(parameter_space):
    value = best_params[i]
    if isinstance(value, list):
        value = value[0]
    print(f"{param.name}: {value}")

final_score, final_prec, final_map, final_div = evaluate_attention_model(best_model, validation_pairs, ground_truth)
print(f"\nFinal performance:")
print(f"Combined Score: {final_score:.4f}")
print(f"Precision@5: {final_prec:.4f}")
print(f"MAP: {final_map:.4f}")
print(f"Diversity: {final_div:.4f}")

optimized_attention_model = best_model

Starting Bayesian Optimization with full training...
Baseline score: 0.4378
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 3.6156
Function value obtained: -0.4252
Current minimum: -0.4252
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 2.2350
Function value obtained: -0.6534
Current minimum: -0.6534
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 2.8700
Function value obtained: -0.3423
Current minimum: -0.6534
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 2.9760
Function value obtained: -0.5249
Current minimum: -0.6534
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 2.6510
Functio

In [10]:
def get_movie_recommendations(movie_title, model, top_k=10):
    try:
        movie_row = df[df['title'].str.contains(movie_title, case=False, na=False)].iloc[0]
        target_idx = df.index.get_loc(movie_row.name)

        similarities = []
        for idx in df.index:
            if idx != movie_row.name:
                candidate_idx = df.index.get_loc(idx)
                try:
                    sim_score, attention_weights = compute_attention_similarity(target_idx, candidate_idx, df, model)
                    similarities.append((idx, sim_score, attention_weights))
                except:
                    continue

        similarities.sort(key=lambda x: x[1], reverse=True)

        recommendations = []
        for idx, sim_score, att_weights in similarities[:top_k]:
            movie_info = df.loc[idx]
            recommendations.append({
                'title': movie_info['title'],
                'year': movie_info['year'],
                'genres': movie_info['genres_str'],
                'similarity': sim_score,
                'attention_weights': att_weights,
                'rating': movie_info['vote_average']
            })

        return movie_row, recommendations

    except Exception as e:
        print(f"Error getting recommendations: {e}")
        return None, []

def analyze_attention_patterns(model, sample_pairs=50):
    attention_analysis = {
        'high_similarity': [],
        'medium_similarity': [],
        'low_similarity': []
    }

    for i in range(min(sample_pairs, len(validation_pairs))):
        movie1_idx, movie2_idx = validation_pairs[i]
        gt = ground_truth[i]

        try:
            _, attention_weights = compute_attention_similarity(movie1_idx, movie2_idx, df, model)

            if gt >= 0.8:
                attention_analysis['high_similarity'].append(attention_weights)
            elif gt >= 0.3:
                attention_analysis['medium_similarity'].append(attention_weights)
            else:
                attention_analysis['low_similarity'].append(attention_weights)
        except:
            continue

    avg_attention = {}
    for category, weights_list in attention_analysis.items():
        if weights_list:
            avg_attention[category] = np.mean(weights_list, axis=0)

    return avg_attention

print("=== FINAL EVALUATION ON TEST SET ===")
if len(test_df) > 10:
    test_indices = test_df.index.tolist()[:50]  # Sample from test set
    test_pairs = []
    test_ground_truth = []

    for i in range(min(100, len(test_indices))):
        idx1 = np.random.choice(test_indices)
        idx2 = np.random.choice(test_indices)
        if idx1 != idx2:
            movie1_idx = df.index.get_loc(idx1)
            movie2_idx = df.index.get_loc(idx2)
            m1, m2 = df.loc[idx1], df.loc[idx2]

            if m1['is_high_rated'] and m2['is_high_rated']:
                gt = 1.0
            elif len(set(m1['genres_list']) & set(m2['genres_list'])) > 0:
                gt = 0.5
            else:
                gt = 0.0

            test_pairs.append((movie1_idx, movie2_idx))
            test_ground_truth.append(gt)

    test_score, test_prec, test_map, test_div = evaluate_attention_model(optimized_attention_model, test_pairs, test_ground_truth)
    print(f"Test Set Performance:")
    print(f"Combined Score: {test_score:.4f}")
    print(f"Precision@5: {test_prec:.4f}")
    print(f"MAP: {test_map:.4f}")
    print(f"Diversity: {test_div:.4f}")
else:
    print("Test set too small for comprehensive evaluation")

print("\n=== MOVIE RECOMMENDATIONS ===")
sample_movies = ['Toy Story', 'The Matrix', 'Titanic', 'Avatar']

for movie_title in sample_movies:
    print(f"\n--- Recommendations for '{movie_title}' ---")
    target_movie, recs = get_movie_recommendations(movie_title, optimized_attention_model, top_k=5)

    if target_movie is not None:
        print(f"Target: {target_movie['title']} ({target_movie['year']}) - {target_movie['genres_str']}")

        for i, rec in enumerate(recs, 1):
            att_str = f"[{rec['attention_weights'][0]:.2f}, {rec['attention_weights'][1]:.2f}, {rec['attention_weights'][2]:.2f}, {rec['attention_weights'][3]:.2f}]"
            print(f"{i}. {rec['title']} ({rec['year']}) - Sim: {rec['similarity']:.3f} - Att: {att_str}")
            print(f"   Genres: {rec['genres']}")

print("\n=== ATTENTION PATTERN ANALYSIS ===")
attention_patterns = analyze_attention_patterns(optimized_attention_model)

feature_names = ['Overview', 'Genre', 'Actor', 'Director']
for category, avg_weights in attention_patterns.items():
    print(f"\n{category.replace('_', ' ').title()} pairs:")
    for i, weight in enumerate(avg_weights):
        print(f"  {feature_names[i]}: {weight:.3f}")

print("\n=== SUMMARY ===")
print(f"Baseline model: {current_score:.4f}")
print(f"Optimized model: {final_score:.4f}")
print(f"Improvement: {final_score - current_score:.4f} ({((final_score - current_score) / current_score * 100):+.1f}%)")

=== FINAL EVALUATION ON TEST SET ===
Test Set Performance:
Combined Score: 0.6940
Precision@5: 1.0000
MAP: 0.6484
Diversity: 0.0490

=== MOVIE RECOMMENDATIONS ===

--- Recommendations for 'Toy Story' ---
Target: Toy Story (1995) - Animation Comedy Family
1. Partysaurus Rex (2012) - Sim: 0.677 - Att: [0.27, 0.20, 0.22, 0.30]
   Genres: Animation Comedy Family Fantasy
2. Toy Story of Terror! (2013) - Sim: 0.677 - Att: [0.27, 0.20, 0.22, 0.30]
   Genres: Animation Comedy Family
3. Toy Story 2 (1999) - Sim: 0.638 - Att: [0.27, 0.20, 0.22, 0.30]
   Genres: Animation Comedy Family
4. Toy Story That Time Forgot (2014) - Sim: 0.617 - Att: [0.28, 0.20, 0.22, 0.30]
   Genres: Animation Family
5. Hawaiian Vacation (2011) - Sim: 0.607 - Att: [0.27, 0.20, 0.22, 0.30]
   Genres: Animation Family

--- Recommendations for 'The Matrix' ---
Target: The Matrix (1999) - Action Science Fiction
1. The Matrix Revolutions (2003) - Sim: 0.668 - Att: [0.27, 0.20, 0.22, 0.30]
   Genres: Adventure Action Thriller