# Database Initialization Tool

This notebook provides tools to manage the doctorate programs database:
1. Connect to MongoDB and view database status
2. Check which records need initialization/enrichment
3. Initialize all records
4. Initialize only records with missing fields

In [4]:
# Import necessary libraries
import os
import json
import requests
import pandas as pd
from datetime import datetime
from pymongo import MongoClient
from bson.objectid import ObjectId
from dotenv import load_dotenv
from IPython.display import display, HTML

In [3]:
!pip install pymongo requests pandas python-dotenv

Collecting pymongo
  Downloading pymongo-4.13.0-cp311-cp311-win_amd64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.13.0-cp311-cp311-win_amd64.whl (848 kB)
   ---------------------------------------- 0.0/848.1 kB ? eta -:--:--
   ---------------------------------------- 848.1/848.1 kB 7.5 MB/s eta 0:00:00
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
Installing collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.13.0



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Connect to MongoDB

In [14]:
# Load environment variables from .env file
load_dotenv()

# MongoDB connection string; falls back to a local 'doctorados' database
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/doctorados')

# OpenAI API key used as the Bearer token by the enrichment functions
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    # Without a key every request would be sent as "Bearer None" and rejected,
    # so make the misconfiguration visible as soon as the cell runs.
    print('Warning: OPENAI_API_KEY is not set; OpenAI requests will be rejected.')

def connect_to_mongo():
    """
    Connect to MongoDB and return the default database.

    The connection string is read from the module-level ``MONGODB_URI``.
    Because ``MongoClient`` connects lazily, a ``ping`` command is issued so
    that an unreachable server is reported here rather than at the first query.

    Returns:
        The database handle returned by ``client.get_database()``.

    Raises:
        Exception: Any error raised while creating the client, pinging the
            server or resolving the database. The error is printed and then
            re-raised unchanged.
    """
    client = None
    try:
        client = MongoClient(MONGODB_URI)
        # Force a round trip before claiming the connection succeeded.
        client.admin.command('ping')
        database = client.get_database()
        print('Connected to MongoDB')
        return database
    except Exception as error:
        print(f'Error connecting to MongoDB: {error}')
        # Do not leave a half-initialised client (and its sockets) behind.
        if client is not None:
            client.close()
        raise

In [6]:
# Connect to database and display basic statistics
db = connect_to_mongo()
collection = db['programas']

# Get collection stats: total documents and distinct universities / cities
total_programs = collection.count_documents({})
total_universities = len(collection.distinct('universidad'))
total_cities = len(collection.distinct('ciudad'))

print("Database Statistics:")
print(f"Total Programs: {total_programs}")
print(f"Total Universities: {total_universities}")
print(f"Total Cities: {total_cities}")

# Sample record
print("\nSample record:")
sample = collection.find_one({})
# default=str renders ObjectId/datetime values; ensure_ascii=False keeps the
# accented Spanish text readable instead of printing \uXXXX escapes.
print(json.dumps(sample, default=str, indent=2, ensure_ascii=False))

Connected to MongoDB
Database Statistics:
Total Programs: 167
Total Universities: 46
Total Cities: 36

Sample record:
{
  "_id": "684125cbe780dc9d4ac52474",
  "universidad": "Universidad de Navarra",
  "ciudad": "Pamplona",
  "programa": "Doctorado en Medicamentos y Salud",
  "linea_investigacion": "\u00b7 L\u00ednea 1: Nuevas mol\u00e9culas con aplicabilidad biol\u00f3gica, estudios de farmacolog\u00eda b\u00e1sica y cl\u00ednica\n\n\u00b7 L\u00ednea 2: Dise\u00f1o Gal\u00e9nico, Farmacocin\u00e9tica y uso racional de los medicamentos",
  "url": "https://www.unav.edu/web/programa-de-doctorado-en-medicamentos-y-salud"
}


## Check Records Needing Initialization

This section identifies records that need initialization by checking for missing fields.

In [ ]:
def check_missing_fields(collection_name='programas'):
    """
    Report which records in a collection are missing required fields.

    A field counts as missing only when the key is absent from the document
    (``$exists: False``); a key that is present is never reported, whatever
    its value.

    Args:
        collection_name: Name of the collection to inspect

    Returns:
        dict with three entries:
            'statistics': DataFrame with columns 'Field' and 'Count' (number
                of records lacking that field), sorted by 'Count' descending
            'total_missing_any': number of records lacking at least one field
            'missing_records': list of those records, projected to
                _id, universidad, ciudad and programa
    """
    db = connect_to_mongo()
    collection = db[collection_name]

    # Required fields that should be present in each record
    required_fields = [
        'resumen',
        'coords',
        'stats',
        'city_metrics',
        'university_summary',
        'ultimo_enriquecimiento',
        'city_description',      # New required field
        'university_description' # New required field
    ]

    # Count records missing each field
    missing_counts = {
        field: collection.count_documents({field: {"$exists": False}})
        for field in required_fields
    }

    # Records missing any required field
    any_missing_query = {
        "$or": [{field: {"$exists": False}} for field in required_fields]
    }
    missing_records = list(
        collection.find(
            any_missing_query,
            {"_id": 1, "universidad": 1, "ciudad": 1, "programa": 1}
        )
    )
    # Derive the total from the fetched list: one query fewer, and the count
    # can never disagree with the records that are returned alongside it.
    total_missing_any = len(missing_records)

    # Create results DataFrame
    results = pd.DataFrame(list(missing_counts.items()), columns=['Field', 'Count'])
    results = results.sort_values('Count', ascending=False)

    return {
        'statistics': results,
        'total_missing_any': total_missing_any,
        'missing_records': missing_records
    }

# Check missing fields
missing_data = check_missing_fields()

print(f"Records missing any required field: {missing_data['total_missing_any']}")
display(missing_data['statistics'])

# Display sample of records needing initialization (at most five)
if missing_data['missing_records']:
    print("\nSample of records needing initialization:")
    sample_size = min(5, len(missing_data['missing_records']))
    for record in missing_data['missing_records'][:sample_size]:
        # ensure_ascii=False keeps accented names readable in the output
        print(json.dumps(record, default=str, indent=2, ensure_ascii=False))

## Implementing Data Enrichment Functions

Python ports of the original JavaScript enrichment functions.

In [8]:
# Backup functions

def backup_collection(collection_name='programas', backup_prefix='backup'):
    """
    Create a JSON backup of a MongoDB collection.

    The file is written to the directory named by the ``BACKUP_DIR``
    environment variable, or to ``./backups`` when it is unset. Values that
    JSON cannot represent natively (ObjectId, datetime, ...) are stored as
    their ``str()`` form.

    Args:
        collection_name: Name of the collection to backup
        backup_prefix: Prefix for the backup file

    Returns:
        Path to the backup file or None if the collection is empty

    Raises:
        Exception: Any error from the database or the file system; it is
            printed and then re-raised unchanged.
    """
    try:
        db = connect_to_mongo()

        # Get all documents from the collection
        docs = list(db[collection_name].find({}))

        # Nothing to save: report it and leave the file system untouched
        if not docs:
            print(f"Collection '{collection_name}' is empty. No backup created.")
            return None

        # Timestamp without ':' or '.' so the name is valid on every platform
        timestamp = datetime.now().isoformat().replace(':', '-').replace('.', '-')
        backup_file_name = f"{backup_prefix}_{collection_name}_{timestamp}.json"

        # Determine backup directory path
        backup_dir = os.getenv('BACKUP_DIR', os.path.join(os.getcwd(), 'backups'))

        # Ensure backup directory exists
        if not os.path.exists(backup_dir):
            os.makedirs(backup_dir, exist_ok=True)
            print(f"Created backup directory: {backup_dir}")

        backup_path = os.path.join(backup_dir, backup_file_name)

        # Serialize before opening the file so a serialization error cannot
        # leave a truncated backup behind. default=str covers ObjectId as well
        # as datetimes and any other non-JSON value, at any nesting depth.
        serialized = json.dumps(docs, ensure_ascii=False, indent=2, default=str)

        with open(backup_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

        print(f"Backup successful: '{collection_name}' → '{backup_path}' ({len(docs)} documents)")
        return backup_path
    except Exception as error:
        print(f"Error creating backup: {error}")
        raise

In [ ]:
# Data Enrichment Functions

def get_coordinates(city, country=None):
    """
    Get coordinates for a city using Nominatim/OpenStreetMap.

    When no country is given it is inferred from the city name: names that
    contain one of a short list of Portuguese cities (or the word 'Portugal')
    map to Portugal, everything else defaults to Spain.

    Args:
        city: City name
        country: Country name, will be determined from city if None

    Returns:
        dict with float 'lat' and 'lon' of the first search result, or None
        when there is nothing to search for, no result, or the request fails
    """
    try:
        if not city and not country:
            # Nothing to geocode. Without this guard an empty city would be
            # sent as a country-only query and return the country centroid.
            return None

        # Determine country based on city name if not provided
        if not country:
            portuguese_cities = ['Lisboa', 'Porto', 'Coimbra', 'Braga', 'Aveiro', 'Faro']
            is_portuguese = 'Portugal' in city or any(pc in city for pc in portuguese_cities)
            country = 'Portugal' if is_portuguese else 'Spain'

        params = {
            'format': 'json',
            'limit': 1
        }

        # If we have a city name, use it
        if city:
            params['city'] = city
        params['country'] = country

        headers = {
            'User-Agent': 'GraduateProgramsEvaluator/1.0'
        }

        response = requests.get(
            'https://nominatim.openstreetmap.org/search',
            params=params,
            headers=headers,
            timeout=10  # requests waits indefinitely unless a timeout is given
        )
        # Turn HTTP errors (rate limiting, outages) into an explicit exception
        response.raise_for_status()

        data = response.json()
        if data:
            return {
                'lat': float(data[0]['lat']),
                'lon': float(data[0]['lon'])
            }
        return None
    except Exception as error:
        print(f"Error getting coordinates for {city} in {country}: {error}")
        return None

def generate_summary(text):
    """
    Generate a short summary of research lines using OpenAI.

    Args:
        text: Text to summarize

    Returns:
        The model's reply with surrounding whitespace stripped, or the fixed
        message "No se pudo generar un resumen." when there is no text to
        summarize or the request fails
    """
    try:
        if not text or not str(text).strip():
            # Nothing to summarize: skip the API call instead of asking the
            # model to summarize an empty prompt.
            print("Error generating summary: no text provided")
            return "No se pudo generar un resumen."

        headers = {
            'Authorization': f"Bearer {OPENAI_API_KEY}",
            'Content-Type': 'application/json'
        }

        payload = {
            'model': 'gpt-3.5-turbo',
            'messages': [
                {
                    'role': 'system',
                    'content': 'Eres un asistente académico especializado en resumir líneas de investigación científica de manera concisa y profesional.'
                },
                {
                    'role': 'user',
                    'content': f"Resume las siguientes líneas de investigación en un párrafo breve con bullets cuando sea posible, destacando los aspectos más importantes y potenciales aplicaciones: {text}"
                }
            ],
            'max_tokens': 200,
            'temperature': 0.7
        }

        response = requests.post(
            'https://api.openai.com/v1/chat/completions',
            headers=headers,
            json=payload,
            timeout=60  # requests waits indefinitely unless a timeout is given
        )
        # Report the real HTTP error (bad key, rate limit) rather than a
        # KeyError on the missing 'choices' entry of an error body.
        response.raise_for_status()

        return response.json()['choices'][0]['message']['content'].strip()
    except Exception as error:
        print(f"Error generating summary: {error}")
        return "No se pudo generar un resumen."

def generate_university_summary(universidad, ciudad):
    """
    Generate a bullet-style university summary using OpenAI.

    The prompt asks for four key points, why the university stands out and
    any ranking it appears in, following an embedded example.

    Args:
        universidad: University name
        ciudad: City name

    Returns:
        The model's reply with surrounding whitespace stripped, or a fixed
        "No se pudo generar un resumen..." message naming the university when
        the request fails
    """
    try:
        headers = {
            'Authorization': f"Bearer {OPENAI_API_KEY}",
            'Content-Type': 'application/json'
        }

        payload = {
            'model': 'gpt-3.5-turbo',
            'messages': [
                {
                    'role': 'system',
                    'content': 'Eres un experto en educación superior. Resume en 4 bullets los puntos clave de la universidad indicada, explica por qué es destacada y menciona si aparece en algún ranking relevante (nacional en España, europeo o internacional), incluyendo el nombre del ranking y la posición si está disponible.'
                },
                {
                    'role': 'user',
                    'content': f"Universidad: {universidad} ({ciudad})\n\nDame un resumen en formato similar al siguiente ejemplo:\n\nEjemplo:\nUniversidad: Universidad de Barcelona\n\nFundada en 1450, es una de las universidades más antiguas de España.\n\nOfrece una amplia gama de programas en ciencias, humanidades y tecnología.\n\nDestaca por su investigación en biomedicina y ciencias sociales.\n\nFuerte red de colaboración internacional.\n\nPor qué es destacada: Reconocida por su excelencia académica, producción científica y alta empleabilidad de sus egresados.\nRankings: Aparece en el QS World University Rankings (#184) y es la 1ª en España según el ranking de Shanghai 2024."
                }
            ],
            'max_tokens': 400,
            'temperature': 0.7
        }

        response = requests.post(
            'https://api.openai.com/v1/chat/completions',
            headers=headers,
            json=payload,
            timeout=60  # requests waits indefinitely unless a timeout is given
        )
        # Report the real HTTP error rather than a KeyError on 'choices'
        response.raise_for_status()

        return response.json()['choices'][0]['message']['content'].strip()
    except Exception as error:
        print(f"Error generating university summary: {error}")
        return f"No se pudo generar un resumen para la Universidad {universidad}."

def generate_university_description(universidad, ciudad):
    """
    Generate a detailed university description with bullets using OpenAI.

    The prompt requests seven markdown bullets (founding year, specialities,
    facilities, rankings, international collaborations, opportunities for
    doctoral students and a curious fact).

    Args:
        universidad: University name
        ciudad: City name

    Returns:
        The model's reply with surrounding whitespace stripped, or a single
        fallback bullet naming the university when the request fails
    """
    try:
        headers = {
            'Authorization': f"Bearer {OPENAI_API_KEY}",
            'Content-Type': 'application/json'
        }

        payload = {
            'model': 'gpt-3.5-turbo',
            'messages': [
                {
                    'role': 'system',
                    'content': 'Eres un experto en educación superior que proporciona información concisa y útil para estudiantes de doctorado.'
                },
                {
                    'role': 'user',
                    'content': f"Genera una descripción detallada de la {universidad} ubicada en {ciudad} utilizando el siguiente formato con bullets:\n\n"
                              "* **Año de fundación**: [año] y un breve dato histórico relevante\n"
                              "* **Especialidades destacadas**: Menciona 2-3 áreas académicas por las que es más conocida\n"
                              "* **Instalaciones**: Principales campus o instalaciones de investigación\n"
                              "* **Reconocimientos**: Posiciones en rankings académicos (si se conocen)\n"
                              "* **Colaboraciones internacionales**: Principales colaboraciones o redes\n"
                              "* **Oportunidades para doctorandos**: Beneficios específicos para estudiantes de doctorado\n"
                              "* **Dato interesante**: Un dato único o curioso sobre la universidad\n\n"
                              "Usa formato markdown para la negrita (con ** alrededor del texto). Mantén cada bullet breve pero informativo."
                }
            ],
            'max_tokens': 500,
            'temperature': 0.7
        }

        response = requests.post(
            'https://api.openai.com/v1/chat/completions',
            headers=headers,
            json=payload,
            timeout=60  # requests waits indefinitely unless a timeout is given
        )
        # Report the real HTTP error rather than a KeyError on 'choices'
        response.raise_for_status()

        return response.json()['choices'][0]['message']['content'].strip()
    except Exception as error:
        print(f"Error generating university description: {error}")
        return f"* No se pudo generar una descripción detallada para la Universidad {universidad}."

def generate_city_description(ciudad):
    """
    Generate a detailed city description with bullets using OpenAI.

    The country named in the prompt is Portugal when the city name contains
    one of a short list of Portuguese cities (or the word 'Portugal'), and
    España otherwise.

    Args:
        ciudad: City name

    Returns:
        The model's reply with surrounding whitespace stripped, or a single
        fallback bullet naming the city when the request fails
    """
    try:
        # Determine if the city is in Portugal (substring match, so an exact
        # name in the list is covered as well)
        portuguese_cities = ['Lisboa', 'Porto', 'Coimbra', 'Braga', 'Aveiro', 'Faro']
        is_portuguese_city = ('Portugal' in ciudad or
                              any(pc in ciudad for pc in portuguese_cities))

        country = "Portugal" if is_portuguese_city else "España"

        headers = {
            'Authorization': f"Bearer {OPENAI_API_KEY}",
            'Content-Type': 'application/json'
        }

        payload = {
            'model': 'gpt-3.5-turbo',
            'messages': [
                {
                    'role': 'system',
                    'content': 'Eres un asistente especializado en datos demográficos, turísticos y culturales de ciudades. Proporciona información concisa en formato de bullets con texto en negrita para destacar elementos importantes.'
                },
                {
                    'role': 'user',
                    'content': f"Genera una descripción detallada de {ciudad}, {country} con la siguiente información en formato de bullets:\n\n"
                              "* **Población**: Aproximada y si es posible menciona alguna característica demográfica relevante\n"
                              "* **Conocida por**: Principales atracciones o características por las que es famosa\n"
                              "* **Clima**: Temperatura media anual, mínima y máxima, y características estacionales\n"
                              "* **Economía**: Principales industrias o sectores económicos\n"
                              "* **Educación**: Universidades o centros educativos destacados (aparte de la mencionada)\n"
                              "* **Cultura**: Principales eventos culturales, museos o tradiciones\n"
                              "* **Transporte**: Infraestructura y conectividad\n"
                              "* **Costo de vida**: Comparativa con otras ciudades del país\n"
                              "* **Seguridad**: Nivel general de seguridad para estudiantes\n"
                              "* **Dato curioso**: Un dato único o interesante sobre la ciudad\n\n"
                              "Usa formato markdown para la negrita (con ** alrededor del texto). Mantén cada bullet breve pero informativo."
                }
            ],
            'max_tokens': 600,
            'temperature': 0.7
        }

        response = requests.post(
            'https://api.openai.com/v1/chat/completions',
            headers=headers,
            json=payload,
            timeout=60  # requests waits indefinitely unless a timeout is given
        )
        # Report the real HTTP error rather than a KeyError on 'choices'
        response.raise_for_status()

        return response.json()['choices'][0]['message']['content'].strip()
    except Exception as error:
        print(f"Error generating city description: {error}")
        return f"* No se pudo generar una descripción detallada para {ciudad}."

def generate_stats(universidad, programas):
    """
    Generate qualitative academic stats for a university using OpenAI.

    Args:
        universidad: University name
        programas: Iterable of dicts with keys 'nombre' and
            'lineas_investigacion'; the latter may be a list of strings or a
            single string

    Returns:
        dict parsed from the model's JSON reply (the prompt asks for the keys
        innovacion, interdisciplinariedad, impacto, internacional and
        aplicabilidad, each 1-10). When the request fails or the reply holds
        no JSON object, every one of those keys is returned with the value 5.
    """
    # Neutral scores used whenever the model cannot be reached or understood
    default_stats = {
        'innovacion': 5,
        'interdisciplinariedad': 5,
        'impacto': 5,
        'internacional': 5,
        'aplicabilidad': 5
    }

    try:
        bloques = []
        for p in programas:
            lineas = p['lineas_investigacion']
            # Joining a plain string would interleave '. ' between its
            # characters, so only join real sequences of lines.
            if not isinstance(lineas, str):
                lineas = '. '.join(lineas)
            bloques.append(f"{p['nombre']}: {lineas}")
        programas_texto = '\n\n'.join(bloques)

        headers = {
            'Authorization': f"Bearer {OPENAI_API_KEY}",
            'Content-Type': 'application/json'
        }

        payload = {
            'model': 'gpt-3.5-turbo',
            'messages': [
                {
                    'role': 'system',
                    'content': 'Eres un analista de datos académicos que evalúa programas de doctorado y genera estadísticas cualitativas.'
                },
                {
                    'role': 'user',
                    'content': f"Basándote en la siguiente información de programas de doctorado de {universidad}, genera 5 métricas numéricas en escala del 1 al 10 para evaluar: \n"
                              "1. Innovación: cuán innovadores son los temas de investigación\n"
                              "2. Interdisciplinariedad: nivel de colaboración entre disciplinas\n"
                              "3. Impacto potencial: posible impacto en la sociedad/industria\n"
                              "4. Competitividad internacional: posicionamiento internacional\n"
                              "5. Aplicabilidad: orientación práctica vs. teórica\n\n"
                              f"Programas y líneas de investigación:\n{programas_texto}\n\n"
                              "Responde SOLO con un objeto JSON con este formato exacto:\n"
                              "{\"innovacion\": N, \"interdisciplinariedad\": N, \"impacto\": N, \"internacional\": N, \"aplicabilidad\": N}\n"
                              "donde N es un número del 1 al 10."
                }
            ],
            'max_tokens': 150,
            'temperature': 0.7
        }

        response = requests.post(
            'https://api.openai.com/v1/chat/completions',
            headers=headers,
            json=payload,
            timeout=60  # requests waits indefinitely unless a timeout is given
        )
        # Report the real HTTP error rather than a KeyError on 'choices'
        response.raise_for_status()

        content = response.json()['choices'][0]['message']['content'].strip()
        try:
            # The reply may wrap the object in prose or a ```json fence, so
            # parse only the outermost {...} span when one is present.
            start, end = content.find('{'), content.rfind('}')
            candidate = content[start:end + 1] if 0 <= start < end else content
            stats = json.loads(candidate)
            if not isinstance(stats, dict):
                raise ValueError('stats response is not a JSON object')
            return stats
        except Exception as e:
            print(f"Error parsing stats JSON: {e}")
            return dict(default_stats)
    except Exception as error:
        print(f"Error generating stats: {error}")
        return dict(default_stats)

def _openai_chat_text(system_content, user_content, max_tokens, temperature=0.5):
    """Send one system+user exchange to the OpenAI chat endpoint and return the stripped reply text."""
    response = requests.post(
        'https://api.openai.com/v1/chat/completions',
        headers={
            'Authorization': f"Bearer {OPENAI_API_KEY}",
            'Content-Type': 'application/json'
        },
        json={
            'model': 'gpt-3.5-turbo',
            'messages': [
                {'role': 'system', 'content': system_content},
                {'role': 'user', 'content': user_content}
            ],
            'max_tokens': max_tokens,
            'temperature': temperature
        },
        timeout=60  # requests waits indefinitely unless a timeout is given
    )
    # Report the real HTTP error rather than a KeyError on 'choices'
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content'].strip()


def _first_int_match(pattern, text, default):
    """Return the first match of `pattern` in `text` as an int, or `default` when nothing matches."""
    import re  # local import: the notebook's import cell does not bring in re
    match = re.search(pattern, text)
    return int(match.group(0)) if match else default


def get_city_metrics(ciudad):
    """
    Get city-specific metrics using OpenAI.

    Five questions are sent in sequence (cost of living, distance to the
    reference city, healthcare, public transport, air quality). For each
    reply the first number matching the expected range is stored as the
    score and, except for the distance, the full reply as the comment.

    The reference city is Lisboa when the city name contains one of a short
    list of Portuguese cities (or the word 'Portugal'), Madrid otherwise. The
    distance is stored under the matching 'distancia_a_*_km' key and the
    other one is set to None.

    Args:
        ciudad: City name

    Returns:
        dict with the keys costo_vida, costo_vida_comentario,
        distancia_a_madrid_km, distancia_a_lisboa_km, ciudad_referencia,
        calidad_servicio_medico, calidad_transporte, calidad_aire and the
        matching *_comentario keys. If any request fails, a dict with the
        same keys is returned holding default scores and None comments.
    """
    # Known before the try block so the fallback can honour the country even
    # when a later request fails; stays False if detection itself fails.
    is_portuguese_city = False
    try:
        metrics = {}

        # Determine if the city is in Portugal (substring match, so an exact
        # name in the list is covered as well)
        portuguese_cities = ['Lisboa', 'Porto', 'Coimbra', 'Braga', 'Aveiro', 'Faro']
        is_portuguese_city = ('Portugal' in ciudad or
                              any(pc in ciudad for pc in portuguese_cities))

        # Country-specific assistant content
        country_context = ("Eres un asistente especializado en economía y datos de ciudades portuguesas."
                           if is_portuguese_city else
                           "Eres un asistente especializado en economía y datos de ciudades españolas.")

        # Cost of living - inverted 1-10 scale (cheapest = 10, most expensive = 1)
        costo_vida_text = _openai_chat_text(
            country_context,
            f"Evalúa el costo de vida en {ciudad} (sin incluir alquiler) en una escala INVERTIDA del 1 al 10, donde:\n"
            "- 10 puntos = costo de vida muy bajo (más económico)\n"
            "- 1 punto = costo de vida muy alto (más caro)\n\n"
            "Primero compara con Ciudad de México como referencia y luego asigna una puntuación.\n\n"
            "Responde con:\n"
            "1. Un número entero del 1 al 10 (recuerda: mayor puntuación = menor costo de vida)\n"
            "2. Un comentario que explique el porcentaje aproximado de diferencia con Ciudad de México\n"
            "3. Menciona 2-3 factores específicos que afectan el costo de vida en esta ciudad\n\n"
            "Ejemplo de respuesta: \"8. El costo de vida en [ciudad] es aproximadamente un 30% menor que en Ciudad de México. Destacan los bajos precios en transporte público y alimentación, aunque los servicios básicos son relativamente costosos.\"",
            max_tokens=200
        )
        metrics['costo_vida'] = _first_int_match(r'\b([1-9]|10)\b', costo_vida_text, 5)
        metrics['costo_vida_comentario'] = costo_vida_text

        # Reference city for distance - Madrid for Spain, Lisboa for Portugal
        reference_city = "Lisboa" if is_portuguese_city else "Madrid"

        # Distance to the reference city
        distancia_text = _openai_chat_text(
            'Eres un asistente especializado en geografía y cálculo de distancias.',
            f"Calcula la distancia aérea en kilómetros entre {ciudad} y {reference_city}, basándote en coordenadas. Devuélvela como número entero.",
            max_tokens=50
        )
        distancia_value = _first_int_match(r'\b([0-9]{1,4})\b', distancia_text, 300)

        # Store appropriate distance field based on country
        if is_portuguese_city:
            metrics['distancia_a_lisboa_km'] = distancia_value
            metrics['distancia_a_madrid_km'] = None  # Not applicable for Portuguese cities
        else:
            metrics['distancia_a_madrid_km'] = distancia_value
            metrics['distancia_a_lisboa_km'] = None  # Not applicable for Spanish cities

        # Store the reference city used
        metrics['ciudad_referencia'] = reference_city

        # Healthcare quality
        medico_system_content = ("Eres un asistente especializado en sistemas sanitarios portugueses."
                                 if is_portuguese_city else
                                 "Eres un asistente especializado en sistemas sanitarios españoles.")
        medico_text = _openai_chat_text(
            medico_system_content,
            f"En una escala de 0 a 10, ¿qué puntuación le das a la calidad sanitaria en {ciudad}? Proporciónanos solo el número y, opcionalmente, dos frases de justificación.",
            max_tokens=100
        )
        metrics['calidad_servicio_medico'] = _first_int_match(r'\b([0-9]|10)\b', medico_text, 8)
        metrics['calidad_servicio_medico_comentario'] = medico_text

        # Public transport quality
        transporte_text = _openai_chat_text(
            'Eres un asistente especializado en infraestructura de transporte urbano.',
            f"En una escala de 0 a 10, ¿cómo calificarías la calidad de transporte público en {ciudad}? Responde con un número y una breve justificación.",
            max_tokens=100
        )
        metrics['calidad_transporte'] = _first_int_match(r'\b([0-9]|10)\b', transporte_text, 7)
        metrics['calidad_transporte_comentario'] = transporte_text

        # Air quality
        aire_text = _openai_chat_text(
            'Eres un asistente especializado en calidad medioambiental urbana.',
            f"En una escala de 0 a 10, ¿cómo calificarías la calidad del aire en {ciudad}? Solo el número y, opcionalmente, una frase justificando.",
            max_tokens=100
        )
        metrics['calidad_aire'] = _first_int_match(r'\b([0-9]|10)\b', aire_text, 7)
        metrics['calidad_aire_comentario'] = aire_text

        return metrics
    except Exception as error:
        print(f"Error getting city metrics for {ciudad}: {error}")
        # Same keys as the success path so consumers see one schema; the
        # default distance goes under the key of the applicable reference city.
        return {
            'costo_vida': 7,
            'costo_vida_comentario': None,
            'distancia_a_madrid_km': None if is_portuguese_city else 300,
            'distancia_a_lisboa_km': 300 if is_portuguese_city else None,
            'ciudad_referencia': "Lisboa" if is_portuguese_city else "Madrid",
            'calidad_servicio_medico': 8,
            'calidad_servicio_medico_comentario': None,
            'calidad_transporte': 7,
            'calidad_transporte_comentario': None,
            'calidad_aire': 7,
            'calidad_aire_comentario': None
        }

## Initialize All Records

This function enriches all records in the database with the required fields.

In [ ]:
def enrich_data(collection_name='programas', create_backup=True):
    """
    Enrich every record with coordinates, summaries, stats, and city metrics.

    Programs are grouped by (universidad, ciudad). For each group the city-level
    data (coordinates, metrics, description) and the university-level data
    (stats, summary, description) are generated once and written to the programs
    of that group; a summary is generated for each program that has a
    'linea_investigacion' value.

    Args:
        collection_name: Collection name to enrich
        create_backup: Whether to create a backup before enrichment

    Returns:
        dict with the keys 'message', 'updated' (number of programs processed),
        'universities' (number of university/city groups processed) and
        'timestamp' (ISO 8601 string)

    Raises:
        Exception: any error raised during enrichment is printed and re-raised
    """
    try:
        # Create backup if requested
        if create_backup:
            backup_path = backup_collection(collection_name)
            if not backup_path:
                print('Backup not created. Continuing with enrichment...')
            else:
                print(f'Backup created at: {backup_path}')

        db = connect_to_mongo()
        collection = db[collection_name]

        # Group programs by university and city to process them together
        pipeline = [
            {
                '$group': {
                    '_id': {
                        'universidad': "$universidad",
                        'ciudad': "$ciudad"
                    },
                    'programas': {
                        '$push': {
                            '_id': "$_id",
                            'programa': "$programa",
                            'linea_investigacion': "$linea_investigacion",
                            'url': "$url"
                        }
                    }
                }
            }
        ]

        universidades = list(collection.aggregate(pipeline))

        total_updated = 0
        total_universities = len(universidades)
        processed_universities = 0

        print(f"Starting enrichment for {total_universities} universities...")

        # Process each university/city group
        for uni in universidades:
            universidad = uni['_id']['universidad']
            ciudad = uni['_id']['ciudad']
            programas = uni['programas']
            processed_universities += 1

            print(f"Processing {processed_universities}/{total_universities}: {universidad} ({ciudad})")

            # Get coordinates for the city
            coords = get_coordinates(ciudad)
            print(f"Coordinates for {ciudad}: {coords}")

            # Get city metrics (once per university/city group)
            print(f"Getting metrics for {ciudad}...")
            city_metrics = get_city_metrics(ciudad)
            print(f"City metrics obtained for {ciudad}")

            # Generate city description
            print(f"Generating city description for {ciudad}...")
            city_description = generate_city_description(ciudad)
            print(f"City description generated for {ciudad}")

            # City-level fields are the same for every program of the group, so
            # build them once instead of once per program. Falsy results are not
            # written, which leaves any previously stored value untouched.
            shared_update = {}
            if coords:
                shared_update['coords'] = coords
            if city_metrics:
                shared_update['city_metrics'] = city_metrics
            if city_description:
                shared_update['city_description'] = city_description

            # Process each program: summary plus city-level fields in a single write
            for prog in programas:
                update_data = dict(shared_update)

                # The field may be absent or None on some records; skip the
                # summary for those instead of aborting the whole run.
                linea_investigacion = prog.get('linea_investigacion')
                if linea_investigacion:
                    print(f"Generating summary for: {prog.get('programa')}")
                    update_data['resumen'] = generate_summary(linea_investigacion)

                if update_data:
                    # prog['_id'] comes straight from the database, so it is used
                    # as-is; re-wrapping it in ObjectId would break non-ObjectId ids.
                    collection.update_one(
                        {'_id': prog['_id']},
                        {'$set': update_data}
                    )

                total_updated += 1

            # Generate stats for the university
            print(f"Generating stats for {universidad}...")
            processed_programas = []
            for p in programas:
                lineas = p.get('linea_investigacion')
                processed_programas.append({
                    'nombre': p.get('programa'),
                    'lineas_investigacion': lineas.split('\n\n') if lineas else []
                })

            stats = generate_stats(universidad, processed_programas)
            print(f"Stats generated for {universidad}")

            # Generate university summary
            print(f"Generating university summary for {universidad} ({ciudad})...")
            university_summary = generate_university_summary(universidad, ciudad)
            print(f"University summary generated for {universidad}")

            # Generate university description
            print(f"Generating university description for {universidad} ({ciudad})...")
            university_description = generate_university_description(universidad, ciudad)
            print(f"University description generated for {universidad}")

            # Write the university-level fields to exactly the programs of this
            # group. Filtering on 'universidad' alone would also overwrite the
            # programs of the same university located in a different city.
            ultimo_enriquecimiento = datetime.now().isoformat()
            collection.update_many(
                {'_id': {'$in': [p['_id'] for p in programas]}},
                {
                    '$set': {
                        'stats': stats,
                        'university_summary': university_summary,
                        'university_description': university_description,
                        'ultimo_enriquecimiento': ultimo_enriquecimiento
                    }
                }
            )

        print(f"Data enrichment completed: {processed_universities} universities processed, {total_updated} programs updated.")

        return {
            'message': 'Data enrichment completed',
            'updated': total_updated,
            'universities': processed_universities,
            'timestamp': datetime.now().isoformat()
        }
    except Exception as error:
        print(f"Error enriching data: {error}")
        raise

## Initialize Only Missing Records

The `enrich_missing_data` function defined below enriches only the records that are missing at least one required field, and fills in only the fields each record lacks.

In [ ]:
def enrich_missing_data(collection_name='programas', create_backup=True):
    """
    Enrich only records with missing fields.

    Records lacking any required field are grouped by (universidad, ciudad).
    For each group, a shared value (coordinates, city metrics, city description,
    stats, university summary, university description) is generated only when at
    least one program of the group lacks it, and is written only to the programs
    that lack it. Every processed program also receives a fresh
    'ultimo_enriquecimiento' timestamp.

    Args:
        collection_name: Collection name to enrich
        create_backup: Whether to create a backup before enrichment

    Returns:
        dict with the keys 'message', 'updated' (number of programs updated),
        'universities' (number of university/city groups processed) and
        'timestamp' (ISO 8601 string)

    Raises:
        Exception: any error raised during enrichment is printed and re-raised
    """
    try:
        # Create backup if requested
        if create_backup:
            backup_path = backup_collection(collection_name)
            if not backup_path:
                print('Backup not created. Continuing with enrichment...')
            else:
                print(f'Backup created at: {backup_path}')

        db = connect_to_mongo()
        collection = db[collection_name]

        # Required fields that should be present in each record
        required_fields = [
            'resumen',
            'coords',
            'stats',
            'city_metrics',
            'university_summary',
            'ultimo_enriquecimiento',
            'city_description',      # New required field
            'university_description' # New required field
        ]

        # Find records missing any required field
        query = {
            "$or": [{field: {"$exists": False}} for field in required_fields]
        }

        # Count records needing update
        total_missing = collection.count_documents(query)
        print(f"Found {total_missing} records needing updates")

        if total_missing == 0:
            return {
                'message': 'No records need updating',
                'updated': 0,
                'universities': 0,
                'timestamp': datetime.now().isoformat()
            }

        # Group incomplete programs by university to process them together.
        # $ifNull guarantees every enrichment key is present in the pushed
        # sub-document (as None when the field is missing).
        pipeline = [
            {"$match": query},
            {
                '$group': {
                    '_id': {
                        'universidad': "$universidad",
                        'ciudad': "$ciudad"
                    },
                    'programas': {
                        '$push': {
                            '_id': "$_id",
                            'programa': "$programa",
                            'linea_investigacion': "$linea_investigacion",
                            'url': "$url",
                            'resumen': {"$ifNull": ["$resumen", None]},
                            'coords': {"$ifNull": ["$coords", None]},
                            'stats': {"$ifNull": ["$stats", None]},
                            'city_metrics': {"$ifNull": ["$city_metrics", None]},
                            'university_summary': {"$ifNull": ["$university_summary", None]},
                            'city_description': {"$ifNull": ["$city_description", None]},
                            'university_description': {"$ifNull": ["$university_description", None]}
                        }
                    }
                }
            }
        ]

        universidades = list(collection.aggregate(pipeline))

        # Fields whose value is shared by all programs of a university/city group
        shared_fields = (
            'coords',
            'city_metrics',
            'city_description',
            'stats',
            'university_summary',
            'university_description'
        )

        total_updated = 0
        total_universities = len(universidades)
        processed_universities = 0

        print(f"Starting enrichment for {total_universities} universities with incomplete records...")

        # Process each university
        for uni in universidades:
            universidad = uni['_id']['universidad']
            ciudad = uni['_id']['ciudad']
            programas = uni['programas']
            processed_universities += 1

            print(f"Processing {processed_universities}/{total_universities}: {universidad} ({ciudad})")

            # A shared field is generated only if some program of the group lacks it
            missing = {
                field for field in shared_fields
                if any(prog[field] is None for prog in programas)
            }
            generated = {}

            if 'coords' in missing:
                generated['coords'] = get_coordinates(ciudad)
                print(f"Coordinates for {ciudad}: {generated['coords']}")

            if 'city_metrics' in missing:
                print(f"Getting metrics for {ciudad}...")
                generated['city_metrics'] = get_city_metrics(ciudad)
                print(f"City metrics obtained for {ciudad}")

            if 'city_description' in missing:
                print(f"Generating city description for {ciudad}...")
                generated['city_description'] = generate_city_description(ciudad)
                print(f"City description generated for {ciudad}")

            if 'stats' in missing:
                print(f"Generating stats for {universidad}...")
                # Build the stats input from every program of this university/city,
                # not only from the incomplete ones matched by the pipeline;
                # otherwise the stats would describe a subset of the programs.
                all_programas = collection.find(
                    {'universidad': universidad, 'ciudad': ciudad},
                    {'programa': 1, 'linea_investigacion': 1}
                )
                processed_programas = []
                for p in all_programas:
                    # The field may be absent or None on some records
                    lineas = p.get('linea_investigacion')
                    processed_programas.append({
                        'nombre': p.get('programa'),
                        'lineas_investigacion': lineas.split('\n\n') if lineas else []
                    })
                generated['stats'] = generate_stats(universidad, processed_programas)
                print(f"Stats generated for {universidad}")

            if 'university_summary' in missing:
                print(f"Generating university summary for {universidad} ({ciudad})...")
                generated['university_summary'] = generate_university_summary(universidad, ciudad)
                print(f"University summary generated for {universidad}")

            if 'university_description' in missing:
                print(f"Generating university description for {universidad} ({ciudad})...")
                generated['university_description'] = generate_university_description(universidad, ciudad)
                print(f"University description generated for {universidad}")

            # Process each program: fill in only what that program is missing
            for prog in programas:
                # Falsy generated values are skipped so a failed lookup does not
                # store an empty value.
                update_data = {
                    field: value
                    for field, value in generated.items()
                    if value and prog[field] is None
                }

                # Generate the summary only when there is source text for it
                linea_investigacion = prog.get('linea_investigacion')
                if prog['resumen'] is None and linea_investigacion:
                    print(f"Generating summary for: {prog.get('programa')}")
                    update_data['resumen'] = generate_summary(linea_investigacion)

                # The timestamp is always written, so every program is updated
                update_data['ultimo_enriquecimiento'] = datetime.now().isoformat()

                # prog['_id'] comes straight from the database, so it is used
                # as-is; re-wrapping it in ObjectId would break non-ObjectId ids.
                collection.update_one(
                    {'_id': prog['_id']},
                    {'$set': update_data}
                )
                total_updated += 1

        print(f"Data enrichment completed: {processed_universities} universities processed, {total_updated} programs updated.")

        return {
            'message': 'Data enrichment completed',
            'updated': total_updated,
            'universities': processed_universities,
            'timestamp': datetime.now().isoformat()
        }
    except Exception as error:
        print(f"Error enriching data: {error}")
        raise

## Initialize All Records

⚠️ **Warning**: This will re-initialize ALL records in the database. Use with caution.

In [None]:
# Full re-initialization overwrites the enrichment fields of ALL records, so it
# is opt-in: set RUN_FULL_ENRICHMENT to True and run this cell to trigger it.
# With the default (False) the cell is a no-op and is safe under "Run All".
RUN_FULL_ENRICHMENT = False

if RUN_FULL_ENRICHMENT:
    results = enrich_data()
    print(f"Enrichment completed: {results['universities']} universities and {results['updated']} programs processed.")

## Initialize Only Missing Records

This initializes only the records that are missing required fields.

In [12]:
# Enrich only the records that are missing required fields, then report the totals
results = enrich_missing_data()
print(
    f"Enrichment completed: {results['universities']} universities and {results['updated']} programs processed."
)

Connected to MongoDB
Created backup directory: /mnt/c/Users/HG_Co/OneDrive/Documents/Github/graduate-programs-evaluator/backups
Backup successful: 'programas' → '/mnt/c/Users/HG_Co/OneDrive/Documents/Github/graduate-programs-evaluator/backups\backup_programas_2025-06-06T22-29-37-134779.json' (167 documents)
Backup created at: /mnt/c/Users/HG_Co/OneDrive/Documents/Github/graduate-programs-evaluator/backups\backup_programas_2025-06-06T22-29-37-134779.json
Connected to MongoDB
Found 167 records needing updates
Starting enrichment for 47 universities with incomplete records...
Processing 1/47: Universidad de Castilla-La Mancha (Castilla-La Mancha)
Coordinates for Castilla-La Mancha: {'lat': 39.9908335, 'lon': -2.8052827}
Getting metrics for Castilla-La Mancha...
City metrics obtained for Castilla-La Mancha
Generating stats for Universidad de Castilla-La Mancha...
Stats generated for Universidad de Castilla-La Mancha
Generating university summary for Universidad de Castilla-La Mancha (Casti

## Check Database Status After Initialization

Verify that all records have been properly initialized.

In [None]:
# Run the missing-field check again now that initialization has finished
missing_data_after = check_missing_fields()

print(f"Records still missing any required field: {missing_data_after['total_missing_any']}")
display(missing_data_after['statistics'])

# Report success only when no record is missing a required field
print(
    "✅ All records have been successfully initialized!"
    if missing_data_after['total_missing_any'] == 0
    else "⚠️ Some records still need initialization."
)