In [113]:
# TODO: CREATE BlanchotWork OBJECT TO STANDARDIZE DATA STRUCTURE OF REPSECTIVE DATABASES (OA, CR, HAL, PP)
import json
import pandas as pd
from typing import List, Optional
from pydantic import BaseModel, HttpUrl, ValidationError

In [114]:
class Author(BaseModel):
    """A standardized representation of an author."""
    full_name: str
    given_name: Optional[str] = None
    family_name: Optional[str] = None

class BlanchotWork(BaseModel):
    """
    A comprehensive, standardized model for a single bibliographic record,
    designed to hold rich data from multiple sources.
    """
    # Core Identifiers
    doi: Optional[str] = None
    title: str
    
    # Author Details
    authors: List[Author]
    
    # Publication Details
    year: Optional[int] = None
    publication_date: Optional[str] = None
    journal_name: Optional[str] = None
    publisher: Optional[str] = None
    work_type: Optional[str] = None
    
    # Content and Context
    abstract: Optional[str] = None
    subjects: List[str] = []
    
    # Metrics and Links
    citation_count: Optional[int] = None
    source_url: HttpUrl # A direct link to the record in its original database
    
    # Metadata
    source_db: str # To track where it came from

In [115]:
def from_openalex_to_blanchotwork(work_data: dict) -> dict:
    """Translates a raw OpenAlex dictionary into our standard format."""
    authors = []
    for authorship in work_data.get('authorships', []):
        if author_info := authorship.get('author'):
            authors.append(Author(full_name=author_info.get('display_name', '')))

    subjects = [concept.get('display_name') for concept in work_data.get('concepts', []) if concept]
    journal = work_data.get('primary_location', {}).get('source', {}).get('display_name')

    return {
        'doi': work_data.get('doi'),
        'title': work_data.get('title', 'No Title Provided'),
        'authors': authors,
        'year': work_data.get('publication_year'),
        'publication_date': work_data.get('publication_date'),
        'journal_name': journal,
        'publisher': work_data.get('publisher'),
        'work_type': work_data.get('type'),
        'subjects': subjects,
        'source_url': work_data.get('id'),
        'source_db': 'OpenAlex'
    }

def from_crossref_to_blanchotwork(work_data: dict) -> dict:
    """Translates a raw Crossref dictionary into our standard format."""
    authors = []
    author_list = work_data.get('author') or [] 
    for author_info in author_list:
        given = author_info.get('given', '')
        family = author_info.get('family', '')
        authors.append(Author(full_name=f"{given} {family}".strip(), given_name=given, family_name=family))

    year = None
    if published := (work_data.get('published-print') or work_data.get('published-online')):
        if date_parts := published.get('date-parts', [[]]):
            year = date_parts[0][0] if date_parts[0] else None

    # --- ✅ FIX IS HERE ---
    # Safely extract the journal name by checking its type first
    journal_name = None
    container_title = work_data.get('container-title')
    if isinstance(container_title, list) and container_title:
        journal_name = container_title[0]
    # ----------------------

    doi = work_data.get('DOI')
    source_url = work_data.get('URL')
    if not source_url and doi:
        source_url = f"https://doi.org/{doi}"
    
    return {
        'doi': doi,
        'title': (work_data.get('title') or ['No Title Provided'])[0],
        'authors': authors,
        'year': year,
        'journal_name': journal_name, # Use the safely extracted name
        'publisher': work_data.get('publisher'),
        'work_type': work_data.get('type'),
        'subjects': work_data.get('subject', []),
        'source_url': source_url,
        'source_db': 'Crossref'
    }

def from_hal_to_blanchotwork(work_data: dict) -> dict:
    """Translates a raw HAL dictionary into our standard format."""
    authors = [Author(full_name=name) for name in work_data.get('authFullName_s', [])]

    return {
        'doi': work_data.get('doiId_s'), # HAL often stores DOI in this field
        'title': (work_data.get('title_s') or ['No Title Provided'])[0],
        'authors': authors,
        'year': work_data.get('publicationDateY_i'),
        'journal_name': work_data.get('journalTitle_s'),
        'work_type': work_data.get('docType_s'),
        'source_url': work_data.get('uri_s'),
        'source_db': 'HAL'
    }

In [116]:
with open('openalex/openalex_blanchot.json', mode='rt') as f:
    openalex_json = json.load(f)
    
with open('crossref/crossref_blanchot_filtered.json', mode='rt') as f:
    crossref_json = json.load(f)

with open('hal/hal_blanchot_data.json', mode='rt') as f:
    hal_json = json.load(f)

In [117]:
# TODO: COMPLETE TRANSLATOR FROM RESPECTIVE DATABASES TO BlanchotWork OBJECTS.

master_list: List[BlanchotWork] = []
failed_records = []

# --- Process OpenAlex Data ---
for work in openalex_json:
    try:
        # ✅ FIX: Check if the item is a string and parse it if necessary
        if isinstance(work, str):
            work_dict = json.loads(work)
        else:
            work_dict = work

        standardized_data = from_openalex_to_blanchotwork(work_dict)
        master_list.append(BlanchotWork.model_validate(standardized_data))
        
    except (ValidationError, json.JSONDecodeError) as e:
        # This now catches both Pydantic and JSON parsing errors
        failed_records.append({'source': 'OpenAlex', 'id': work.get('id') if isinstance(work, dict) else 'Unknown', 'error': str(e)})

# --- Process Crossref Data ---
for work in crossref_json:
    standardized_data = from_crossref_to_blanchotwork(work)
    try:
        master_list.append(BlanchotWork.model_validate(standardized_data))
    except ValidationError as e:
        failed_records.append({'source': 'Crossref', 'id': work.get('DOI'), 'error': str(e)})
        
# --- Process HAL Data ---
for work in hal_json:
    standardized_data = from_hal_to_blanchotwork(work)
    try:
        master_list.append(BlanchotWork.model_validate(standardized_data))
    except ValidationError as e:
        failed_records.append({'source': 'HAL', 'id': work.get('uri_s'), 'error': str(e)})

# --- Final Step: De-duplicate the master list ---
unique_works = {}
for work in master_list:
    if work.doi: # Use DOI as the primary key for de-duplication
        unique_works[work.doi.lower()] = work

final_master_list = list(unique_works.values())

print(f"✅ Synthesis complete. {len(final_master_list)} unique works.")

✅ Synthesis complete. 582 unique works.


In [118]:
records_to_save = [work.model_dump(mode='json') for work in final_master_list]
with open('blanchot_master_list.json', 'w', encoding='utf-8') as f:
    json.dump(records_to_save, f, indent=2)

In [119]:
import json
import pandas as pd

# Assume 'final_master_list' is your list of validated Pydantic 'BlanchotWork' objects.

# --- 1. Create the simplified list for quick review ---
# This step is done only once.
print("Creating simplified list for review...")
quick_review_list = [
    {
        "year": work.year,
        "title": work.title,
        "authors": [author.full_name for author in work.authors],
        "source_url": str(work.source_url)
    }
    for work in final_master_list
]
print(f"Created simplified list with {len(quick_review_list)} records.")


# --- 2. Save the list to a JSON file ---
# This preserves the list of authors within the JSON structure.
json_filename = 'blanchot_master_list_quick_review.json'
print(f"\nSaving data to JSON file: {json_filename}...")
with open(json_filename, 'w', encoding='utf-8') as f:
    json.dump(quick_review_list, f, indent=2)
print(f"✅ Successfully saved JSON file.")


# --- 3. Flatten the list and save it to a CSV file ---
# This converts the list of authors into a single string for the CSV.
print(f"\nFlattening data for CSV export...")
records_for_csv = []
for record in quick_review_list:
    records_for_csv.append({
        'year': record['year'],
        'title': record['title'],
        'authors': ', '.join(record['authors']), # Join the list into a single string
        'source_url': record['source_url']
    })

df = pd.DataFrame(records_for_csv)
csv_filename = 'blanchot_master_list_quick_review.csv'
df.to_csv(csv_filename, index=False, encoding='utf-8')
print(f"✅ Successfully saved flattened data to CSV file: {csv_filename}")

Creating simplified list for review...
Created simplified list with 582 records.

Saving data to JSON file: blanchot_master_list_quick_review.json...
✅ Successfully saved JSON file.

Flattening data for CSV export...
✅ Successfully saved flattened data to CSV file: blanchot_master_list_quick_review.csv


In [120]:
import json
import pandas as pd

print("Creating simplified list for review...")
quick_review_list = [
    {
        "year": work.year,
        "title": work.title,
        "authors": [author.full_name for author in work.authors],
        "source_url": str(work.source_url)
    }
    for work in final_master_list
]
print(f"Created simplified list with {len(quick_review_list)} records.")

json_filename = 'blanchot_master_list_quick_review.json'
print(f"\nSaving data to JSON file: {json_filename}...")
with open(json_filename, 'w', encoding='utf-8') as f:
    json.dump(quick_review_list, f, indent=2)
print(f"✅ Successfully saved JSON file.")

print(f"\nConsolidating book chapters for CSV export...")

df = pd.DataFrame(quick_review_list)

df['book_title'] = df['title'].str.split(':').str[0].str.strip()

consolidated_df = df.groupby(['book_title', 'year']).agg(
    authors=('authors', 'first'),
    source_url=('source_url', 'first'),
    chapters_found=('title', 'count')
).reset_index()

def create_final_title(row):
    if row['chapters_found'] > 1:
        return f"{row['book_title']} ({row['chapters_found']} chapters)"
    else:
        return row['book_title']

consolidated_df['title'] = consolidated_df.apply(create_final_title, axis=1)

consolidated_df['authors'] = consolidated_df['authors'].apply(lambda authors: ', '.join(authors))

final_df = consolidated_df[['year', 'title', 'authors', 'source_url', 'chapters_found']]
final_df = final_df.sort_values(by='year', ascending=False)

csv_filename = 'blanchot_master_list_consolidated.csv'
final_df.to_csv(csv_filename, index=False, encoding='utf-8')
print(f"✅ Successfully saved consolidated data to CSV file: {csv_filename}")

Creating simplified list for review...
Created simplified list with 582 records.

Saving data to JSON file: blanchot_master_list_quick_review.json...
✅ Successfully saved JSON file.

Consolidating book chapters for CSV export...
✅ Successfully saved consolidated data to CSV file: blanchot_master_list_consolidated.csv


In [124]:
import json
import pandas as pd
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, HttpUrl, ValidationError

# --- (Your Pydantic models and translator functions go here, unchanged) ---

# --- MAIN SCRIPT WORKFLOW with DIAGNOSTICS ---

# Load source files
with open('openalex/openalex_blanchot.json', mode='rt') as f: openalex_json = json.load(f)
with open('crossref/crossref_blanchot_filtered.json', mode='rt') as f: crossref_json = json.load(f)
with open('hal/hal_blanchot_data.json', mode='rt') as f: hal_json = json.load(f)

print("--- 📊 DIAGNOSTICS: Initial Record Count ---")
print(f"Loaded {len(openalex_json)} records from OpenAlex.")
print(f"Loaded {len(crossref_json)} records from Crossref.")
print(f"Loaded {len(hal_json)} records from HAL.")
print(f"Total raw records: {len(openalex_json) + len(crossref_json) + len(hal_json)}")
print("-" * 20)

master_list: List[BlanchotWork] = []
failed_records = []

# (Processing loops for OpenAlex, Crossref, and HAL are unchanged)
for source_data, translator, source_name in [(openalex_json, from_openalex_to_blanchotwork, "OpenAlex"), (crossref_json, from_crossref_to_blanchotwork, "Crossref"), (hal_json, from_hal_to_blanchotwork, "HAL")]:
    for work in source_data:
        try:
            work_dict = json.loads(work) if isinstance(work, str) else work
            standardized_data = translator(work_dict)
            master_list.append(BlanchotWork.model_validate(standardized_data))
        except (ValidationError, json.JSONDecodeError):
            continue

print(f"--- 📊 DIAGNOSTICS: After Validation ---")
print(f"Total records that passed validation: {len(master_list)}")
print("-" * 20)

# --- CONSOLIDATE CHAPTERS ---
print(f"Consolidating book chapters from {len(master_list)} total records...")
df = pd.DataFrame([work.model_dump() for work in master_list])

def get_grouping_id(row):
    if pd.notna(row.get('relation')) and 'is-part-of' in row['relation']:
        if parent_doi := row['relation']['is-part-of'][0].get('id'):
            return f"DOI:{parent_doi}"
    title_prefix = str(row['title']).split(':')[0].strip()
    return f"TITLE:{title_prefix}_{row['year']}"

df['grouping_id'] = df.apply(get_grouping_id, axis=1)

consolidated_df = df.groupby('grouping_id').agg(
    doi=('doi', 'first'), year=('year', 'first'), authors=('authors', 'first'),
    publisher=('publisher', 'first'), work_type=('work_type', 'first'),
    source_url=('source_url', 'first'), source_db=('source_db', 'first'),
    title=('title', lambda x: min(x, key=len)),
    chapters_found=('title', 'count')
).reset_index()

print(f"--- 📊 DIAGNOSTICS: After Chapter Consolidation ---")
print(f"Consolidation resulted in {len(consolidated_df)} unique works/books.")
print("-" * 20)

# --- FINAL DE-DUPLICATION and SAVE ---

# This line is correct
final_df = consolidated_df.dropna(subset=['doi']).drop_duplicates(subset=['doi'], keep='first')

print(f"--- 📊 DIAGNOSTICS: After Final De-duplication by DOI ---")
print(f"Final master list contains {len(final_df)} unique works.")
print("-" * 20)

# --- ✅ FIX: Convert columns with special objects to strings before saving ---
# This ensures all data is in a format that the json library can handle.
final_df['source_url'] = final_df['source_url'].astype(str)
# It's also good practice to convert the 'authors' column, as it contains Author objects
final_df['authors'] = final_df['authors'].astype(str)


# Now, convert the clean DataFrame to a list of dictionaries
records_to_save = final_df.to_dict('records')

# Save the final list to your JSON file
with open('blanchot_master_list.json', 'w', encoding='utf-8') as f:
    # This will now work because all special objects have been converted to strings
    json.dump(records_to_save, f, indent=2)

print(f"\n✅ Synthesis complete. Saved a master list with {len(records_to_save)} unique, consolidated works.")

--- 📊 DIAGNOSTICS: Initial Record Count ---
Loaded 4067 records from OpenAlex.
Loaded 582 records from Crossref.
Loaded 180 records from HAL.
Total raw records: 4829
--------------------
--- 📊 DIAGNOSTICS: After Validation ---
Total records that passed validation: 237
--------------------
Consolidating book chapters from 237 total records...
--- 📊 DIAGNOSTICS: After Chapter Consolidation ---
Consolidation resulted in 230 unique works/books.
--------------------
--- 📊 DIAGNOSTICS: After Final De-duplication by DOI ---
Final master list contains 51 unique works.
--------------------

✅ Synthesis complete. Saved a master list with 51 unique, consolidated works.
