In [3]:
# TODO:RUN SCRIPT WEEKLY W/ GITHUB BOT
import json
import pandas as pd
from tqdm import tqdm
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, HttpUrl, Field, ValidationError
from crossref.restful import Works



In [4]:
class DateParts(BaseModel):
    """Container for a ``date-parts`` value nested inside a work record."""

    # Optional list of lists of optional ints, populated from the input key
    # 'date-parts' (hyphenated, hence the alias).
    # NOTE(review): presumably each inner list is [year, month, day] with
    # trailing parts omitted — confirm against the Crossref schema.
    date_parts: Optional[List[List[Optional[int]]]] = Field(None, alias='date-parts')

class Author(BaseModel):
    """One entry of a work's ``author`` list; every field is optional."""

    # All four fields default to None when the key is absent from the input.
    given: Optional[str] = None
    family: Optional[str] = None
    # NOTE(review): presumably 'first' / 'additional' — confirm against Crossref.
    sequence: Optional[str] = None
    # Kept as untyped dicts; the affiliation entries are not validated further.
    affiliation: Optional[List[Dict[str, Any]]] = None

class CrossrefWorkModel(BaseModel):
    """Validation schema for a single work record returned by the Crossref query.

    ``DOI``, ``title``, ``publisher`` and ``type`` are required; a record
    missing any of them fails validation. ``author`` and ``published-print``
    are optional. Keys not declared here are kept on the model rather than
    rejected (``extra='allow'``), so they reappear in ``model_dump()``.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is the v1
    # style and is deprecated in v2 (which this notebook uses: model_validate /
    # model_dump). A plain dict is accepted for ``model_config``, so no extra
    # import is needed.
    #   extra='allow'          -> keep undeclared keys from the input
    #   populate_by_name=True  -> accept either the alias ('published-print')
    #                             or the field name ('published_print')
    model_config = {'extra': 'allow', 'populate_by_name': True}

    DOI: str
    title: List[str]
    author: Optional[List[Author]] = None
    publisher: str
    type: str
    published_print: Optional[DateParts] = Field(None, alias='published-print')

In [5]:
# --- Query Crossref ---------------------------------------------------------
works_api = Works()

# Bibliographic search for 'Blanchot', publication date from 1998 onwards,
# sorted by 'published' in ascending order.
works_query = works_api.query(bibliographic='Blanchot').filter(
    from_pub_date='1998'
).sort('published').order('asc')

validated_records = []
failed_records = []

# Defined up front so the file-creation cell always finds a DataFrame here,
# even when nothing was downloaded or the publisher column is missing.
# Otherwise that cell raises NameError on a fresh kernel, or silently reuses
# a stale `df_filtered` left over from an earlier run.
df_filtered = pd.DataFrame()

# --- Download + validate ----------------------------------------------------
try:
    for work_data in tqdm(works_query, total=works_query.count(), desc="Downloading"):
        try:
            validated_work = CrossrefWorkModel.model_validate(work_data)
            validated_records.append(validated_work)
        except ValidationError as e:
            # Keep going: a single malformed record should not abort the run.
            failed_records.append({'doi': work_data.get('DOI'), 'error': str(e)})
except Exception as e:
    # Best effort: report the failure and carry on with what was fetched.
    print(f"An unexpected error occurred during download: {e}")
    print("\nDownload stopped early; continuing with the records fetched so far.")
else:
    # Only claim completion when the iteration actually finished.
    print("\nDownload complete.")

print(f"Total validated records: {len(validated_records)}")
if failed_records:
    print(f"Total records that failed validation: {len(failed_records)}")


# --- De-duplicate on DOI (first occurrence wins) ----------------------------
unique_records = []
seen_dois = set()
duplicate_log = []

for work in validated_records:
    doi = work.DOI

    if doi not in seen_dois:
        unique_records.append(work)
        seen_dois.add(doi)
    else:
        duplicate_log.append({
            "DOI": doi,
            "title": work.title[0] if work.title else "No Title"
        })


print(f"Number of unique records after de-duplication: {len(unique_records)}")
print(f"Number of duplicate records found and removed: {len(duplicate_log)}")

# --- Post-download filtering by publisher name ------------------------------

if unique_records:
    # by_alias=True keeps the original hyphenated keys (e.g. 'published-print').
    df = pd.DataFrame([work.model_dump(by_alias=True) for work in unique_records])
    print(f"\nCreated DataFrame with {len(df)} records for filtering.")

    if 'publisher' in df.columns:
        # TODO:EXPAND LIST -- EXPANDED BELOW
        # NOTE: matching is a case-insensitive *substring* search, so generic
        # words such as 'Society' or 'Press' also admit non-humanities
        # publishers (e.g. "American Physical Society (APS)").
        academic_keywords = [
            # --- Core Disciplines & Theories ---
            'Philosophy', 'Philosophie', 'Filosofia', 'Filosofía',
            'Literature', 'Literary', 'Linguistics', 'Poetics',
            'Humanities', 'Theory', 'Critical', 'Deconstruction',
            'Phenomenology', 'Psychoanalysis', 'Aesthetics', 'Cultural Studies',

            # --- Institutional & Publisher Types ---
            'University Press', 'University', 'Press', 'Academic',
            'College', 'Institute', 'Institut', 'Centro', 'Centre',
            'Society', 'Société', 'Sociedad',

            # --- Publication Types (English) ---
            'Journal', 'Review', 'Studies', 'Quarterly', 'Annual', 'Annals',
            'Proceedings', 'Transactions', 'Bulletin', 'Archive', 'Yearbook',

            # --- Publication Types (Foreign Languages) ---
            # French
            'Revue', 'Cahiers', 'Études', 'Annales', 'Presses',
            # German
            'Zeitschrift', 'Kritik', 'Jahrbuch', 'Archiv', 'Verlag',
            # Italian
            'Rivista', 'Studi', 'Annali',
            # Spanish / Portuguese
            'Revista', 'Estudios', 'Anales',
            # Latin
            'Acta'
        ]

        # One alternation regex: a publisher matches if it contains any keyword.
        search_pattern = '|'.join(academic_keywords)

        df_filtered = df[df['publisher'].str.contains(search_pattern, case=False, na=False)]

        print(f"\nFound {len(df_filtered)} records from publishers matching academic keywords.")

        print("\n Sample")
        display(df_filtered[['DOI', 'title', 'publisher']].head())


    else:
        print("\nWarning: 'publisher' column not found in the downloaded data. Cannot perform filtering.")
        # Preserve the raw download so the run is not wasted.
        df.to_csv('crossref_blanchot_unfiltered.csv', index=False)

Downloading: 100%|██████████| 2405/2405 [02:23<00:00, 16.76it/s]



Download complete.
Total validated records: 2405
Number of unique records after de-duplication: 2405
Number of duplicate records found and removed: 0

Created DataFrame with 2405 records for filtering.

Found 1145 records from publishers matching academic keywords.

 Sample


Unnamed: 0,DOI,title,publisher
18,10.1103/physrevlett.80.1658,[Dynamics of Subpicosecond Relativistic Laser ...,American Physical Society (APS)
21,10.3828/ajfs.35.2.228,"[Transgression, Masochism and Subjectivity: th...",Liverpool University Press
24,10.17161/chimeres.v25i1.6161,[Maurice Blanchot: Littérature et ruine de l'é...,The University of Kansas
25,10.1093/fs/lii.4.488,[REVIEWS Maurice Blanchot et le déplacement d'...,Liverpool University Press
26,10.1103/physrevlett.81.4275,[Fuchs<i>et al.</i>Reply:],American Physical Society (APS)


In [6]:
# FILE CREATION
# The DataFrame can hold float NaN placeholders for keys a record does not
# have. json.dump would write those as bare `NaN` tokens, which are not valid
# JSON and are rejected by strict parsers, so map them to None (-> null).
# `value != value` is true only for NaN, which avoids importing math here.
records_saved = [
    {
        key: (None if isinstance(value, float) and value != value else value)
        for key, value in record.items()
    }
    for record in df_filtered.to_dict('records')
]

print(f"Saving {len(records_saved)} filtered records to JSON file...")
with open('crossref_blanchot_filtered.json', mode='w', encoding='utf-8') as f:
    # The file is UTF-8, so write accented characters as-is instead of
    # \uXXXX escapes; the parsed content is identical.
    json.dump(records_saved, f, indent=4, ensure_ascii=False)

print("File 'crossref_blanchot_filtered.json' created successfully.")

Saving 1145 filtered records to JSON file...
File 'crossref_blanchot_filtered.json' created successfully.


In [13]:
import pandas as pd
from tqdm import tqdm
from pydantic import ValidationError
from crossref.restful import Works
from models import CrossrefWorkModel

# Crossref works endpoint client.
works_api = Works()

# Bibliographic search for 'Blanchot', publication date from 1998 onwards,
# sorted by 'published' in ascending order. Read by get_cr_work().
works_query = (
    works_api
    .query(bibliographic='Blanchot')
    .filter(from_pub_date='1998')
    .sort('published')
    .order('asc')
)

def get_cr_work():
    """Download, validate, de-duplicate and publisher-filter the Crossref works.

    Iterates the module-level ``works_query``, validates every item with
    ``CrossrefWorkModel``, keeps the first record seen for each DOI, and then
    retains only rows whose ``publisher`` contains one of the academic
    keywords (case-insensitive substring match). Progress and summary counts
    are printed along the way.

    Returns:
        list[dict] | None: the filtered rows as ``DataFrame.to_dict('records')``,
        or ``None`` when the DataFrame has no ``publisher`` column (which is
        also the case when no record was downloaded).
    """
    accepted = []
    rejected = []

    # --- Download + validate ---
    try:
        progress = tqdm(works_query, total=works_query.count(), desc="Downloading")
        for raw_item in progress:
            try:
                accepted.append(CrossrefWorkModel.model_validate(raw_item))
            except ValidationError as err:
                # A malformed record is logged and skipped, not fatal.
                rejected.append({'doi': raw_item.get('DOI'), 'error': str(err)})
    except Exception as err:
        # Best effort: report and continue with whatever was fetched.
        print(f"An unexpected error occurred during download: {err}")

    print("\nDownload complete.")
    print(f"Total validated records: {len(accepted)}")
    if rejected:
        print(f"Total records that failed validation: {len(rejected)}")

    # --- De-duplicate on DOI (first occurrence wins) ---
    first_seen = []
    known_dois = set()
    repeats = []

    for item in accepted:
        doi = item.DOI
        if doi in known_dois:
            repeats.append({
                "DOI": doi,
                "title": item.title[0] if item.title else "No Title"
            })
            continue
        known_dois.add(doi)
        first_seen.append(item)

    print(f"Number of unique records after de-duplication: {len(first_seen)}")
    print(f"Number of duplicate records found and removed: {len(repeats)}")

    # --- Build the frame (by_alias keeps the hyphenated input keys) ---
    rows = [item.model_dump(by_alias=True) for item in first_seen]
    df = pd.DataFrame(rows)
    print(f"\nCreated DataFrame with {len(df)} records for filtering.")

    # Guard clause: nothing to filter on without a publisher column.
    if 'publisher' not in df.columns:
        print("\nWarning: 'publisher' column not found in the downloaded data. Cannot perform filtering.")
        return None

    # TODO:EXPAND LIST -- EXPANDED BELOW
    academic_keywords = [
        # --- Core Disciplines & Theories ---
        'Philosophy', 'Philosophie', 'Filosofia', 'Filosofía',
        'Literature', 'Literary', 'Linguistics', 'Poetics',
        'Humanities', 'Theory', 'Critical', 'Deconstruction',
        'Phenomenology', 'Psychoanalysis', 'Aesthetics', 'Cultural Studies',

        # --- Institutional & Publisher Types ---
        'University Press', 'University', 'Press', 'Academic',
        'College', 'Institute', 'Institut', 'Centro', 'Centre',
        'Society', 'Société', 'Sociedad',

        # --- Publication Types (English) ---
        'Journal', 'Review', 'Studies', 'Quarterly', 'Annual', 'Annals',
        'Proceedings', 'Transactions', 'Bulletin', 'Archive', 'Yearbook',

        # --- Publication Types (Foreign Languages) ---
        # French
        'Revue', 'Cahiers', 'Études', 'Annales', 'Presses',
        # German
        'Zeitschrift', 'Kritik', 'Jahrbuch', 'Archiv', 'Verlag',
        # Italian
        'Rivista', 'Studi', 'Annali',
        # Spanish / Portuguese
        'Revista', 'Estudios', 'Anales',
        # Latin
        'Acta'
    ]

    # One alternation regex: a publisher matches if it contains any keyword.
    search_pattern = '|'.join(academic_keywords)
    publisher_hits = df['publisher'].str.contains(search_pattern, case=False, na=False)
    return df[publisher_hits].to_dict('records')