In [None]:
!pip3 install scholarly

In [None]:
import scholarly

In [19]:
from scholarly import scholarly
import re
import requests
import unicodedata
from datetime import datetime
import time

def sanitize_key(text):
    """Reduce *text* to an ASCII-only string of word characters.

    The text is NFKD-normalized, every character that cannot be encoded as
    ASCII is dropped, and all remaining runs of non-word characters are
    removed (letters, digits and underscores are kept).
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode()
    return re.sub(r'\W+', '', ascii_only)

def extract_first_word(title):
    """Return the first word of *title* that survives key sanitization.

    Each word found in the title is passed through ``sanitize_key``; the
    first non-empty result is returned. A leading word that has no ASCII
    representation (for example a Greek letter) is skipped rather than
    contributing an empty string to the citation key. Returns "Untitled"
    when no word yields a usable value.
    """
    for word in re.findall(r'\b\w+\b', title):
        cleaned = sanitize_key(word)
        if cleaned:
            return cleaned
    return "Untitled"

def guess_entry_type(pub):
    """Pick a BibTeX entry type from the fields present in ``pub['bib']``.

    Returns 'article' when a 'journal' field exists, otherwise
    'inproceedings' when a 'title' field exists, otherwise 'misc'.
    A missing 'bib' mapping is treated as empty.
    """
    fields = pub.get('bib', {})
    # Checked in priority order: the first marker field present wins.
    for marker, entry_type in (('journal', 'article'), ('title', 'inproceedings')):
        if marker in fields:
            return entry_type
    return 'misc'

def fetch_booktitle_via_crossref(title, year, retries=7, delay=3):
    """Look up the container (venue) title of a work on CrossRef.

    Queries ``https://api.crossref.org/works`` for the single best title
    match and returns the first ``container-title`` of that item.

    Args:
        title: Title of the work to search for.
        year: Publication year. It is used as a date filter only when it is
            a 4-digit number; placeholders such as '????' are not sent,
            because an invalid filter would make every attempt fail.
        retries: Maximum number of attempts.
        delay: Base delay in seconds; the wait before attempt ``k + 1`` is
            ``delay * 2 ** k`` (exponential backoff).

    Returns:
        The container title, or "" when nothing was found or every attempt
        failed.
    """
    url = "https://api.crossref.org/works"
    params = {"query.title": title, "rows": 1}
    year_text = str(year).strip()
    if len(year_text) == 4 and year_text.isdigit():
        params["filter"] = f"from-pub-date:{year_text},until-pub-date:{year_text}"

    for attempt in range(retries):
        try:
            resp = requests.get(url, params=params, timeout=10)
            resp.raise_for_status()
            items = resp.json().get("message", {}).get("items", [])
        except (requests.RequestException, ValueError) as e:
            print(f"⚠️ CrossRef error (attempt {attempt+1}/{retries}) for '{title}': {e}")
            # A client error (4xx other than 429 "too many requests") will
            # not be cured by waiting, so stop instead of backing off.
            status = getattr(getattr(e, 'response', None), 'status_code', None)
            if status is not None and 400 <= status < 500 and status != 429:
                break
        else:
            if items:
                container = items[0].get("container-title", [])
                return container[0] if container else ""
            return ""
        if attempt < retries - 1:
            time.sleep(delay * (2 ** attempt))  # exponential backoff
    return ""

def format_bibtex(pub, entry_type):
    """Render one publication as a BibTeX entry string.

    Args:
        pub: Publication mapping whose 'bib' sub-mapping may provide
            'title', 'author', 'pub_year', 'pub_month', 'journal' and
            'booktitle'.
        entry_type: BibTeX entry type to emit ('article', 'inproceedings'
            or 'misc').

    Returns:
        The entry text, terminated by a newline. The citation key is
        ``<first author surname><year><first title word>``.
    """
    bib = pub.get('bib', {})
    title = bib.get('title', 'Unknown Title')
    authors = bib.get('author', 'Unknown Author')
    year = bib.get('pub_year', '????')
    # NOTE(review): when 'pub_month' is absent this records the month in
    # which the script runs, not the publication month -- confirm intent.
    month = bib.get('pub_month', datetime.now().month)

    # Surname of the first author: the text before the comma for
    # "Last, First" names, otherwise the last whitespace-separated token.
    lead_author = re.split(r'\s+and\s+', authors.strip(), maxsplit=1)[0].strip()
    if ',' in lead_author:
        surname = lead_author.split(',')[0]
    else:
        surname = (lead_author.split() or [''])[-1]
    first_author = sanitize_key(surname) or 'Unknown'
    first_word = extract_first_word(title)
    key = f"{first_author}{year}{first_word}"

    lines = [f"@{entry_type}{{{key},",
             f"  author = {{{authors}}},",
             f"  title = {{{title}}},"]

    if entry_type == 'inproceedings':
        # Only conference entries use a booktitle, so the (slow, retried)
        # CrossRef lookup is skipped for every other entry type.
        booktitle = bib.get('booktitle', '') or fetch_booktitle_via_crossref(title, year)
        if booktitle:
            lines.append(f"  booktitle = {{{booktitle}}},")
    elif entry_type == 'article':
        journal = bib.get('journal', '')
        if journal:
            lines.append(f"  journal = {{{journal}}},")

    lines.extend([
        f"  month = {{{month}}},",
        f"  year = {{{year}}},",
        f"  bibtex_show = {{true}}",
        f"}}\n"
    ])
    return "\n".join(lines)

def fetch_and_save(user_id, output_file='scholar.bib', max_pubs=None):
    """Fetch an author's publications via scholarly and write them as BibTeX.

    Each publication is filled, classified with ``guess_entry_type`` and
    rendered with ``format_bibtex``. The output file groups the entries
    under '% === ARTICLE ===', '% === INPROCEEDINGS ===' and
    '% === MISC ===' headers; empty groups are omitted.

    Args:
        user_id: Author id passed to ``scholarly.search_author_id``.
        output_file: Path of the BibTeX file to write (UTF-8).
        max_pubs: Maximum number of publications to process; ``None`` means
            no limit and ``0`` processes nothing.
    """
    author = scholarly.search_author_id(user_id)
    author = scholarly.fill(author, sections=['publications'])
    entries = {'article': [], 'inproceedings': [], 'misc': []}

    for i, pub in enumerate(author['publications']):
        # Compare against None explicitly so that max_pubs=0 is honoured
        # as a limit instead of being mistaken for "unlimited".
        if max_pubs is not None and i >= max_pubs:
            break
        filled = scholarly.fill(pub)
        et = guess_entry_type(filled)
        bibtex = format_bibtex(filled, et)
        entries[et].append(bibtex)
        print(f"✔ {et}: {filled['bib'].get('title', '')}")

    with open(output_file, 'w', encoding='utf-8') as f:
        for et in ['article', 'inproceedings', 'misc']:
            if entries[et]:
                f.write(f"% === {et.upper()} ===\n\n")
                f.writelines(entries[et])

    print(f"\n✅ Saved {sum(len(v) for v in entries.values())} entries to {output_file}")

# Export every publication of this author id to the default output file.
fetch_and_save(user_id='1g1i1B4AAAAJ', max_pubs=None)

KeyboardInterrupt: 

In [35]:
import bibtexparser
import re, unicodedata, urllib.parse, requests, difflib, concurrent.futures

def sanitize_key(text):
    """Return *text* stripped down to ASCII word characters.

    Applies NFKD normalization, discards anything that has no ASCII
    encoding, then deletes every run of non-word characters.
    """
    normalized = unicodedata.normalize('NFKD', text)
    ascii_text = normalized.encode('ascii', 'ignore').decode()
    return re.sub(r'\W+', '', ascii_text)

def extract_first_word(title):
    """Return the first word of *title* that survives key sanitization.

    Each word found in the title is passed through ``sanitize_key``; the
    first non-empty result is returned, so a leading word without any ASCII
    representation no longer produces an empty key component. Returns
    "Untitled" when no word yields a usable value.
    """
    for word in re.findall(r'\b\w+\b', title):
        cleaned = sanitize_key(word)
        if cleaned:
            return cleaned
    return "Untitled"

def fetch_dblp_match_info(title, max_hits=5, threshold=0.85):
    """Search DBLP for *title* and return the best-matching record.

    Args:
        title: Publication title to search for.
        max_hits: Number of hits requested from the DBLP search API.
        threshold: Minimum ``difflib.SequenceMatcher`` ratio (0..1) between
            the normalized titles for a hit to be accepted.

    Returns:
        The ``info`` mapping of the best hit, or ``None`` when the title is
        blank, nothing matches well enough, or the lookup fails.
    """
    def _normalize(text):
        # BibTeX protective braces and a trailing period (DBLP titles
        # typically end with one) should not count against the similarity.
        return text.replace('{', '').replace('}', '').strip().rstrip('.').lower()

    if not title or not title.strip():
        return None  # nothing to search for; skip the network round-trip

    q = urllib.parse.quote(title)
    url = f"https://dblp.org/search/publ/api?q={q}&h={max_hits}&format=json"
    try:
        resp = requests.get(url, timeout=10)
        # Surface HTTP errors (e.g. rate limiting) as such instead of as an
        # opaque JSON decoding failure.
        resp.raise_for_status()
        hits = resp.json().get('result', {}).get('hits', {}).get('hit', [])
        print(f"🔍 DBLP search for '{title}' returned {len(hits)} hits")
        if not hits:
            return None
        wanted = _normalize(title)
        best_match = None
        best_score = 0
        for hit in hits:
            candidate_title = hit['info'].get('title', '')
            score = difflib.SequenceMatcher(None, _normalize(candidate_title), wanted).ratio()
            if score > best_score:
                best_score = score
                best_match = hit
        if best_match and best_score >= threshold:
            return best_match['info']
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError) as e:
        # Best-effort lookup: network, decoding and malformed-payload
        # errors are reported and treated as "no match".
        print(f"❌ DBLP lookup failed for '{title}': {e}")
    return None

def format_inproceedings_custom(entry, match_info):
    """Render *entry* as an ``@inproceedings`` record enriched from DBLP.

    Args:
        entry: Mapping with the source 'title', 'author' and 'year' fields.
        match_info: DBLP ``info`` mapping; 'venue', 'booktitle', 'ee' and
            'key' are read from it.

    Returns:
        The BibTeX entry text. The citation key is
        ``<first author surname><year><first title word>``; ``month`` and
        ``eventdate`` are fixed placeholder values.
    """
    title = entry.get('title', 'Unknown Title')
    authors = entry.get('author', 'Unknown Author')
    year = entry.get('year', '????')
    month = '7'  # guess or override as needed

    # Surname of the first author: the text before the comma for
    # "Last, First" names, otherwise the last whitespace-separated token.
    # Splitting on any whitespace around "and" also covers author fields
    # that are wrapped over several lines.
    lead_author = re.split(r'\s+and\s+', authors.strip(), maxsplit=1)[0].strip()
    if ',' in lead_author:
        surname = lead_author.split(',')[0]
    else:
        surname = (lead_author.split() or [''])[-1]
    first_author = sanitize_key(surname) or 'Unknown'
    first_word = extract_first_word(title)
    key = f"{first_author}{year}{first_word}"

    venue = match_info.get('venue', '')
    if isinstance(venue, (list, tuple)):
        # Defensive: tolerate a list-valued venue by joining its parts.
        venue = ', '.join(str(part) for part in venue)
    booktitle = match_info.get('booktitle', '') or venue
    ee = match_info.get('ee', '')
    dblp_key = match_info.get('key', '')
    # The second path segment of the DBLP key is used as the abbreviation.
    conf_abbr = dblp_key.split('/')[1].upper() if '/' in dblp_key else 'CONF'
    eventdate = f"{year}-07-19"

    return f"""@inproceedings{{{key},
  author = {{{authors}}},
  booktitle = {{{booktitle}}},
  title = {{{title}}},
  venue = {{{venue}}},
  eventdate = {{{eventdate}}},
  month = {{{month}}},
  year = {{{year}}},
  openreview = {{{ee}}},
  bibtex_show = {{true}},
  ABBR = {{{conf_abbr}}}
}}\n"""

def format_article(entry):
    """Render *entry* as an ``@article`` record.

    Reads 'title', 'author', 'year' and 'journal' from *entry*. The ``ABBR``
    field is the upper-cased first word of the journal name, or "JOUR" when
    the journal is missing or blank. ``month`` is a fixed placeholder.
    """
    title = entry.get('title', 'Unknown')
    authors = entry.get('author', 'Unknown Author')
    year = entry.get('year', '????')
    journal = entry.get('journal', '')
    month = '7'
    journal_words = journal.split()
    abbr = journal_words[0].upper() if journal_words else "JOUR"

    # Surname of the first author: the text before the comma for
    # "Last, First" names, otherwise the last whitespace-separated token.
    lead_author = re.split(r'\s+and\s+', authors.strip(), maxsplit=1)[0].strip()
    if ',' in lead_author:
        surname = lead_author.split(',')[0]
    else:
        surname = (lead_author.split() or [''])[-1]
    key = f"{sanitize_key(surname) or 'Unknown'}{year}{extract_first_word(title)}"

    return f"""@article{{{key},
  author = {{{authors}}},
  title = {{{title}}},
  journal = {{{journal}}},
  month = {{{month}}},
  year = {{{year}}},
  bibtex_show = {{true}},
  ABBR = {{{abbr}}}
}}\n"""

def format_misc(entry):
    """Render *entry* as a ``@misc`` record.

    Reads 'title', 'author' and 'year' from *entry*; ``month`` is a fixed
    placeholder and ``ABBR`` is always "MISC".
    """
    title = entry.get('title', 'Unknown')
    authors = entry.get('author', 'Unknown Author')
    year = entry.get('year', '????')
    month = '7'

    # Surname of the first author: the text before the comma for
    # "Last, First" names, otherwise the last whitespace-separated token.
    lead_author = re.split(r'\s+and\s+', authors.strip(), maxsplit=1)[0].strip()
    if ',' in lead_author:
        surname = lead_author.split(',')[0]
    else:
        surname = (lead_author.split() or [''])[-1]
    key = f"{sanitize_key(surname) or 'Unknown'}{year}{extract_first_word(title)}"
    return f"""@misc{{{key},
  author = {{{authors}}},
  title = {{{title}}},
  month = {{{month}}},
  year = {{{year}}},
  bibtex_show = {{true}},
  ABBR = {{MISC}}
}}\n"""

def process_entry(entry):
    """Classify one parsed entry and render it.

    Returns a ``(kind, bibtex_text)`` pair. Entries with a 'journal' field
    become articles; otherwise the title is looked up on DBLP and the entry
    becomes 'inproceedings' on a match, 'misc' when there is none.
    """
    if 'journal' in entry:
        return 'article', format_article(entry)

    dblp_info = fetch_dblp_match_info(entry.get('title', ''))
    if not dblp_info:
        return 'misc', format_misc(entry)
    return 'inproceedings', format_inproceedings_custom(entry, dblp_info)

def process_bib_file(input_file='css.bib', output_file='converted.bib'):
    """Convert a BibTeX file into the custom grouped format.

    Every entry of *input_file* is handled by ``process_entry`` on a pool of
    10 threads. The rendered entries are written to *output_file* grouped
    under INPROCEEDINGS, ARTICLES and MISC headers, in that order; empty
    groups are omitted.
    """
    with open(input_file, 'r', encoding='utf-8') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    entries = bib_database.entries
    print(f"📚 Loaded {len(bib_database.entries)} entries from {input_file}")

    # The per-entry work is dominated by DBLP requests, so run it on threads.
    print("🚀 Processing entries with DBLP in parallel...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        results = list(executor.map(process_entry, entries))

    # Bucket the rendered text by kind; anything unrecognised counts as misc.
    inprocs, articles, miscs = [], [], []
    buckets = {'article': articles, 'inproceedings': inprocs}
    for typ, entry in results:
        buckets.get(typ, miscs).append(entry)

    sections = (
        ("% === INPROCEEDINGS ===\n\n", inprocs),
        ("% === ARTICLES ===\n\n", articles),
        ("% === MISC ===\n\n", miscs),
    )
    with open(output_file, 'w', encoding='utf-8') as f:
        for header, items in sections:
            if items:
                f.write(header)
                f.writelines(items)

    print(f"\n✅ Written to {output_file}: {len(inprocs)} inproceedings, {len(articles)} articles, {len(miscs)} misc")

# Convert css.bib and write the grouped result to converted.bib.
process_bib_file(input_file='css.bib', output_file='converted.bib')

📚 Loaded 292 entries from css.bib
🚀 Processing entries with DBLP in parallel...
🔍 DBLP search for 'Analog Communication' returned 5 hits
🔍 DBLP search for 'IEEE Signal Processing Society' returned 5 hits
🔍 DBLP search for 'AN EFFICIENT FULLY CONNECTED NEURAL NETWORK FOR MICROANEURYSM DETECTION FROM RETINAL FUNDUS IMAGES' returned 0 hits
🔍 DBLP search for 'Spider GAN: Leveraging Friendly Neighbors to Accelerate GAN Training Supporting Document' returned 0 hits
🔍 DBLP search for 'Classification of Abnormalities in WCE' returned 0 hits
🔍 DBLP search for 'Teaching a GAN What Not to Learn (Supplementary Material)' returned 0 hits
🔍 DBLP search for 'Denoising Enhances Visualization of Optical Coherence Tomography Images' returned 0 hits
🔍 DBLP search for 'SAMIR: SPARSITY AMPLIFIED BEAMFORMING FOR HIGH-RESOLUTION ULTRASOUND IMAGING' returned 0 hits
🔍 DBLP search for 'High Precision Target Localization Using a Sub-Nyquist Super-Resolution Radar' returned 0 hits
🔍 DBLP search for 'Accelerated D

In [38]:
import bibtexparser
import re, unicodedata, urllib.parse, requests, difflib, concurrent.futures

def sanitize_key(text):
    """Return *text* stripped down to ASCII word characters.

    Applies NFKD normalization, discards anything that has no ASCII
    encoding, then deletes every run of non-word characters.
    """
    normalized = unicodedata.normalize('NFKD', text)
    ascii_text = normalized.encode('ascii', 'ignore').decode()
    return re.sub(r'\W+', '', ascii_text)

def extract_first_word(title):
    """Return the first word of *title* that survives key sanitization.

    Each word found in the title is passed through ``sanitize_key``; the
    first non-empty result is returned, so a leading word without any ASCII
    representation no longer produces an empty key component. Returns
    "Untitled" when no word yields a usable value.
    """
    for word in re.findall(r'\b\w+\b', title):
        cleaned = sanitize_key(word)
        if cleaned:
            return cleaned
    return "Untitled"

def fetch_dblp_match_info(title, max_hits=5, threshold=0.85):
    """Search DBLP for *title* and return the best-matching record.

    Args:
        title: Publication title to search for.
        max_hits: Number of hits requested from the DBLP search API.
        threshold: Minimum ``difflib.SequenceMatcher`` ratio (0..1) between
            the normalized titles for a hit to be accepted.

    Returns:
        The ``info`` mapping of the best hit, or ``None`` when the title is
        blank, nothing matches well enough, or the lookup fails.
    """
    def _normalize(text):
        # BibTeX protective braces and a trailing period (DBLP titles
        # typically end with one) should not count against the similarity.
        return text.replace('{', '').replace('}', '').strip().rstrip('.').lower()

    if not title or not title.strip():
        return None  # nothing to search for; skip the network round-trip

    q = urllib.parse.quote(title)
    url = f"https://dblp.org/search/publ/api?q={q}&h={max_hits}&format=json"
    try:
        resp = requests.get(url, timeout=10)
        # Surface HTTP errors (e.g. rate limiting) as such instead of as an
        # opaque JSON decoding failure.
        resp.raise_for_status()
        hits = resp.json().get('result', {}).get('hits', {}).get('hit', [])
        print(f"🔍 DBLP search for '{title}' returned {len(hits)} hits")
        if not hits:
            return None
        wanted = _normalize(title)
        best_match = None
        best_score = 0
        for hit in hits:
            candidate_title = hit['info'].get('title', '')
            score = difflib.SequenceMatcher(None, _normalize(candidate_title), wanted).ratio()
            if score > best_score:
                best_score = score
                best_match = hit
        if best_match and best_score >= threshold:
            return best_match['info']
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError) as e:
        # Best-effort lookup: network, decoding and malformed-payload
        # errors are reported and treated as "no match".
        print(f"❌ DBLP lookup failed for '{title}': {e}")
    return None

def format_inproceedings_custom(entry, match_info):
    """Render *entry* as an ``@inproceedings`` record enriched from DBLP.

    Args:
        entry: Mapping with the source 'title', 'author' and 'year' fields.
        match_info: DBLP ``info`` mapping; 'venue', 'booktitle', 'ee' and
            'key' are read from it.

    Returns:
        The BibTeX entry text. The citation key is
        ``<first author surname><year><first title word>``; ``month`` and
        ``eventdate`` are fixed placeholder values.
    """
    title = entry.get('title', 'Unknown Title')
    authors = entry.get('author', 'Unknown Author')
    year = entry.get('year', '????')
    month = '7'  # guess or override as needed

    # Surname of the first author: the text before the comma for
    # "Last, First" names, otherwise the last whitespace-separated token.
    # Splitting on any whitespace around "and" also covers author fields
    # that are wrapped over several lines.
    lead_author = re.split(r'\s+and\s+', authors.strip(), maxsplit=1)[0].strip()
    if ',' in lead_author:
        surname = lead_author.split(',')[0]
    else:
        surname = (lead_author.split() or [''])[-1]
    first_author = sanitize_key(surname) or 'Unknown'
    first_word = extract_first_word(title)
    key = f"{first_author}{year}{first_word}"

    venue = match_info.get('venue', '')
    if isinstance(venue, (list, tuple)):
        # Defensive: tolerate a list-valued venue by joining its parts.
        venue = ', '.join(str(part) for part in venue)
    booktitle = match_info.get('booktitle', '') or venue
    ee = match_info.get('ee', '')
    dblp_key = match_info.get('key', '')
    # The second path segment of the DBLP key is used as the abbreviation.
    conf_abbr = dblp_key.split('/')[1].upper() if '/' in dblp_key else 'CONF'
    eventdate = f"{year}-07-19"

    return f"""@inproceedings{{{key},
  author = {{{authors}}},
  booktitle = {{{booktitle}}},
  title = {{{title}}},
  venue = {{{venue}}},
  eventdate = {{{eventdate}}},
  month = {{{month}}},
  year = {{{year}}},
  openreview = {{{ee}}},
  bibtex_show = {{true}},
  ABBR = {{{conf_abbr}}}
}}\n"""

def format_article(entry):
    """Render *entry* as an ``@article`` record.

    Reads 'title', 'author', 'year' and 'journal' from *entry*. The ``ABBR``
    field is the upper-cased first word of the journal name, or "JOUR" when
    the journal is missing or blank. ``month`` is a fixed placeholder.
    """
    title = entry.get('title', 'Unknown')
    authors = entry.get('author', 'Unknown Author')
    year = entry.get('year', '????')
    journal = entry.get('journal', '')
    month = '7'
    journal_words = journal.split()
    abbr = journal_words[0].upper() if journal_words else "JOUR"

    # Surname of the first author: the text before the comma for
    # "Last, First" names, otherwise the last whitespace-separated token.
    lead_author = re.split(r'\s+and\s+', authors.strip(), maxsplit=1)[0].strip()
    if ',' in lead_author:
        surname = lead_author.split(',')[0]
    else:
        surname = (lead_author.split() or [''])[-1]
    key = f"{sanitize_key(surname) or 'Unknown'}{year}{extract_first_word(title)}"

    return f"""@article{{{key},
  author = {{{authors}}},
  title = {{{title}}},
  journal = {{{journal}}},
  month = {{{month}}},
  year = {{{year}}},
  bibtex_show = {{true}},
  ABBR = {{{abbr}}}
}}\n"""

def format_misc(entry):
    """Render *entry* as a ``@misc`` record.

    Reads 'title', 'author' and 'year' from *entry*; ``month`` is a fixed
    placeholder and ``ABBR`` is always "MISC".
    """
    title = entry.get('title', 'Unknown')
    authors = entry.get('author', 'Unknown Author')
    year = entry.get('year', '????')
    month = '7'

    # Surname of the first author: the text before the comma for
    # "Last, First" names, otherwise the last whitespace-separated token.
    lead_author = re.split(r'\s+and\s+', authors.strip(), maxsplit=1)[0].strip()
    if ',' in lead_author:
        surname = lead_author.split(',')[0]
    else:
        surname = (lead_author.split() or [''])[-1]
    key = f"{sanitize_key(surname) or 'Unknown'}{year}{extract_first_word(title)}"
    return f"""@misc{{{key},
  author = {{{authors}}},
  title = {{{title}}},
  month = {{{month}}},
  year = {{{year}}},
  bibtex_show = {{true}},
  ABBR = {{MISC}}
}}\n"""

# Distinct 'type' values seen across all processed entries.
file = set()


def process_entry(entry):
    """Classify and render one entry while recording its 'type' field.

    The entry's 'type' value ("" when absent) is printed and added to the
    module-level ``file`` set. Returns a ``(kind, bibtex_text)`` pair:
    'article' when a 'journal' field exists, 'inproceedings' when DBLP
    yields a match for the title, otherwise 'misc'.
    """
    print(entry.get("type" , ""))
    file.add(entry.get("type" , ""))

    if 'journal' in entry:
        return 'article', format_article(entry)

    dblp_info = fetch_dblp_match_info(entry.get('title', ''))
    if not dblp_info:
        return 'misc', format_misc(entry)
    return 'inproceedings', format_inproceedings_custom(entry, dblp_info)

def process_bib_file(input_file='css.bib', output_file='converted.bib'):
    """Diagnostic run: process every entry of *input_file* without saving.

    Each entry is passed to ``process_entry`` on a pool of 10 threads. The
    results are collected but not used, and *output_file* is never written:
    the grouping and writing steps below are commented out.
    """
    with open(input_file, 'r', encoding='utf-8') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    entries = bib_database.entries
    print(f"📚 Loaded {len(bib_database.entries)} entries from {input_file}")

    # The per-entry work is dominated by DBLP requests, so run it on threads.
    print("🚀 Processing entries with DBLP in parallel...")
    worker_pool = concurrent.futures.ThreadPoolExecutor(max_workers=10)
    with worker_pool as executor:
        results = list(executor.map(process_entry, entries))

    # NOTE: output generation is disabled in this variant.
    # # Collect formatted BibTeX
    # articles, inprocs, miscs = [], [], []
    # for typ, entry in results:
    #     if typ == 'article':
    #         articles.append(entry)
    #     elif typ == 'inproceedings':
    #         inprocs.append(entry)
    #     else:
    #         miscs.append(entry)

    # with open(output_file, 'w', encoding='utf-8') as f:
    #     if inprocs:
    #         f.write("% === INPROCEEDINGS ===\n\n")
    #         f.writelines(inprocs)
    #     if articles:
    #         f.write("% === ARTICLES ===\n\n")
    #         f.writelines(articles)
    #     if miscs:
    #         f.write("% === MISC ===\n\n")
    #         f.writelines(miscs)

    # print(f"\n✅ Written to {output_file}: {len(inprocs)} inproceedings, {len(articles)} articles, {len(miscs)} misc")

# Process css.bib; converted.bib is passed as the output path.
process_bib_file(input_file='css.bib', output_file='converted.bib')

📚 Loaded 292 entries from css.bib
🚀 Processing entries with DBLP in parallel...
Journal article
Journal article
Journal article
Journal article
Journal article
Conference paper
Conference paper
Conference paper
Journal article
Journal article
Journal article
Journal article
Journal article
Journal article
Journal article
Journal article
Conference paper
Journal article
Journal article
Conference paper
Journal article
Conference paper
Journal article
Journal article
Journal article
Conference paper
Journal article
Journal article
Conference paper

Journal article
Journal article
Journal article
Journal article
Conference paper
Patent
Journal article
Journal article
Conference paper
Journal article
Preprint
Journal article
Conference paper
Conference paper
Journal article
Journal article
Journal article
Journal article
Journal article
Conference paper
Conference paper
Journal article
Conference paper
Conference paper
Journal article
Preprint
Preprint
Conference paper
Journal article
Conf

In [None]:
import re
import unicodedata
import difflib
import urllib.parse
import requests
import bibtexparser
from bibtexparser.bparser import BibTexParser
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict

def sanitize_key(text):
    """Return *text* stripped down to ASCII word characters.

    Applies NFKD normalization, discards anything that has no ASCII
    encoding, then deletes every run of non-word characters.
    """
    normalized = unicodedata.normalize('NFKD', text)
    ascii_text = normalized.encode('ascii', 'ignore').decode()
    return re.sub(r'\W+', '', ascii_text)

def extract_first_word(title):
    """Return the first word of *title* that survives key sanitization.

    Each word found in the title is passed through ``sanitize_key``; the
    first non-empty result is returned, so a leading word without any ASCII
    representation no longer produces an empty key component. Returns
    "Untitled" when no word yields a usable value.
    """
    for word in re.findall(r'\b\w+\b', title):
        cleaned = sanitize_key(word)
        if cleaned:
            return cleaned
    return "Untitled"

def guess_bibtex_type(entry_type):
    """Map a source 'type' label to a BibTeX entry type.

    The label is compared case-insensitively with surrounding whitespace
    ignored; ``None`` is treated as an empty label. "journal article",
    "conference paper", "book" and "book chapter" map to 'article',
    'inproceedings', 'book' and 'inbook'. Every other label (preprint,
    patent, abstract, empty, unknown) maps to 'misc'.
    """
    label = (entry_type or "").strip().lower()
    known_types = {
        "journal article": "article",
        "conference paper": "inproceedings",
        "book": "book",
        "book chapter": "inbook",
    }
    return known_types.get(label, "misc")

def fetch_dblp_match_info(title, max_hits=5, threshold=0.85):
    """Search DBLP for *title* and return the best-matching record.

    Args:
        title: Publication title to search for.
        max_hits: Number of hits requested from the DBLP search API.
        threshold: Minimum ``difflib.SequenceMatcher`` ratio (0..1) between
            the normalized titles for a hit to be accepted.

    Returns:
        The ``info`` mapping of the best hit, or ``None`` when the title is
        blank, nothing matches well enough, or the lookup fails.
    """
    def _normalize(text):
        # BibTeX protective braces and a trailing period (DBLP titles
        # typically end with one) should not count against the similarity.
        return text.replace('{', '').replace('}', '').strip().rstrip('.').lower()

    if not title or not title.strip():
        return None  # nothing to search for; skip the network round-trip

    q = urllib.parse.quote(title)
    url = f"https://dblp.org/search/publ/api?q={q}&h={max_hits}&format=json"
    try:
        resp = requests.get(url, timeout=10)
        # Surface HTTP errors (e.g. rate limiting) as such instead of as an
        # opaque JSON decoding failure.
        resp.raise_for_status()
        hits = resp.json().get('result', {}).get('hits', {}).get('hit', [])
        if not hits:
            return None
        wanted = _normalize(title)
        # Score every hit once and keep the highest-scoring one.
        scored = [
            (difflib.SequenceMatcher(None, _normalize(h['info'].get('title', '')), wanted).ratio(), h)
            for h in hits
        ]
        score, best_match = max(scored, key=lambda pair: pair[0])
        return best_match['info'] if score >= threshold else None
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError) as e:
        # Best-effort lookup: network, decoding and malformed-payload
        # errors are reported and treated as "no match".
        print(f"\U0001f50d DBLP lookup failed for '{title}': {e}")
    return None

def format_entry(entry, dblp_info=None):
    """Render one parsed entry as BibTeX text.

    Args:
        entry: Mapping with the source fields; 'type' selects the BibTeX
            entry type via ``guess_bibtex_type``.
        dblp_info: Optional DBLP ``info`` mapping. It is used for
            inproceedings entries only: 'venue', 'ee' and 'key' fill the
            venue-related fields and 'year' back-fills a missing year.

    Returns:
        A ``(year, bibtex_text)`` pair. The citation key is
        ``<first author surname><year><first title word>``.
    """
    entry_type = guess_bibtex_type(entry.get('type', ''))
    title = entry.get('title', 'Unknown Title')
    authors = entry.get('author', 'Unknown Author')
    year = str(entry.get('year', '????'))
    # Resolve the year before building the key so that a year taken from
    # DBLP appears in both the key and the year field.
    if entry_type == "inproceedings" and year == "????" and dblp_info:
        year = str(dblp_info.get('year', '????'))

    # Surname of the first author: the text before the comma for
    # "Last, First" names, otherwise the last whitespace-separated token.
    lead_author = re.split(r'\s+and\s+', authors.strip(), maxsplit=1)[0].strip()
    if ',' in lead_author:
        surname = lead_author.split(',')[0]
    else:
        surname = (lead_author.split() or [''])[-1]
    first_author = sanitize_key(surname) or 'Unknown'
    first_word = extract_first_word(title)
    key = f"{first_author}{year}{first_word}"

    if entry_type == "article":
        journal = entry.get('journal', '')
        return year, f"""@article{{{key},
  author = {{{authors}}},
  title = {{{title}}},
  journal = {{{journal}}},
  year = {{{year}}}
}}\n"""

    elif entry_type == "inproceedings":
        venue = dblp_info.get('venue', '') if dblp_info else ''
        if isinstance(venue, (list, tuple)):
            # Defensive: tolerate a list-valued venue by joining its parts.
            venue = ', '.join(str(part) for part in venue)
        # Prefer the entry's own venue text and fall back to the DBLP venue,
        # so the booktitle is populated with or without a DBLP match.
        booktitle = entry.get('journal', '') or entry.get('booktitle', '') or venue
        ee = dblp_info.get('ee', '') if dblp_info else ''
        dblp_key = dblp_info.get('key', '') if dblp_info else ''
        # The second path segment of the DBLP key is used as the abbreviation.
        conf_abbr = dblp_key.split('/')[1].upper() if '/' in dblp_key else 'CONF'
        eventdate = f"{year}-07-19"
        return year, f"""@inproceedings{{{key},
  author = {{{authors}}},
  booktitle = {{{booktitle}}},
  title = {{{title}}},
  venue = {{{venue}}},
  eventdate = {{{eventdate}}},
  month = {{7}},
  year = {{{year}}},
  openreview = {{{ee}}},
  bibtex_show = {{true}},
  ABBR = {{{conf_abbr}}}
}}\n"""

    elif entry_type == "book":
        publisher = entry.get('publisher', '')
        return year, f"""@book{{{key},
  author = {{{authors}}},
  title = {{{title}}},
  publisher = {{{publisher}}},
  year = {{{year}}}
}}\n"""

    elif entry_type == "inbook":
        booktitle = entry.get('booktitle', '')
        pages = entry.get('pages', '')
        return year, f"""@inbook{{{key},
  author = {{{authors}}},
  title = {{{title}}},
  booktitle = {{{booktitle}}},
  pages = {{{pages}}},
  year = {{{year}}}
}}\n"""

    else:
        return year, f"""@misc{{{key},
  author = {{{authors}}},
  title = {{{title}}},
  year = {{{year}}}
}}\n"""

def process_entry(entry):
    """Render one parsed entry, consulting DBLP for conference papers.

    Returns the ``(year, bibtex_text)`` pair produced by ``format_entry``,
    which is the shape ``convert_bib_file`` unpacks. DBLP is queried only
    for entries that map to 'inproceedings', the one type whose output uses
    the DBLP data.
    """
    entry_type = guess_bibtex_type(entry.get('type', ''))
    dblp_info = None
    if entry_type == "inproceedings":
        dblp_info = fetch_dblp_match_info(entry.get('title', ''))
    return format_entry(entry, dblp_info)

def convert_bib_file(input_bib, output_bib):
    """Convert *input_bib* to the custom format, grouped by year.

    Entries are parsed with ``BibTexParser(common_strings=True)`` and
    rendered by ``process_entry`` on a pool of 10 threads. The output lists
    the years in descending string order, each under a '% === <year> ==='
    header, with a blank line after every entry.
    """
    with open(input_bib, 'r', encoding='utf-8') as f:
        parser = BibTexParser(common_strings=True)
        entries = parser.parse_file(f).entries

    with ThreadPoolExecutor(max_workers=10) as executor:
        formatted = list(executor.map(process_entry, entries))

    # Group the rendered entries by the year reported for each of them.
    sorted_by_year = defaultdict(list)
    for year, text in formatted:
        sorted_by_year[year].append(text)

    with open(output_bib, 'w', encoding='utf-8') as out:
        for year in sorted(sorted_by_year, reverse=True):
            out.write(f"% === {year} ===\n\n")
            out.writelines(text + '\n' for text in sorted_by_year[year])

    print(f"✅ Converted {len(entries)} entries and saved to {output_bib}")

# Example usage: convert filtered_css.bib into papers_css.bib.
convert_bib_file(input_bib="filtered_css.bib", output_bib="papers_css.bib")


✅ Converted 234 entries and saved to papers_css.bib
