In [11]:
import os
import re
from datetime import datetime
from dateutil import parser

### Extract information from abstract files

In [12]:
def extract_paper_details(file_content):
    """Extract the paper ID, title and date from the text of one abstract file.

    The text is searched for the first header line of each of these forms
    (each must start at the beginning of a line):

    * ``Paper: hep-th/<digits>`` -> the digits become the paper ID
    * ``Title: <text>``          -> the rest of that line becomes the title
    * ``Date: <text>``           -> the rest of that line becomes the date

    Only the remainder of the header line itself is captured; following
    lines are never joined to the value.

    Args:
        file_content: Full text of an abstract file.

    Returns:
        tuple: ``(paper_id, title, date)``. Each element is a stripped string,
        or ``None`` when the header is missing or its value is blank. A
        message is printed for every element that is ``None``.
    """
    # Use [ \t]* rather than \s* after "Title:" / "Date:": \s also matches a
    # newline, so a blank header would otherwise swallow the line break and
    # capture the *next* header line as its value.
    paper_id_match = re.search(r'^Paper:\s*hep-th/(\d+)', file_content, re.MULTILINE)
    title_match = re.search(r'^Title:[ \t]*(.*)', file_content, re.MULTILINE)
    date_match = re.search(r'^Date:[ \t]*(.*)', file_content, re.MULTILINE)

    paper_id = paper_id_match.group(1) if paper_id_match else None
    # "or None" normalises a present-but-blank header to None.
    title = (title_match.group(1).strip() or None) if title_match else None
    date = (date_match.group(1).strip() or None) if date_match else None

    # Debug output
    if not paper_id:
        print("No paper ID found")
    if not title:
        print("No title found")
    if not date:
        print("No date found")

    return paper_id, title, date

def process_year_folder(year_folder):
    """Collect paper details from every ``.abs`` file directly inside a folder.

    Each file whose name ends in ``.abs`` is read as UTF-8 (undecodable bytes
    are replaced) and passed to ``extract_paper_details``. Files for which the
    ID, title or date is missing are reported with a printed message and left
    out of the result.

    Args:
        year_folder: Path of the directory to scan (not recursive).

    Returns:
        list: ``(paper_id, title, date)`` tuples, ordered by file name.
    """
    paper_details = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and everything written from it) identical from one run to the next.
    for filename in sorted(os.listdir(year_folder)):
        if not filename.endswith('.abs'):
            continue
        file_path = os.path.join(year_folder, filename)
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            file_content = file.read()
        # The file is already closed here; parsing works on the text only.
        paper_id, title, date = extract_paper_details(file_content)
        if paper_id and title and date:
            paper_details.append((paper_id, title, date))
        else:
            print(f"Missing details in file: {filename}")
    return paper_details

def extractDetail(base_directory, output_file, start_year=1992, end_year=2003):
    """Write the details of all papers found in the per-year folders to one file.

    For every year from ``start_year`` to ``end_year`` (both inclusive) the
    folder ``<base_directory>/<year>`` is handed to ``process_year_folder``
    and one ``paper_id<TAB>title<TAB>date`` line is written per returned
    paper. A year whose folder does not exist is reported with a printed
    message and skipped.

    Args:
        base_directory: Directory that contains the year sub-folders.
        output_file: Path of the UTF-8 text file to write; an existing file
            is overwritten.
        start_year: First year folder to look for. Defaults to 1992.
        end_year: Last year folder to look for, inclusive. Defaults to 2003.
    """
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for year in range(start_year, end_year + 1):
            year_folder = os.path.join(base_directory, str(year))
            if not os.path.isdir(year_folder):
                print(f"Year folder not found: {year_folder}")
                continue
            for paper_id, title, date in process_year_folder(year_folder):
                # A tab inside a field would shift the columns of the
                # tab-separated output, so it is replaced by a space.
                fields = [str(value).replace('\t', ' ') for value in (paper_id, title, date)]
                outfile.write('\t'.join(fields) + '\n')


In [13]:
# Run the extraction step. With an empty base directory the year folders
# ('1992', '1993', ...) are looked up relative to the current working directory.
base_directory = ''  
# Tab-separated output: one "paper_id<TAB>title<TAB>date" line per paper.
output_file = 'paper_details.txt'
extractDetail(base_directory, output_file)

### Clean paper details

In [14]:
# Date format conversion

def format_full_date(date_str):
    """Convert a ctime-style timestamp to ``YYYY-MM-DD``.

    The first occurrence of ``<weekday> <month> <day> <HH:MM:SS> <year>`` in
    ``date_str`` is used, e.g. ``'Wed Dec 23 11:28:33 1992'`` -> ``'1992-12-23'``.
    The day may have one or two digits (ctime pads single-digit days with a
    space, as in ``'Tue Dec  1 09:05:00 1992'``) and is zero-padded in the
    result. Weekday and month are three letters each; the month abbreviation
    is matched case-insensitively.

    Args:
        date_str: Text that may contain such a timestamp.

    Returns:
        str: The date as ``YYYY-MM-DD``, or ``""`` when no timestamp is found
        or the month abbreviation is not an English month.
    """
    month_numbers = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
        'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
    }
    match = re.search(
        r'\b[A-Za-z]{3}\s+([A-Za-z]{3})\s+(\d{1,2})\s+\d{2}:\d{2}:\d{2}\s+(\d{4})\b',
        date_str,
    )
    if not match:
        return ""

    month_abbr, day, year = match.groups()
    month_number = month_numbers.get(month_abbr.capitalize())
    if month_number is None:
        # Report "unparseable" instead of inventing a January date.
        return ""

    return f"{year}-{month_number}-{day.zfill(2)}"

# Convert the date format from '1 September 1992 15:32:14 CDT' to '1992-09-01'
def format_full_month_date(date_str):
    """Convert a ``<day> <month name> <4-digit year>`` date to ``YYYY-MM-DD``.

    The first occurrence in ``date_str`` is used, e.g.
    ``'1 September 1992 15:32:14 CDT'`` -> ``'1992-09-01'``. The month may be
    a full English name or any prefix of it that is at least three letters
    long (``'Feb'``, ``'Sept'``), in any letter case, so ``'5 Feb 1992'``
    gives ``'1992-02-05'``.

    Args:
        date_str: Text that may contain such a date.

    Returns:
        str: The date as ``YYYY-MM-DD``, or ``""`` when no date is found or
        the month word is not an English month name.
    """
    month_names = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )
    match = re.search(r'\b(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\b', date_str)
    if not match:
        return ""

    day, month_name, year = match.groups()
    wanted = month_name.lower()
    # Three letters are the minimum that identifies a month unambiguously
    # ("ju" could be June or July, "ma" March or May).
    if len(wanted) < 3:
        return ""
    for number, full_name in enumerate(month_names, start=1):
        if full_name.startswith(wanted):
            return f"{year}-{number:02d}-{day.zfill(2)}"
    # Unknown month word: report "unparseable" rather than guessing January.
    return ""

def clean_date(date_str):
    """Normalise a free-form date string to ``YYYY-MM-DD``.

    Two layouts are recognised, tried in this order; the first occurrence in
    ``date_str`` wins and any surrounding text is ignored:

    1. ctime style, ``<weekday> <month> <day> <HH:MM:SS> <year>``, e.g.
       ``'Wed Dec 23 11:28:33 1992'`` or ``'Tue Dec  1 09:05:00 1992'``.
    2. Day first, ``<day> <month> <year>``, e.g. ``'1 September 1992'``,
       ``'Wed, 5 Feb 1992 10:43:12 EST'`` or ``'3 Jan 92'``.

    Month words are matched case-insensitively and may be a full English
    month name or a prefix of at least three letters. Two-digit years follow
    the POSIX convention: 69-99 -> 1969-1999, 00-68 -> 2000-2068.

    Args:
        date_str: Raw date text.

    Returns:
        str: The date as ``YYYY-MM-DD``, or ``""`` when no supported layout
        is found or the month word is not an English month name.
    """
    month_names = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )

    def month_number(name):
        """Return '01'..'12' for an English month name/abbreviation, else None."""
        wanted = name.lower()
        # Fewer than three letters cannot identify a month unambiguously.
        if len(wanted) < 3:
            return None
        for number, full_name in enumerate(month_names, start=1):
            if full_name.startswith(wanted):
                return f"{number:02d}"
        return None

    ctime_match = re.search(
        r'\b[A-Za-z]{3}\s+([A-Za-z]{3})\s+(\d{1,2})\s+\d{2}:\d{2}:\d{2}\s+(\d{4})\b',
        date_str,
    )
    if ctime_match:
        month_name, day, year = ctime_match.groups()
    else:
        # One pattern covers full and abbreviated month names with either a
        # four-digit or a two-digit year.
        day_first_match = re.search(
            r'\b(\d{1,2})\s+([A-Za-z]+)\s+(\d{4}|\d{2})\b', date_str
        )
        if not day_first_match:
            return ""
        day, month_name, year = day_first_match.groups()
        if len(year) == 2:
            year = f'19{year}' if int(year) >= 69 else f'20{year}'

    month = month_number(month_name)
    if month is None:
        # Report "unparseable" rather than defaulting to January.
        return ""
    return f"{year}-{month}-{day.zfill(2)}"

def process_line(line):
    """Return one tab-separated record with its date column normalised.

    The line is split on tabs. When it has at least three fields, the first
    three are stripped of surrounding whitespace, the third is passed through
    ``clean_date`` and the three values are joined with tabs again (any
    further fields are dropped). A line with fewer than three fields is
    returned unchanged apart from stripping surrounding whitespace.

    Args:
        line: One line of text, usually ``paper_id<TAB>title<TAB>date``.

    Returns:
        str: The rebuilt record, without a trailing newline.
    """
    fields = line.split('\t')
    if len(fields) >= 3:
        identifier, heading, raw_date = (field.strip() for field in fields[:3])
        return f"{identifier}\t{heading}\t{clean_date(raw_date)}"
    # Not enough columns: hand the text back as it is, minus outer whitespace.
    return line.strip()

def process_file(input_file, output_file):
    """Run ``process_line`` over every line of a file and save the results.

    The whole input is read and converted before the output is opened. The
    converted lines are written joined by newlines, with no newline after the
    last one. Both files are handled as UTF-8.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to write; overwritten if present.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        cleaned_rows = [process_line(raw_row) for raw_row in source]

    with open(output_file, 'w', encoding='utf-8') as target:
        target.write('\n'.join(cleaned_rows))


In [15]:
# Run the cleaning step: read the tab-separated paper details and write a copy
# in which each record has been passed through process_line.
input_file = 'paper_details.txt'  
# Note: this rebinds the notebook-level name `output_file`.
output_file = 'updated_paper_details.txt' 
process_file(input_file, output_file)

In [16]:
def split_paper_details(input_file, output_file_dates, output_file_titles):
    """Split a ``paper_id<TAB>title<TAB>date`` file into a dates file and a titles file.

    For every input line with at least three tab-separated fields, one
    ``paper_id<TAB>date`` line is written to ``output_file_dates`` and one
    ``paper_id<TAB>title`` line to ``output_file_titles``. Lines with fewer
    fields are reported with a printed warning and skipped. All three files
    are handled as UTF-8.

    Args:
        input_file: Path of the tab-separated file to read.
        output_file_dates: Path of the dates file to write (overwritten).
        output_file_titles: Path of the titles file to write (overwritten).
    """
    # Explicit UTF-8 everywhere: without it open() falls back to a
    # platform-dependent default encoding.
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file_dates, 'w', encoding='utf-8') as outfile_dates, \
         open(output_file_titles, 'w', encoding='utf-8') as outfile_titles:

        for line in infile:
            # NOTE(review): strip() also removes a trailing tab, so a record
            # whose date column is empty splits into two fields and is skipped
            # below -- confirm that dropping such records is intended.
            parts = line.strip().split('\t')

            # make sure each line contains 3 parts
            if len(parts) < 3:
                print(f"Warning: Skipping malformed line: {line.strip()}")
                continue

            paper_id, title, date = parts[:3]

            outfile_dates.write(f"{paper_id}\t{date}\n")
            outfile_titles.write(f"{paper_id}\t{title}\n")

# Run the split step on the tab-separated file with the cleaned dates.
# Note: this rebinds the notebook-level name `input_file`.
input_file = 'updated_paper_details.txt'
# Receives one "paper_id<TAB>date" line per accepted record.
output_file_dates = 'paper_dates.txt'
# Receives one "paper_id<TAB>title" line per accepted record.
output_file_titles = 'paper_titles.txt'

split_paper_details(input_file, output_file_dates, output_file_titles)
