In [11]:
import os
import re
import pandas as pd
from lxml import etree
from glob import glob
from pathlib import Path

In [12]:
# Directory that holds the sample XML files used by the exploratory cells.
folder = Path("data/nyt_xml_test")
# Directory listing order is filesystem-dependent; sort the paths so that
# positional picks such as xml_files[1] select the same file on every run.
xml_files = sorted(folder.glob("*.xml"))

In [None]:
# TEST: parse one sample file so its structure can be explored interactively.
sample_file = xml_files[1]  # second entry of xml_files; needs at least two files
print(f"Reading {sample_file}")

tree = etree.parse(sample_file)
root = tree.getroot()  # root element; the FullText exploration cell below reads it

Reading data/nyt_xml_test/91676921.xml


In [14]:
def clean_whitespace(s: str) -> str:
    """Collapse each run of whitespace in ``s`` to one space and trim both ends.

    ``None`` is accepted and yields an empty string.
    """
    if s is not None:
        collapsed = re.sub(r'\s+', ' ', s)
        return collapsed.strip()
    return ""

In [None]:
# TEST: gather the .text of every <FullText> element in the sample document.
texts = [block.text for block in root.findall(".//FullText")]

# Drop None/empty entries, then join what is left with newlines.
article_text = "\n".join(t for t in texts if t)
print(article_text[:5000])  # first 5k chars

                                                                                                AFFAIRS AT .                 The Doings of the Rebel Raiders There  How They Came and Saw and Conquered  The Property Destroyed by Them, &c.                 , Monday, Oct. 13, 1862.                 On Friday afternoon last, a dispatch was received by the telegraph operator here, from Greencastle, stating that information reached there that about 3,000 rebel cavalry had passed through Mercersburgh, and were going in the direction of Chambersburgh. This information was not made public, or the reason that it was not credited.                 ing, about 5 o clock, an express rider came in from St, Thomas, a village seven miles west from here, who said they had reached that place, had torn down the flak and chased him within three miles of Chambersburgh. Shortly after, other riders came in, confirming this statement. Our people were ails incredulous, but to do wh^t we could, the Court-house bell 

In [None]:
# TEST: show the sample article text after whitespace normalisation.
clean_whitespace(article_text)

'AFFAIRS AT . The Doings of the Rebel Raiders There How They Came and Saw and Conquered The Property Destroyed by Them, &c. , Monday, Oct. 13, 1862. On Friday afternoon last, a dispatch was received by the telegraph operator here, from Greencastle, stating that information reached there that about 3,000 rebel cavalry had passed through Mercersburgh, and were going in the direction of Chambersburgh. This information was not made public, or the reason that it was not credited. ing, about 5 o clock, an express rider came in from St, Thomas, a village seven miles west from here, who said they had reached that place, had torn down the flak and chased him within three miles of Chambersburgh. Shortly after, other riders came in, confirming this statement. Our people were ails incredulous, but to do wh^t we could, the Court-house bell was rung, and the Horue Guards, about one hundred in all, turned out. About 7 o clock, some fifteen cavalry rode into the public square, with a white handkerchie

In [None]:
def extract_fields_from_xml(file_path):
    """Parse one XML file and extract its publication date, object type(s) and text.

    Args:
        file_path: Location of the XML file; handed directly to ``etree.parse``.

    Returns:
        dict | None: A dict with the keys ``'pub_date'``, ``'object_type'`` and
        ``'raw_text'``. ``'object_type'`` is ``None`` when the file has no
        non-blank ``<ObjectType>`` value. ``None`` is returned (after printing
        a message) when the date or the text is missing or empty, or when any
        exception is raised while processing the file.
    """
    try:
        tree = etree.parse(file_path)
        root = tree.getroot()
        data = {
            'pub_date': None,
            'object_type': None,
            'raw_text': None
        }

        # Publication date: text of the first <NumericPubDate> element.
        date_element = root.find('.//NumericPubDate')
        if date_element is not None and date_element.text:
            data['pub_date'] = date_element.text.strip()

        # Object types: collect every <ObjectType>, de-duplicate, and join into
        # a single string so the value fits one flat Parquet column.
        # Values are filtered *after* stripping so whitespace-only elements do
        # not contribute an empty token (e.g. a leading " | ") to the result.
        stripped_types = (
            o.text.strip() for o in root.findall('.//ObjectType') if o.text
        )
        unique_types = {t for t in stripped_types if t}
        if unique_types:
            data['object_type'] = " | ".join(sorted(unique_types))

        # Article text: only the first <FullText> element is read, and `.text`
        # covers only the characters before that element's first child.
        # NOTE(review): assumes <FullText> holds plain text with no nested
        # markup and occurs once per file - confirm against the corpus.
        text_element = root.find('.//FullText')
        if text_element is not None and text_element.text:
            data['raw_text'] = clean_whitespace(text_element.text)

        # Date and text are mandatory; a record missing either is skipped.
        # A whitespace-only value counts as missing because it is empty after
        # stripping.
        if not data['pub_date'] or not data['raw_text']:
            print(f"Warning: Skipping {file_path} due to missing date or text.")
            return None

        return data

    except Exception as e:
        # Deliberate best-effort: one unreadable or malformed file must not
        # abort a batch run, so the error is reported and the file is skipped.
        print(f"Error processing file {file_path}: {e}")
        return None

In [19]:
def main_processing(xml_directory, output_parquet_file):
    """Extract fields from every ``*.xml`` file in a directory and save them as Parquet.

    Each file is passed to ``extract_fields_from_xml``; files for which that
    returns ``None`` are left out. Progress is printed every 1000 files.

    Args:
        xml_directory: Directory searched (non-recursively) for ``*.xml`` files.
        output_parquet_file: Destination path of the Parquet file.

    Returns:
        None. The result is written to ``output_parquet_file`` and a summary
        is printed.
    """
    # glob returns matches in arbitrary order; sorting makes the row order of
    # the written file reproducible from run to run.
    all_xml_files = sorted(glob(os.path.join(xml_directory, '*.xml')))
    print(f"Found {len(all_xml_files)} XML files to process.")

    # List to hold the extracted records
    records = []

    for i, file_path in enumerate(all_xml_files, start=1):
        record = extract_fields_from_xml(file_path)
        if record:
            records.append(record)

        if i % 1000 == 0:
            print(f"Processed {i} files...")

    # Name the columns explicitly so the frame (and the Parquet file) keeps its
    # three columns even when no record was extracted at all.
    df = pd.DataFrame(records, columns=['pub_date', 'object_type', 'raw_text'])

    print(f"\nSuccessfully extracted data from {len(df)} records.")

    # Write with the pyarrow engine and snappy compression; the index is not
    # stored because it is just the default positional index.
    df.to_parquet(output_parquet_file, engine='pyarrow', index=False, compression='snappy')

    print(f"Data written successfully to {output_parquet_file}")
    print(f"Final DataFrame Shape: {df.shape}")


    
# Convert the XML files under data/nyt_xml_test into data/nyt_sample.parquet.
main_processing("data/nyt_xml_test", "data/nyt_sample.parquet")

Found 25000 XML files to process.
Processed 1000 files...
Processed 2000 files...
Processed 3000 files...
Processed 4000 files...
Processed 5000 files...
Processed 6000 files...
Processed 7000 files...
Processed 8000 files...
Processed 9000 files...
Processed 10000 files...
Processed 11000 files...
Processed 12000 files...
Processed 13000 files...
Processed 14000 files...
Processed 15000 files...
Processed 16000 files...
Processed 17000 files...
Processed 18000 files...
Processed 19000 files...
Processed 20000 files...
Processed 21000 files...
Processed 22000 files...
Processed 23000 files...
Processed 24000 files...
Processed 25000 files...

Successfully extracted data from 24908 records.
Data written successfully to data/nyt_sample.parquet
Final DataFrame Shape: (24908, 3)


In [None]:
# TEST: read the Parquet file back and inspect what was written.

parquet_sample = pd.read_parquet("data/nyt_sample.parquet")
print(parquet_sample.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
parquet_sample.info()
print(parquet_sample.describe())


   pub_date                               object_type  \
0  18710312  Advertisement | Classified Advertisement   
1  18621016                    Front Page/Cover Story   
2  18850105                         Article | Feature   
3  18810317                    Front Page/Cover Story   
4  18791230                         Article | Feature   

                                            raw_text  
0  REAL AT AUCTIOi:. AI) H. , Auctioneer. 120 CgN...  
1  AFFAIRS AT . The Doings of the Rebel Raiders T...  
2  AN ATTEMPT TO BREAK JAIL FAILS. CHICAGO, Jan. ...  
3  THE IRISH RENT TROUBLES. REPORTS OF AN ATTACK ...  
4  ARTIFICIAL DIAMONDS. A >;uropean has ma,do a <...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24908 entries, 0 to 24907
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   pub_date     24908 non-null  object
 1   object_type  24908 non-null  object
 2   raw_text     24908 non-null  object
dtypes: object