# Executive Orders Preprocessing

## 0- Preparation

In [1]:
# Import Python Packages
import pandas as pd
import re
import json


## 1- Text Data Preprocessing

In [2]:
# Read the raw executive-order records from the JSON file into memory
file_path = 'eos_raw.json'
with open(file_path, mode='r', encoding='utf-8') as file:
    # Parse the whole file content in one go
    data = json.loads(file.read())
    

In [3]:
# Build a pandas DataFrame from the parsed JSON data, then preview the first rows
df = pd.DataFrame(data=data)
df.head(n=5)


Unnamed: 0,title,president,publication_date,signing_date,citation,document_number,executive_order_number,pdf_url,toc_subject,disposition_notes,full_text
0,Combating Emerging Firearms Threats and Improv...,"{'identifier': 'joe-biden', 'name': 'Joseph R....",2024-10-02,2024-09-26,89 FR 80345,2024-22938,14127,https://www.govinfo.gov/content/pkg/FR-2024-10...,,,"<html>\n<head>\n<title>Federal Register, Volum..."
1,Investing in America and Investing in American...,"{'identifier': 'joe-biden', 'name': 'Joseph R....",2024-09-11,2024-09-06,89 FR 73559,2024-20712,14126,https://www.govinfo.gov/content/pkg/FR-2024-09...,Federal Government:\n,"See: EO 11246, September 24, 1965; EO 13985, J...","<html>\n<head>\n<title>Federal Register, Volum..."
2,Establishing an Emergency Board To Investigate...,"{'identifier': 'joe-biden', 'name': 'Joseph R....",2024-07-29,2024-07-24,89 FR 60791,2024-16740,14125,https://www.govinfo.gov/content/pkg/FR-2024-07...,"Committees; Establishment, Renewal, Terminatio...",,"<html>\n<head>\n<title>Federal Register, Volum..."
3,White House Initiative on Advancing Educationa...,"{'identifier': 'joe-biden', 'name': 'Joseph R....",2024-07-22,2024-07-17,89 FR 59585,2024-16225,14124,https://www.govinfo.gov/content/pkg/FR-2024-07...,Education:\n,"See: EO 14045, September 13, 2021","<html>\n<head>\n<title>Federal Register, Volum..."
4,White House Council on Supply Chain Resilience,"{'identifier': 'joe-biden', 'name': 'Joseph R....",2024-06-21,2024-06-14,89 FR 51949,2024-13810,14123,https://www.govinfo.gov/content/pkg/FR-2024-06...,"Committees; Establishment, Renewal, Terminatio...","See: EO 14017, February 24, 2021\r\nSupersedes...","<html>\n<head>\n<title>Federal Register, Volum..."


In [4]:
# Function to clean the text data
def clean_text(text):
    """Strip markup from a raw document string and return it as flat text.

    Processing steps:
      1. Remove HTML tags and ``[[Page N]]`` page markers.
      2. If the text contains 'Executive Order' (case-insensitive) followed
         later by '(Presidential Sig.)', keep only the span starting at the
         first 'Executive Order' and ending just before the signature marker.
         Otherwise the whole tag-stripped text is kept.
      3. Replace every newline character with a space, on both paths, so the
         result has the same layout whether or not the span was found.

    Parameters
    ----------
    text : str or any other value
        Raw document text. Values that are not ``str`` (``None``, ``NaN``,
        numbers, ...) cannot be cleaned and are returned unchanged.

    Returns
    -------
    str
        The cleaned text, when ``text`` is a string; otherwise ``text`` itself.
    """
    # Missing values and other non-strings are passed through untouched
    if not isinstance(text, str):
        return text

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove page markers, e.g., [[Page 51953]]
    text = re.sub(r'\[\[Page \d+\]\]', '', text)
    # Find and extract text starting from 'Executive Order' up to '(Presidential Sig.)'
    # NOTE(review): the search anchors on the FIRST 'Executive Order' occurrence,
    # which may be a header/contents line rather than the order's own heading --
    # confirm that keeping that leading material is intended.
    match = re.search(r'Executive Order.*?(?=\(Presidential Sig\.\))', text, re.S | re.I)
    if match:
        text = match.group(0)
    # Remove newline characters on both the matched and the fallback path
    return re.sub(r'\n', ' ', text)

# Run the cleaner over every document and store the result in a new column
df['cleaned_text'] = df['full_text'].map(clean_text)


## 3- Quality Check


In [5]:
# Show the first five cleaned documents without truncating long cell values
pd.options.display.max_colwidth = None
print(df['cleaned_text'].iloc[:5])


0    Executive Order 14127--Combating Emerging Firearms Threats and  Improving School-Based Active-Shooter Drills    Proclamation 10817--Amending Proclamation 10773    Proclamation 10818--Gold Star Mother's and Family's Day, 2024    Proclamation 10819--National Hunting and Fishing Day, 2024    Proclamation 10820--National Public Lands Day, 2024                              Presidential Documents         Federal Register / Vol. 89 , No. 191 / Wednesday, October 2, 2024 /  Presidential Documents     ___________________________________________________________________   Title 3--  The President                    Executive Order 14127 of September 26, 2024                   Combating Emerging Firearms Threats and Improving                  School-Based Active-Shooter Drills                  By the authority vested in me as President by the                  Constitution and the laws of the United States of                  America, it is hereby ordered as follows:                  Section 1

In [6]:
# Count the rows in the DataFrame
len(df.index)


1258

## 4- Data Output

In [7]:
# Export the DataFrame to a JSON Lines file (one JSON record per line)
output_file = 'eos_final.json'
# `indent` is deliberately not combined with `lines=True`: pretty-printing
# spreads each record over several lines, so the file would no longer hold
# one complete JSON record per line.
df.to_json(output_file, orient='records', lines=True)
print("Data exported")


Data exported
