# **Generating sentence-level-analysis-ready txt files**

In [None]:
import os
def load_files_from_directory(directory_path):
    """
    Loads all files from the specified directory.

    Args:
        directory_path (str): The path to the directory.

    Returns:
        list: A list of file paths.
    """
    file_names = [] # with no file format mark

    try:
        for filename in os.listdir(directory_path):
            filepath = os.path.join(directory_path, filename)
            if os.path.isfile(filepath):
                file_name = filename
                file_names.append(file_name)

    except FileNotFoundError:
        print(f"Error: Directory not found: {directory_path}")
        return None
    except Exception as e:
         print(f"An error occurred: {e}")
         return None

    print("Number of files in this folder: ", len(file_names))
    return file_names

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
root = '/content/drive/MyDrive/national_cybersecurity_policies_data/national_cybersecurity_strategies_v1.0/Files_Organizer'
folder_inaug = '/Non-Marker_Inaugural_TXTs'
folder_seq = '/Non-Marker_Sequential_TXTs'
file_names_inaug = load_files_from_directory(root + folder_inaug)
file_names_seq = load_files_from_directory(root + folder_seq)
print(file_names_inaug)
print(file_names_seq)

Number of files in this folder:  24
Number of files in this folder:  23
['20150723_Ghana_National Cybersecurity Strategy_ENG.txt', '20161027_Sierra Leone_National Cybersecurity Strategy_ENG.txt', '20110629_Lithuania_National Cybersecurity Strategy_ENG.txt', '20130101_Spain_National Cybersecurity Strategy_ESP.txt', '20160223_Uzbekistan_National Cybersecurity Strategy_RUS.txt', '20140101_Latvia_National Cybersecurity Strategy_LTV.txt', '20180717_North Macedonia_National Cybersecurity Strategy_ENG.txt', '20190601_Lebanon_National Cybersecurity Strategy_ENG.txt', '20200101_Belize_National Cybersecurity Strategy_ENG.txt', '20181101_Sri Lanka_National Cybersecurity Strategy_ENG.txt', '20170101_Indonesia_National Cybersecurity Strategy_ENG.txt', '20240101_Togo_National Cybersecurity Strategy_FRN.txt', '20110802_South Korea_National Cybersecurity Strategy_ENG.txt', '20140417_Uruguay_National Cybersecurity Strategy_ESP.txt', '20200101_Benin_National Cybersecurity Strategy_FRN.txt', '20120831_Ch

# **For Non-Marker_Inaugural_TXTs**

In [None]:
from pathlib import Path
import re

for file_name in file_names_inaug:
  # Load the uploaded file
  input_path = Path(root + folder_inaug + "/" + file_name)
  output_folder_name = '/Cleaned_Non-Marker_Inaugural_TXTs'
  output_path = Path(root + output_folder_name + "/" + file_name)
  print(input_path)
  print(output_path)

  # Read content
  text = input_path.read_text(encoding='utf-8')

  # Remove bullet points, numbers, and symbols commonly used in lists or indexes
  text = re.sub(r"^\s*[\d\.\-\–•]+", "", text, flags=re.MULTILINE)

  # Remove lines with fewer than 5 characters (e.g., headings, numbers, or formatting leftovers)
  lines = text.splitlines()
  lines = [line.strip() for line in lines if len(line.strip()) >= 5]

  # Combine lines into a single string and split into sentences using punctuation
  text_combined = " ".join(lines)

  # Normalize whitespace
  text_combined = re.sub(r'\s+', ' ', text_combined)

  # Split into sentences using regex (handle basic sentence delimiters)
  sentences = re.split(r'(?<=[.!?]) +', text_combined)

  # Write sentences to file with one sentence per line
  output_path.write_text("\n\n\n".join(sentences), encoding='utf-8')

  print(output_path.name)

/content/drive/MyDrive/national_cybersecurity_policies_data/national_cybersecurity_strategies_v1.0/Files_Organizer/Non-Marker_Inaugural_TXTs/20150723_Ghana_National Cybersecurity Strategy_ENG.txt
/content/drive/MyDrive/national_cybersecurity_policies_data/national_cybersecurity_strategies_v1.0/Files_Organizer/Cleaned_Non-Marker_Inaugural_TXTs/20150723_Ghana_National Cybersecurity Strategy_ENG.txt
20150723_Ghana_National Cybersecurity Strategy_ENG.txt
/content/drive/MyDrive/national_cybersecurity_policies_data/national_cybersecurity_strategies_v1.0/Files_Organizer/Non-Marker_Inaugural_TXTs/20161027_Sierra Leone_National Cybersecurity Strategy_ENG.txt
/content/drive/MyDrive/national_cybersecurity_policies_data/national_cybersecurity_strategies_v1.0/Files_Organizer/Cleaned_Non-Marker_Inaugural_TXTs/20161027_Sierra Leone_National Cybersecurity Strategy_ENG.txt
20161027_Sierra Leone_National Cybersecurity Strategy_ENG.txt
/content/drive/MyDrive/national_cybersecurity_policies_data/national_

# **For Non-Marker_Sequential_TXTs**

In [None]:
for file_name in file_names_seq:
  # Load the uploaded file
  input_path = Path(root + folder_seq + "/" + file_name)
  output_folder_name = '/Cleaned_Non-Marker_Sequential_TXTs'
  output_path = Path(root + output_folder_name + "/" + file_name)
  print(input_path)
  print(output_path)

  # Read content
  text = input_path.read_text(encoding='utf-8')

  # Remove bullet points, numbers, and symbols commonly used in lists or indexes
  text = re.sub(r"^\s*[\d\.\-\–•]+", "", text, flags=re.MULTILINE)

  # Remove lines with fewer than 5 characters (e.g., headings, numbers, or formatting leftovers)
  lines = text.splitlines()
  lines = [line.strip() for line in lines if len(line.strip()) >= 5]

  # Combine lines into a single string and split into sentences using punctuation
  text_combined = " ".join(lines)

  # Normalize whitespace
  text_combined = re.sub(r'\s+', ' ', text_combined)

  # Split into sentences using regex (handle basic sentence delimiters)
  sentences = re.split(r'(?<=[.!?]) +', text_combined)

  # Write sentences to file with one sentence per line
  output_path.write_text("\n\n\n".join(sentences), encoding='utf-8')

  print(output_path.name)

/content/drive/MyDrive/national_cybersecurity_policies_data/national_cybersecurity_strategies_v1.0/Files_Organizer/Non-Marker_Sequential_TXTs/20210101_Costa Rica_National Cybersecurity Strategy_ESP.txt
/content/drive/MyDrive/national_cybersecurity_policies_data/national_cybersecurity_strategies_v1.0/Files_Organizer/Cleaned_Non-Marker_Sequential_TXTs/20210101_Costa Rica_National Cybersecurity Strategy_ESP.txt
20210101_Costa Rica_National Cybersecurity Strategy_ESP.txt
/content/drive/MyDrive/national_cybersecurity_policies_data/national_cybersecurity_strategies_v1.0/Files_Organizer/Non-Marker_Sequential_TXTs/20210910_Panama_National Cybersecurity Strategy_ESP.txt
/content/drive/MyDrive/national_cybersecurity_policies_data/national_cybersecurity_strategies_v1.0/Files_Organizer/Cleaned_Non-Marker_Sequential_TXTs/20210910_Panama_National Cybersecurity Strategy_ESP.txt
20210910_Panama_National Cybersecurity Strategy_ESP.txt
/content/drive/MyDrive/national_cybersecurity_policies_data/national

# Alternative function clean_and_split_txt_for_sentence_analysis

In [None]:
import re
import pandas as pd

def clean_and_split_txt_for_sentence_analysis(txt_path, output_path):
    # Step 1: Read raw text
    with open(txt_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    # Step 2: Basic cleaning
    # Remove Table of Contents and line/page numbers
    cleaned = re.sub(r'Table of Contents.*?(?=\d+\.\d+)', '', raw_text, flags=re.DOTALL)
    cleaned = re.sub(r'\.{5,}', ' ', cleaned)  # remove dotted lines
    cleaned = re.sub(r'\n+', ' ', cleaned)     # collapse multiple newlines
    cleaned = re.sub(r'\s{2,}', ' ', cleaned)  # collapse multiple spaces

    # Step 3: Fix common breaks like numbers or bullet points
    cleaned = re.sub(r'\d{1,3},\d{3}(?:\.\d+)?', '', cleaned)  # remove number amounts (e.g., 375,000.00)

    # # Step 4: Replace known abbreviations with placeholders to avoid splitting
    # abbreviations = ['U.S.']
    # for abbr in abbreviations:
    #     placeholder = f"<<{abbr.replace('.', '_')}>>"
    #     cleaned = cleaned.replace(abbr, placeholder)

    # Step 5: Split sentences based on punctuation
    sentences = re.split(r'(?<=[.!?])\s+', cleaned)

    # Optional: Remove empty or very short lines
    sentences = [s for s in sentences if len(s.strip()) > 5]

    with open(output_path, 'w', encoding='utf-8') as f:
      for sentence in sentences:
          f.write(sentence.strip() + '\n')

    print(f"Cleaned and split sentences saved to: {output_path}")

# TXT to Sentences for Text Classification Task

In [5]:
from pathlib import Path
import re
import pandas as pd

# Input/output file paths
input_path = Path('./20090101_Australia_National Cybersecurity Strategy_ENG.txt')
output_txt = Path('./[s]20090101_Australia_National Cybersecurity Strategy_ENG.txt')
output_csv = Path(str(output_txt) + '.csv')

# Read and clean the raw text
text = input_path.read_text(encoding='utf-8')

# Remove bullet points, list markers, etc.
text = re.sub(r"^\s*[\d\.\-\–•]+", "", text, flags=re.MULTILINE)

# Remove short lines (e.g., headings)
lines = [line.strip() for line in text.splitlines() if len(line.strip()) >= 5]

# Join lines into one string
text_combined = " ".join(lines)
text_combined = re.sub(r'\s+', ' ', text_combined)  # normalize whitespace

# Split into sentences
sentences = re.split(r'(?<=[.!?]) +', text_combined)

# Cleaning patterns
allowed_pattern = re.compile(r"[^A-Za-z\s.,?!:;\'\"()\-\n]")
repeated_punct_pattern = re.compile(r"[-]{2,}|[.]{2,}")

# Clean each sentence
cleaned_sentences = []
for sentence in sentences:
    cleaned = ' '.join(sentence.split())  # normalize whitespace
    cleaned = re.sub(r'(\w)-\s(\w)', r'\1\2', cleaned)  # join hyphenated words
    cleaned = re.sub(r'\s{2,}', ' ', cleaned)  # remove extra spaces
    cleaned = repeated_punct_pattern.sub('', cleaned)  # remove ---- or ....
    cleaned = allowed_pattern.sub('', cleaned)  # remove unwanted characters
    if cleaned:  # skip empty results
        cleaned_sentences.append(cleaned)

# Save as CSV
df = pd.DataFrame({'text': cleaned_sentences})
df.to_csv(output_csv, index=False)

print(f"CSV saved as: {output_csv.name}")


CSV saved as: [s]20090101_Australia_National Cybersecurity Strategy_ENG.txt.csv
