# Creating synthetic corruption

There doesn't seem to be enough examples of corrupted data to train models

- part 1 learns the transmission distributions for the corruption model
- part 2 applies the corruption to text
- part 3 creating a dataset

In [173]:
import sys
import os
import pandas as pd
from tqdm import tqdm
from corruption import *
from tqdm import tqdm
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B')



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
data = pd.read_csv('./data/aligned/aligned_BLN600.csv')

In [4]:
gt_aligned_list = data['gt_aligned'].to_list()

noise_aligned_list = data['noise_aligned'].to_list()


In [5]:
# Example aligned text pairs
aligned_texts = [
    ("New Yo@rk is big", "Nev Yo rk@is@@@"),
    ("New Yo@rk is big", "New Yo rk@is@@@@"),
    # Add more aligned text pairs here
]

aligned_texts = list(zip(gt_aligned_list, noise_aligned_list))

# Initialize counters
deletion_counts, insertion_counts, substitution_counts, character_counts = initialize_counters()

# Update counts for all aligned text pairs
for gt, noise in aligned_texts:
    update_counts(gt, noise, deletion_counts, insertion_counts, substitution_counts, character_counts)

# Calculate character distribution
character_distribution = calculate_character_distribution(character_counts)

# Calculate conditional probabilities
conditional_probs = calculate_conditional_probs(deletion_counts, insertion_counts, substitution_counts, character_counts)

# Generate substitution and insertion tables
substitution_table, insertion_table = generate_substitution_insertion_tables(substitution_counts, insertion_counts, character_counts)

# Add default values to tables
conditional_probs, substitution_table, insertion_table = add_default_values(conditional_probs, substitution_table, insertion_table, character_distribution)

In [6]:
print(deletion_counts['a'])
print(sum(substitution_counts['a'].values()))
print(sum(insertion_counts['a'].values()))
print(character_counts['a'])

1076
2320
1821
103661


In [7]:
char = 'a'

In [8]:
total_count2 = character_counts[char]

# Calculate individual probabilities for this character
delete_prob2 = deletion_counts[char] / total_count2 if char in deletion_counts else 0

In [9]:
deletion_counts['a']

1076

In [10]:
calculate_joint_probabilities(conditional_probs, character_distribution)

{'correct': 0.9313472083726387,
 'substitute': 0.02592342906843265,
 'delete': 0.015011386401281067,
 'insert': 0.029770339845345932}

In [189]:
conditional_probs2 = modify_and_renormalize_probs(conditional_probs, column = 'correct', factor = .9)

calculate_joint_probabilities(conditional_probs2, character_distribution)

{'correct': 0.8382124875353749,
 'substitute': 0.05781538406622265,
 'delete': 0.03309549012525023,
 'insert': 0.07087663827315206}

# part 3

In [184]:
import re
import pandas as pd



def load_encyc_brit(file_path):
    """
    Reads a text file and splits it into sections based on at least three line breaks.
    Extracts the title from each section based on the text in all caps before the first comma
    or any text in brackets. Returns a DataFrame where each section is a row with 'Title' and 'Section' columns.
    
    :param file_path: Path to the text file
    :return: DataFrame with titles and sections as rows
    """
    # Step 1: Read the text file
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    text = re.sub(r'\n{3}PLATE.*?\]\n{3}', '', text, flags=re.DOTALL)
    #remove illustrations and the surrounding line breaks
    text = re.sub(r'\n*\[Illustration:.*?\]\n*', '', text, flags=re.DOTALL)
    #Deal with subsections by removing 
    text = re.sub(r'\n{1,}(\s{2,}.*\n)', r'\n\1', text)

    # Step 2: Split the text into sections based on at least three line breaks
    sections = re.split(r'\n{3,}', text)

    # Step 3: Extract titles and create DataFrame rows
    data = []
    for section in sections:
        # Optional: Strip leading/trailing whitespace
        section = section.strip()
        if not section:
            continue
        
        # Step 3a: Extract the title (text in all caps before the first comma or bracketed text)
        match = re.match(r'^([A-Z\s]+?)(?:\s*\([^)]*\))?(?:,|$)', section)
        if match:
            title = match.group(1).strip()
        else:
            title = None  # If no title is found, set it to None or an empty string

        # Step 3b: Store the section and title
        data.append({'title': title, 'content': section})

    # Step 4: Create a DataFrame
    df = pd.DataFrame(data)

    return df




In [80]:
all_vols = os.listdir('./data/encyclopedia')

In [81]:
all_vols

['pg41264.txt',
 'pg39908.txt',
 'pg34612.txt',
 'pg34018.txt',
 'pg37064.txt',
 'pg32758.txt',
 'pg39232.txt',
 'pg38454.txt',
 'pg37282.txt',
 'pg37880.txt',
 'pg39775.txt',
 'pg40641.txt',
 'pg37610.txt',
 'pg43427.txt',
 'pg31793.txt',
 'pg19699.txt',
 'pg38892.txt',
 'pg31447.txt',
 'pg40956.txt',
 'pg35092.txt',
 'pg34162.txt',
 'pg34878.txt',
 'pg35925.txt',
 'pg40370.txt',
 'pg33127.txt',
 'pg38622.txt',
 'pg30073.txt',
 'pg39521.txt',
 'pg36104.txt',
 'pg35398.txt',
 'pg33698.txt',
 'pg34992.txt',
 'pg32940.txt',
 'pg32097.txt',
 'pg42342.txt',
 'pg34702.txt',
 'pg32860.txt',
 'pg40156.txt',
 'pg38143.txt',
 'pg35236.txt',
 'pg39029.txt',
 'pg40009.txt',
 'pg39127.txt',
 'pg38964.txt',
 'pg41343.txt',
 'pg41567.txt',
 'pg39632.txt',
 'pg32783.txt',
 'pg33052.txt',
 'pg37806.txt',
 'pg39700.txt',
 'pg39353.txt',
 'pg37160.txt',
 'pg31641.txt',
 'pg31329.txt',
 'pg36735.txt',
 'pg41472.txt',
 'pg30935.txt',
 'pg33614.txt',
 'pg34312.txt',
 'pg33239.txt',
 'pg35606.txt',
 'pg4273

In [116]:
all_vols[60]

'pg33239.txt'

In [172]:
# Example usage:
file_path =os.path.join('data/encyclopedia',all_vols[60]) #'data/encyclopedia/pg31447.txt' 
df = load_encyc_brit(file_path)
df['content'] = df['content'].str.replace("_", "")
df['tokens'] = df['content'].apply(lambda text: len(tokenizer.encode(text)))
start_row = (df['content'].str.contains("ARTICLES IN THIS SLICE", na=False).idxmax()+1)
end_row = (df['content'].str.contains("END OF THE PROJECT GUTENBERG EBOOK", na=False).idxmax()-1)

df = df.loc[start_row:end_row, :]
df#.head(20)

Unnamed: 0,title,content,tokens
5,CAT,"CAT,[1] properly the name of the well-known do...",4255
6,,"FOOTNOTE:\n\n [1] The word ""cat"" is applied t...",607
7,CATABOLISM,"CATABOLISM, or KATABOLISM (Gr. [Greek: kata], ...",62
8,CATACLYSM,"CATACLYSM (Gr. [Greek: kataklusmos], a deluge)...",80
9,CATACOMB,"CATACOMB, a subterranean excavation for the in...",14780
...,...,...,...
208,CELT,"CELT, or KELT, the generic name of an ancient ...",3176
209,,CELTIC LANGUAGES\n\nIntroduction.--The Celtic ...,29337
210,,CELTIC LITERATURE\n\n Ogam inscriptions.\n\nI...,68642
211,,FOOTNOTES:\n [1] J. Loth gives it as his opin...,144


In [185]:
encyc_df = []

# Loop through each file in the list with tqdm for progress monitoring
for file_path in tqdm(all_vols, desc="Processing Files"):
    # Construct the full file path
    full_path = os.path.join('data/encyclopedia', file_path)
    
    # Load the data into a DataFrame
    df = load_encyc_brit(full_path)
    
    # Replace underscores in the content
    df['content'] = df['content'].str.replace("_", "")
    
    # Calculate the number of tokens for each row
    df['tokens'] = df['content'].apply(lambda text: len(tokenizer.encode(text)))

    df['file'] = file_path
    
    # Find the start and end rows
    start_row = df['content'].str.contains("ARTICLES IN THIS SLICE", na=False).idxmax() + 1
    end_row = df['content'].str.contains("END OF THE PROJECT GUTENBERG EBOOK", na=False).idxmax() - 1
    
    # Slice the DataFrame to include only relevant rows
    df = df.iloc[start_row:end_row]
    
    # Append the processed DataFrame to the list
    encyc_df.append(df)

# Concatenate all DataFrames into a single DataFrame
encyc_df = pd.concat(encyc_df, ignore_index=True)


Processing Files: 100%|██████████| 127/127 [01:36<00:00,  1.31it/s]


In [186]:
encyc_df#['tokens'].sum()/1e6

Unnamed: 0,title,content,tokens,file
0,JACOBITES,"JACOBITES (from Lat. Jacobus, James), the name...",1673,pg41264.txt
1,JACOBS,"JACOBS, CHRISTIAN FRIEDRICH WILHELM (1764-1847...",450,pg41264.txt
2,JACOBS CAVERN,"JACOBS CAVERN, a cavern in latitude 36° 35´ N....",859,pg41264.txt
3,JACOBSEN,"JACOBSEN, JENS PETER (1847-1885), Danish imagi...",657,pg41264.txt
4,,"JACOB'S WELL, the scene of the conversation be...",333,pg41264.txt
...,...,...,...,...
21881,COSTUME,"COSTUME (through the Fr. costume, from Ital. c...",2556,pg32182.txt
21882,,I. ANCIENT COSTUME\n\ni. Ancient Oriental.--Al...,18173,pg32182.txt
21883,,II. COSTUME IN MEDIEVAL AND MODERN EUROPE\n\ni...,16034,pg32182.txt
21884,,"III. NATIONAL AND CLASS COSTUME\n\nCostume, as...",1865,pg32182.txt


In [183]:
encyc_df.loc[encyc_df['tokens']<100,'tokens'].sum()

np.int64(107274)

In [75]:
df.loc[~df['title'].isnull() & (df.index>7), 'tokens'].sum()*126

np.int64(10604916)

In [53]:
import re
import pandas as pd

def text_to_dataframe_with_titles(file_path):
    """
    Reads a text file and splits it into sections based on double line breaks and titles.
    Returns a DataFrame with each section's title and content.
    
    :param file_path: Path to the text file
    :return: DataFrame with titles and content
    """
    # Step 1: Read the text file
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Step 2: Split the text into sections based on double line breaks
    sections = text.split('\n\n\n')
    
    # Step 3: Process each section to extract title and content
    data = []
    for section in sections:
        # Use regex to find the title which is enclosed in underscores
        match = re.search(r'_(.+)_\n\n', section)
        if match:
            title = match.group(1).strip()
            content = section.replace(match.group(0), '').strip()  # Remove the title from content
        else:
            title = None
            content = section.strip()

        if content:  # Ensure the content is not empty
            data.append({'title': title, 'content': content})

    # Step 4: Create a DataFrame
    df = pd.DataFrame(data)

    return df

# Example usage:
file_path = 'data/knowledge_for_the_time.txt' 
df = text_to_dataframe_with_titles(file_path)
df['content'] = df['content'].str.replace("_", "")
df['tokens'] = df['content'].apply(lambda text: len(tokenizer.encode(text)))



In [54]:
df.loc[~df['title'].isnull()]

Unnamed: 0,title,content,tokens
22,Politics not yet a Science.,"Mr. Buckle, in his thoughtful History of Civil...",244
23,The Philosopher and the Historian.,"“I have read somewhere or other,” says Lord Bo...",575
24,Whig and Tory Ministries.,The domestic history of England during the rei...,955
25,Protectionists.,This name was given to that section of the Con...,123
26,"Rats, and Ratting.","James, in his Military Dictionary, 1816, state...",176
...,...,...,...
363,The Book of Job.,Diversified are the opinions of the most learn...,1183
365,Great Precedence Question.,The great question relative to precedence whic...,1367
378,London Review.,CRITICAL OPINIONS OF THE ABOVE WORK.\n\n“Anoth...,685
450,Dr. Falck Lebahn’s Popular Series of German Sc...,‘As an educational writer in the German tongue...,68


In [55]:
df.loc[~df['title'].isnull(), 'tokens'].sum()

np.int64(170945)