<a href="https://colab.research.google.com/github/MK316/Getpp24/blob/main/Step01_getpp_writtendata_process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Written text processing (0811)

The output is saved as 'getpp-written.xlsx' with a log file. (The cell below writes the cleaned text to 'getpp-written.csv'.)

+ Input: getpp-written.txt [link](https://github.com/MK316/Getpp24/blob/main/data/getpp-written.txt)

In [None]:
import pandas as pd
import re

def clean_text(text):
    # Remove the initial ID from the text
    text = re.sub(r'^@@\d+\s*', '', text)

    # Replace corrupted encoding sequences
    text = re.sub(r'\*\*\d+;\d+;[^\s]+', '(brokenencoding)', text)

    # Remove content within <h> tags until the first <p> tag
    text = re.sub(r'<h>.*?<p>', '<p>', text, flags=re.DOTALL)

    # Remove all <p> tags, but keep the content
    text = re.sub(r'<\/?p>', '', text)

    # Remove sequences of '@' characters possibly with spaces
    text = re.sub(r'(@\s+)+@', ' ', text)  # Replaces sequences of '@' with a single space
    text = re.sub(r'@+', ' ', text)  # Replaces remaining '@' characters

    # Remove space before commas, periods, or any common punctuation
    text = re.sub(r'\s+(?=[,.!?;:])', '', text)

    # Correctly handle contractions
    contractions = {
        r"(\b[a-zA-Z]+) 's\b": r"\1's",
        r"(\b[a-zA-Z]+) 'nt\b": r"\1n't",
        r"(\b[a-zA-Z]+) 'm\b": r"\1'm",
        r"(\b[a-zA-Z]+) 're\b": r"\1're",
        r"(\b[a-zA-Z]+) 've\b": r"\1've",
        r"(\b[a-zA-Z]+) 'd\b": r"\1'd",
        r"(\b[a-zA-Z]+) 'll\b": r"\1'll"
    }
    for pattern, replacement in contractions.items():
        text = re.sub(pattern, replacement, text)

    # Remove spaces inside single and double quotes
    text = re.sub(r"' (\S.*?\S) '", r"'\1'", text)
    text = re.sub(r'" (\S.*?\S) "', r'"\1"', text)

    # Ensure multiple spaces are reduced to a single space
    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()

def process_file(input_path, output_path):
    data = []  # To store the results
    # Open the file and read line by line
    with open(input_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Extract ID using regex
            id_match = re.search(r'@@(\d+)', line)
            if id_match:
                id = id_match.group(1)  # Capture the numeric part of the ID

                # Apply text cleaning to the line
                cleaned_text = clean_text(line.strip())

                # Append the cleaned text and ID to the data list
                data.append({'ID': id, 'Text': cleaned_text})

    # Convert list of dictionaries to DataFrame
    df = pd.DataFrame(data)

    # Save the DataFrame to a CSV file
    df.to_csv(output_path, index=False)
    print(f"Output saved to {output_path}")

# Specify the paths
input_path = 'getpp-written.txt'
output_path = 'getpp-written.csv'

# Process the file
process_file(input_path, output_path)
