<a href="https://colab.research.google.com/github/MK316/Getpp24/blob/main/getpp_writtendata_process01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Written text processing (0811)

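The cleaned output is saved as 'getpp-written.csv', with one row per text and the columns `ID` and `Text`. (Note: the code cell below writes a CSV file, not .xlsx, and does not itself produce a log file.)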

+ Input: getpp-written.txt [link](https://github.com/MK316/Getpp24/blob/main/data/getpp-written.txt)

In [None]:
import pandas as pd
import re

def clean_text(text):
    """Clean one raw corpus line and return it as plain text.

    The substitutions run in a fixed order:

    1. Drop a leading ``@@<digits>`` ID and the whitespace after it.
    2. Replace ``**<digits>;<digits>;<non-space run>`` sequences with the
       literal marker ``(brokenencoding)``.
    3. Drop everything from each ``<h>`` up to the next ``<p>``, then strip
       the ``<p>``/``</p>`` tags themselves (their content is kept).
    4. Replace runs of ``@`` (optionally space-separated) with a space.
    5. Remove whitespace in front of ``, . ! ? ; :``.
    6. Re-attach clitics that appear as separate tokens
       (``it 's`` -> ``it's``, ``do n't`` -> ``don't``, ...).
    7. Remove the padding spaces just inside paired quotes.
    8. Collapse whitespace runs to a single space and trim both ends.

    Args:
        text: One line of input, as a string.

    Returns:
        The cleaned string.
    """
    # Remove the initial ID from the text
    text = re.sub(r'^@@\d+\s*', '', text)

    # Replace corrupted encoding sequences
    text = re.sub(r'\*\*\d+;\d+;[^\s]+', '(brokenencoding)', text)

    # Remove content from each <h> tag up to the next <p> tag
    text = re.sub(r'<h>.*?<p>', '<p>', text, flags=re.DOTALL)

    # Remove all <p> tags, but keep the content
    text = re.sub(r'<\/?p>', '', text)

    # Remove sequences of '@' characters possibly with spaces
    text = re.sub(r'(@\s+)+@', ' ', text)  # Space-separated runs -> one space
    text = re.sub(r'@+', ' ', text)  # Any remaining '@' characters

    # Remove space before commas, periods, or any common punctuation
    text = re.sub(r'\s+(?=[,.!?;:])', '', text)

    # Re-attach split-off clitics. An ordered sequence of (pattern,
    # replacement) pairs makes the application order explicit.
    contractions = (
        (r"(\b[a-zA-Z]+) 's\b", r"\1's"),
        # Negations are tokenized as "do n't" / "ca n't": the apostrophe sits
        # inside the clitic, so the "'nt" rule below can never match them.
        (r"(\b[a-zA-Z]+) n't\b", r"\1n't"),
        # Kept for inputs that really do contain the "'nt" spelling.
        (r"(\b[a-zA-Z]+) 'nt\b", r"\1n't"),
        (r"(\b[a-zA-Z]+) 'm\b", r"\1'm"),
        (r"(\b[a-zA-Z]+) 're\b", r"\1're"),
        (r"(\b[a-zA-Z]+) 've\b", r"\1've"),
        (r"(\b[a-zA-Z]+) 'd\b", r"\1'd"),
        (r"(\b[a-zA-Z]+) 'll\b", r"\1'll"),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Remove spaces inside single and double quotes
    text = re.sub(r"' (\S.*?\S) '", r"'\1'", text)
    text = re.sub(r'" (\S.*?\S) "', r'"\1"', text)

    # Ensure multiple spaces are reduced to a single space
    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()

def process_file(input_path, output_path):
    """Clean every ID-bearing line of a text file and write the result as CSV.

    Each line of ``input_path`` that contains an ``@@<digits>`` marker yields
    one output row: ``ID`` is the digit string and ``Text`` is the stripped
    line passed through ``clean_text``. Lines without a marker are skipped.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the CSV file to write (no index column).

    Returns:
        None. The CSV is written to disk and a confirmation line is printed.
    """
    rows = []  # One {'ID', 'Text'} dict per matching line

    # Open the file and read line by line
    with open(input_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Extract ID using regex; skip lines that carry no ID
            id_match = re.search(r'@@(\d+)', line)
            if id_match is None:
                continue

            rows.append({
                'ID': id_match.group(1),  # Numeric part of the ID, as a string
                'Text': clean_text(line.strip()),
            })

    # Pin the columns so a file with no matching lines still produces a
    # well-formed "ID,Text" header instead of an empty, unparseable CSV.
    df = pd.DataFrame(rows, columns=['ID', 'Text'])

    # Save the DataFrame to a CSV file
    df.to_csv(output_path, index=False)
    print(f"Output saved to {output_path}")

# Specify the paths.
# Both are relative, so they resolve against the current working directory
# of the running notebook/kernel.
input_path = 'getpp-written.txt'
output_path = 'getpp-written.csv'

# Process the file: read input_path, clean each ID-bearing line, and write
# the ID/Text table to output_path.
process_file(input_path, output_path)


# [1] Getpp file information

For each text, the number of characters (Nchar), words (Nword), and sentences (Nsent) is added in a separate column.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Ensure that NLTK's tokenizer data is downloaded.
# Recent NLTK releases load the Punkt sentence model from the 'punkt_tab'
# resource, while older releases use 'punkt'. Requesting both keeps
# word_tokenize/sent_tokenize working on either; a resource unknown to the
# installed version is reported by nltk.download() without raising.
nltk.download('punkt')
nltk.download('punkt_tab')

def add_text_statistics(df):
    """Add character, word, and sentence counts for each row's ``Text``.

    Three columns are written onto ``df`` in place:

    * ``Nchar`` - ``len()`` of the text.
    * ``Nword`` - number of tokens returned by ``word_tokenize``.
    * ``Nsent`` - number of sentences returned by ``sent_tokenize``.

    Args:
        df: DataFrame with a ``Text`` column.

    Returns:
        The same DataFrame object, with the three columns added.
    """
    # pd.read_csv parses an empty Text cell as NaN (a float), on which len()
    # and the tokenizers raise TypeError. Count over a string-coerced copy so
    # such rows get 0/0/0; the 'Text' column itself is left untouched.
    texts = df['Text'].fillna('').astype(str)

    df['Nchar'] = texts.apply(len)
    df['Nword'] = texts.apply(lambda text: len(word_tokenize(text)))
    df['Nsent'] = texts.apply(lambda text: len(sent_tokenize(text)))

    return df

# Load your existing CSV file
# NOTE(review): the cleaning cell above writes 'getpp-written.csv', while this
# cell reads 'getpp-written00.csv' from /content - confirm this is the
# intended file (e.g. a renamed/uploaded copy) before running.
input_path = '/content/getpp-written00.csv'  # Adjust this path to your actual file
df = pd.read_csv(input_path)

# Add the text statistics (Nchar, Nword, Nsent columns) to the DataFrame
df = add_text_statistics(df)

# Save the enhanced DataFrame to a new CSV file (index column omitted)
output_path = '/content/getpp-written-info.csv'  # Adjust the output path if needed
df.to_csv(output_path, index=False)

print(f"Enhanced DataFrame saved to {output_path}")


In [None]:
import pandas as pd

# Read the per-text statistics table back in
input_path = '/content/getpp-written-info.csv'  # Adjust this path to your actual file
df = pd.read_csv(input_path)

# Column sums over the whole table, one total per count column
total_nchar, total_nword, total_nsent = (
    df[column].sum() for column in ('Nchar', 'Nword', 'Nsent')
)

# Report the three totals, one per line
print(
    f"Total Number of Characters (Nchar): {total_nchar}",
    f"Total Number of Words (Nword): {total_nword}",
    f"Total Number of Sentences (Nsent): {total_nsent}",
    sep="\n",
)


## Add average length of word and sentence

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Ensure that NLTK's tokenizer data is downloaded.
# Recent NLTK releases load the Punkt sentence model from the 'punkt_tab'
# resource, while older releases use 'punkt'. Requesting both keeps
# word_tokenize/sent_tokenize working on either; a resource unknown to the
# installed version is reported by nltk.download() without raising.
nltk.download('punkt')
nltk.download('punkt_tab')

def add_text_statistics(df):
    """Add per-row counts and average lengths for each row's ``Text``.

    Five columns are written onto ``df`` in place:

    * ``Nchar`` - ``len()`` of the text.
    * ``Nword`` - number of tokens returned by ``word_tokenize``.
    * ``Nsent`` - number of sentences returned by ``sent_tokenize``.
    * ``AVG-word`` - average word length, ``Nchar / Nword``
      (0 when the row has no words).
    * ``AVG-sent`` - average sentence length, ``Nword / Nsent``
      (0 when the row has no sentences).

    Args:
        df: DataFrame with a ``Text`` column.

    Returns:
        The same DataFrame object, with the five columns added.
    """
    # pd.read_csv parses an empty Text cell as NaN (a float), on which len()
    # and the tokenizers raise TypeError. Count over a string-coerced copy so
    # such rows get 0/0/0; the 'Text' column itself is left untouched.
    texts = df['Text'].fillna('').astype(str)

    df['Nchar'] = texts.apply(len)
    df['Nword'] = texts.apply(lambda text: len(word_tokenize(text)))
    df['Nsent'] = texts.apply(lambda text: len(sent_tokenize(text)))

    # The two averages were previously stored under each other's names
    # ('AVG-sent' held characters per word). Each column now holds the
    # quantity its name describes. Rows with a zero denominator get 0.

    # Average word length: characters per word
    df['AVG-word'] = df['Nchar'].div(df['Nword']).where(df['Nword'] > 0, 0)

    # Average sentence length: words per sentence
    df['AVG-sent'] = df['Nword'].div(df['Nsent']).where(df['Nsent'] > 0, 0)

    return df

# Read the statistics table and (re)compute its per-text statistics
input_path = '/content/getpp-written-info.csv'  # Adjust this path to your actual file
df = add_text_statistics(pd.read_csv(input_path))

# Column sums over the whole table, one total per count column
total_nchar, total_nword, total_nsent = (
    df[column].sum() for column in ('Nchar', 'Nword', 'Nsent')
)

# Report the three totals, one per line
print(
    f"Total Number of Characters (Nchar): {total_nchar}",
    f"Total Number of Words (Nword): {total_nword}",
    f"Total Number of Sentences (Nsent): {total_nsent}",
    sep="\n",
)

# Write the table back out (same path as the input, so it is overwritten)
output_path = '/content/getpp-written-info.csv'  # Adjust the output path if needed
df.to_csv(output_path, index=False)

print(f"Enhanced DataFrame saved to {output_path}")


In [None]:
# Preview the first rows of `df` as left by the most recently run cell
# (shown because it is the last expression of this notebook cell).
df.head()

Total average word length and sentence length

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Ensure that NLTK's tokenizer data is downloaded.
# Recent NLTK releases load the Punkt sentence model from the 'punkt_tab'
# resource, while older releases use 'punkt'. Requesting both keeps
# word_tokenize/sent_tokenize working on either; a resource unknown to the
# installed version is reported by nltk.download() without raising.
nltk.download('punkt')
nltk.download('punkt_tab')

def add_text_statistics(df):
    """Add character, word, and sentence counts for each row's ``Text``.

    Three columns are written onto ``df`` in place:

    * ``Nchar`` - ``len()`` of the text.
    * ``Nword`` - number of tokens returned by ``word_tokenize``.
    * ``Nsent`` - number of sentences returned by ``sent_tokenize``.

    Args:
        df: DataFrame with a ``Text`` column.

    Returns:
        The same DataFrame object, with the three columns added.
    """
    # pd.read_csv parses an empty Text cell as NaN (a float), on which len()
    # and the tokenizers raise TypeError. Count over a string-coerced copy so
    # such rows get 0/0/0; the 'Text' column itself is left untouched.
    texts = df['Text'].fillna('').astype(str)

    df['Nchar'] = texts.apply(len)
    df['Nword'] = texts.apply(lambda text: len(word_tokenize(text)))
    df['Nsent'] = texts.apply(lambda text: len(sent_tokenize(text)))

    return df

# Read the statistics table and (re)compute its per-text counts
input_path = '/content/getpp-written-info.csv'  # Adjust this path to your actual file
df = add_text_statistics(pd.read_csv(input_path))

# Column sums over the whole table, one total per count column
total_nchar, total_nword, total_nsent = (
    df[column].sum() for column in ('Nchar', 'Nword', 'Nsent')
)

# Overall average word length: total characters / total words (0 if no words)
avg_word_length = 0
if total_nword > 0:
    avg_word_length = total_nchar / total_nword

# Overall average sentence length: total words / total sentences
# (0 if no sentences)
avg_sentence_length = 0
if total_nsent > 0:
    avg_sentence_length = total_nword / total_nsent

# Report totals and averages, one per line
print(
    f"Total Number of Characters (Nchar): {total_nchar}",
    f"Total Number of Words (Nword): {total_nword}",
    f"Total Number of Sentences (Nsent): {total_nsent}",
    f"Average Word Length: {avg_word_length:.2f} characters per word",
    f"Average Sentence Length: {avg_sentence_length:.2f} words per sentence",
    sep="\n",
)

# Write the table back out (same path as the input, so it is overwritten)
output_path = '/content/getpp-written-info.csv'  # Adjust the output path if needed
df.to_csv(output_path, index=False)

print(f"Enhanced DataFrame saved to {output_path}")
