# Adding and Removing Whitespace.
A possible improvement is to use OPT-350M to predict the punctuation and newlines.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm  # Import tqdm for progress bar

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily flatten the directory walk into full file paths, then print them
# one per line in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/watermarked-and-unwatermarked-text-truncated/data_trunk.csv


In [2]:
import random

def add_whitespace(text, rng=None):
    """Return ``text`` with random extra whitespace between words and sentences.

    The text is split into sentences on a period followed by a space, and each
    sentence into words on single spaces. Every word and every sentence then
    gets a randomly chosen separator.

    Word separators (one decision per word):
      * first draw < 0.9: one to three spaces;
      * otherwise a second, independent draw decides: < 0.1 gives a tab,
        anything else a single space (so about 1% tab and 9% single space
        overall).

    Sentence terminators (one decision per sentence, made after all word-level
    draws have been consumed):
      * first draw < 0.5: a period and one newline;
      * otherwise a second, independent draw decides: < 0.3 gives a period and
        two newlines, anything else a period and a space (so about 15% and 35%
        overall).

    Args:
        text: The string to perturb.
        rng: Optional random source exposing ``random()`` and
            ``randint(a, b)``, for example ``random.Random(seed)``. When
            omitted, the module-level ``random`` functions are used, so
            existing callers that rely on ``random.seed(...)`` get exactly the
            same output as before.

    Returns:
        The perturbed string. The period appended after the final sentence is
        removed again, so a text that did not end with a period does not gain
        one.
    """
    rng = random if rng is None else rng

    # Pass 1: perturb the spacing inside every sentence.
    spaced_sentences = []
    for sentence in text.split('. '):
        chunks = []
        for word in sentence.split(' '):
            if rng.random() < 0.9:
                chunks.append(word + ' ' * rng.randint(1, 3))
            elif rng.random() < 0.1:
                chunks.append(word + '\t')
            else:
                chunks.append(word + ' ')
        # strip() removes the separator appended after the last word (and any
        # other leading/trailing whitespace of the sentence).
        spaced_sentences.append(''.join(chunks).strip())

    # Pass 2: choose what follows each sentence. This runs only after pass 1
    # has finished, which fixes the order in which random numbers are consumed.
    terminated = []
    for sentence in spaced_sentences:
        if rng.random() < 0.5:
            terminated.append(sentence + '.\n')
        elif rng.random() < 0.3:
            terminated.append(sentence + '.\n\n')
        else:
            terminated.append(sentence + '. ')

    final_text = ''.join(terminated).strip()

    # After strip() the text always ends with the period added in pass 2 for
    # the last sentence; drop it to restore the original ending.
    return final_text[:-1]

# Example usage: seed the global random module first so the randomly chosen
# whitespace is the same every time this cell runs.
random.seed(42)
input_text = "This is a test sentence. Here is another one. This should add unnecessary whitespace."
output_text = add_whitespace(input_text)
print(output_text)


This is a   test   sentence. Here is another   one.
This   should add  unnecessary whitespace.


In [3]:
import re

def remove_whitespace(text):
    """Collapse redundant whitespace and normalise the gap between sentences.

    Runs of spaces and tabs become one space, runs of newlines become one
    newline, and any whitespace that follows a sentence-ending period is
    replaced by exactly one space. A period counts as sentence-ending only
    when whitespace follows it, so decimals ("3.14"), ellipses and dotted
    tokens are left intact. The end of the text is preserved as given: a text
    without a final period does not gain one, and an empty text stays empty.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with no leading or trailing whitespace.
    """
    # Step 1: Replace multiple spaces and tabs with a single space
    text = re.sub(r'[ \t]+', ' ', text)

    # Step 2: Replace multiple new lines with a single new line
    text = re.sub(r'\n+', '\n', text)

    # Step 3: Split only where a period is followed by whitespace. Stripping
    # first keeps a final period from being mistaken for a sentence separator.
    sentences = re.split(r'\.\s+', text.strip())

    # Step 4: Strip each sentence, drop the ones that end up empty, and
    # reassemble with a single ". " between sentences.
    sentences = [sentence.strip() for sentence in sentences]
    cleaned_text = '. '.join(sentence for sentence in sentences if sentence)

    return cleaned_text

# Example usage: push a sample through add_whitespace, then clean it again
# with remove_whitespace. Re-seeding here makes this cell's output independent
# of how many random draws earlier cells consumed.
random.seed(42)
input_text = "This is a test sentence.\nHere is another one. This should add unnecessary whitespace."
output_text = add_whitespace(input_text)
print("Text with unnecessary whitespace:")
print(output_text)
print("\nCleaned text:")
# NOTE(review): `cleaned_text` is a str here; the CSV-processing cell further
# down rebinds the same name to a list.
cleaned_text = remove_whitespace(output_text)
print(cleaned_text)


Text with unnecessary whitespace:
This is a   test   sentence.
Here is another one.

This should   add unnecessary  whitespace.

Cleaned text:
This is a test sentence. Here is another one. This should add unnecessary whitespace.


In [4]:
# Load the labelled corpus from the Kaggle input directory
original_data = pd.read_csv("/kaggle/input/watermarked-and-unwatermarked-text-truncated/data_trunk.csv")

# Keep only the text column, as one single-column frame per label
labels = original_data["label"]
watermarked_data = original_data[labels == "watermarked"][["Generated Text"]]
unwatermarked_data = original_data[labels == "unwatermarked"][["Generated Text"]]

# Watermarked rows: perturb the whitespace, then clean the perturbed text.
# Both steps run inside one loop so the progress bar covers all the work.
modified_text, cleaned_text = [], []
for text in tqdm(watermarked_data["Generated Text"].tolist()):
    modified_text.append(add_whitespace(text))
    cleaned_text.append(remove_whitespace(modified_text[-1]))

watermarked_columns = {
    "Original Text": watermarked_data["Generated Text"],
    "Modified Text": modified_text,
    "Cleaned Text": cleaned_text,
    "label": "watermarked",
}
watermarked_df = pd.DataFrame(watermarked_columns)

# Unwatermarked rows: no perturbation is applied, so "Modified Text" is the
# original text itself; only the cleaning step runs.
modified_text = unwatermarked_data["Generated Text"].tolist()
cleaned_text = [remove_whitespace(text) for text in tqdm(modified_text)]

unwatermarked_columns = {
    "Original Text": unwatermarked_data["Generated Text"],
    "Modified Text": modified_text,
    "Cleaned Text": cleaned_text,
    "label": "unwatermarked",
}
unwatermarked_df = pd.DataFrame(unwatermarked_columns)

# Stack both classes under a fresh 0..n-1 index and persist the result
merged_df = pd.concat([watermarked_df, unwatermarked_df], ignore_index=True)
merged_df.to_csv("merged_data.csv", index=False)

print("Processing complete! Merged data saved to merged_data.csv")

100%|██████████| 1000/1000 [00:00<00:00, 6391.16it/s]
100%|██████████| 1000/1000 [00:00<00:00, 41111.76it/s]


Processing complete! Merged data saved to merged_data.csv


In [5]:
# Preview five random rows; the fixed random_state keeps the preview identical
# across "Restart & Run All" executions.
print(merged_df.sample(n=5, random_state=42))

                                         Original Text  \
568  The public will get daily updates about corona...   
475  Most butchers have their own customers to whom...   
960  The laboratory will test the quality levels of...   
595  Leaders are responsible for maintaining peace ...   
680  People should take caution before borrowing mo...   

                                         Modified Text  \
568  The public will   get daily updates   about co...   
475  Most   butchers have  their\town  customers  t...   
960  The   laboratory   will test  the  quality   l...   
595  Leaders are responsible  for   maintaining  pe...   
680  People  should  take   caution before borrowin...   

                                          Cleaned Text        label  
568  The public will get daily updates about corona...  watermarked  
475  Most butchers have their own customers to whom...  watermarked  
960  The laboratory will test the quality levels of...  watermarked  
595  Leaders are respo