In [2]:
import re
import pandas as pd

# Path to the transcript .txt file
file_path = "gaza_debate_data.txt"


# Read the whole transcript into one string.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default; "utf-8-sig" reads plain UTF-8 and also drops a
# leading byte-order mark, which would otherwise stay attached to the first
# line and stop it from being recognised as a "Speaker X:" line.
with open(file_path, 'r', encoding="utf-8-sig") as file:
    raw_data = file.read()

# Split the transcript into individual lines
lines = raw_data.strip().split('\n')

# One dict per utterance, in order of appearance: speaker, turn_index, text
data = []
# Running count of turns per speaker, keyed by the upper-cased speaker letter
turn_index = {}

# Matches lines like "Speaker A: some text" (case-insensitive):
#   group(2) is the single-letter speaker label,
#   group(3) is whatever text follows the colon on the same line.
speaker_pattern = re.compile(r"^(Speaker\s+([A-Z]):)\s*(.*)$", re.IGNORECASE)

current_speaker = None
current_text = []


def _store_utterance(speaker, fragments):
    """Append one finished utterance to ``data`` and count the speaker's turn.

    Args:
        speaker: Speaker letter as captured by ``speaker_pattern``; it is
            upper-cased so "a" and "A" are counted as the same speaker.
        fragments: The lines (already stripped) that make up the utterance.
            Empty fragments, e.g. from blank lines in the transcript, are
            skipped so the joined text has no doubled spaces.
    """
    speaker_char = speaker.upper()
    turn_index[speaker_char] = turn_index.get(speaker_char, 0) + 1
    data.append({
        'speaker': speaker_char,
        'turn_index': turn_index[speaker_char],
        'text': ' '.join(part for part in fragments if part)
    })


for line in lines:
    # Strip before matching so an indented "Speaker X:" line still starts a
    # new turn instead of being merged into the previous speaker's text.
    stripped = line.strip()
    match = speaker_pattern.match(stripped)
    if match:
        # A new speaker line closes the utterance collected so far, if any
        if current_speaker is not None:
            _store_utterance(current_speaker, current_text)

        # Start a new utterance with the text found on the speaker line.
        # Any lines seen before the first speaker line are discarded here.
        current_speaker = match.group(2)
        current_text = [match.group(3)]
    else:
        # Continuation of the current speaker's utterance
        current_text.append(stripped)

# The last utterance has no following speaker line to close it
if current_speaker is not None:
    _store_utterance(current_speaker, current_text)

# Convert to a DataFrame.
# The columns are named explicitly so that an input with no recognised
# speaker lines still yields a frame (and a CSV header) with the expected
# columns instead of a column-less frame.
df = pd.DataFrame(data, columns=['speaker', 'turn_index', 'text'])

# Display the DataFrame and save it as CSV without the row index
print(df)
output_file = "processed_dialogue.csv"
df.to_csv(output_file, index=False)
print(f"Processed data saved to {output_file}")


    speaker  turn_index                                               text
0         A           1  Welcome to Open to Debate. I'm John Donvan and...
1         B           1                           Thank you for having me.
2         A           2  And gentlemen, thanks each of you for being he...
3         C           1                         Glad to take this offline.
4         B           2                                         Thank you.
..      ...         ...                                                ...
246       B          82  It's a great question, John, because you work ...
247       A          80                                                 I.
248       E           4         Political reporter for the New York Times.
249       B          83  You were at one time a media columnist. You sh...
250       A          81  All right, all right, all right, all right. I ...

[251 rows x 3 columns]
Processed data saved to processed_dialogue.csv
