<a href="https://colab.research.google.com/github/MK316/Getpp24/blob/main/wdata_0812.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# wdata process (7PM, 0812)

## [1] Step 01: text file to csv

+ Caution: two files are too large, so each one is split into two parts: **1471241a, 1471241b; 8671240a, 8671240b**
+ input file: getpp-written.txt
+ output file: wdata00.csv (with ID, Text columns)
+ 2916-2 files (2914 files)

In [None]:
import pandas as pd
import re
import nltk

# Ensure that the necessary NLTK resources are downloaded.
# 'punkt' serves older NLTK releases; NLTK >= 3.9 loads the sentence tokenizer
# from 'punkt_tab' instead. Requesting a package the installed release does not
# know only logs an error message, so asking for both is safe.
nltk.download('punkt')
nltk.download('punkt_tab')

# File path for the input and output files
input_file = '/content/getpp-written.txt'  # Replace with your actual file path
output_file = '/content/wdata00.csv'

# IDs whose text is too large for one row; each is split into an 'a' and a 'b' half
OVERSIZED_IDS = {'1471241', '8671240'}

# Rows of [ID, Text] collected for the output CSV
data = []

# Stream the file line by line instead of materialising every line first
with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        # Each record is "<ID> <text>": split on the first run of whitespace only
        id_text_split = re.split(r'\s+', line.strip(), maxsplit=1)

        if len(id_text_split) < 2:
            continue  # Skip if the line doesn't have both ID and Text parts

        # Drop every '@' marker from the ID token (not only a leading one)
        ID = id_text_split[0].replace('@', '')
        text = id_text_split[1]

        if ID not in OVERSIZED_IDS:
            data.append([ID, text])
            continue

        # Split the oversized text at the sentence nearest the middle
        sentences = nltk.sent_tokenize(text)
        mid_point = len(sentences) // 2

        if mid_point == 0:
            # Fewer than two sentences: halving would emit an empty 'a' row,
            # which reloads from CSV as NaN and breaks the later text cleaning.
            # Keep the record whole under its original ID instead.
            data.append([ID, text])
            continue

        data.append([f'{ID}a', ' '.join(sentences[:mid_point])])
        data.append([f'{ID}b', ' '.join(sentences[mid_point:])])

# Convert the list to a DataFrame
df = pd.DataFrame(data, columns=['ID', 'Text'])

# Save the DataFrame to a CSV file
df.to_csv(output_file, index=False, encoding='utf-8')

print(f"Processing complete. Data saved to {output_file}.")


## [2] Remove tags

In [None]:
import pandas as pd
import re
import nltk

# Ensure that the necessary NLTK resources are downloaded.
# 'punkt' serves older NLTK releases; NLTK >= 3.9 reads the tokenizer data from
# 'punkt_tab'. An unknown package id only logs an error, so requesting both is safe.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the existing CSV file
file_path = '/content/wdata00.csv'  # Update this to the path of your actual file
df = pd.read_csv(file_path, encoding='utf-8')

# Strip <h>/<p> markup from a text and count its word tokens
def process_text(text):
    """Return ``(clean_text, word_count)`` for one document.

    ``clean_text`` is ``text`` with every ``<h>``, ``</h>``, ``<p>`` and ``</p>``
    tag deleted; other tags are left alone. ``word_count`` is the number of
    tokens from ``nltk.word_tokenize`` that are purely alphanumeric, so
    punctuation, '@' markers and tokens containing hyphens or apostrophes are
    not counted.
    """
    stripped = re.sub(r'<\/?[hp]>', '', text)

    # Keep only tokens made entirely of letters/digits and count them
    words = [tok for tok in nltk.word_tokenize(stripped) if tok.isalnum()]

    return stripped, len(words)

# Apply processing to each row: process_text yields (clean_text, word_count)
# per document, and zip(*) regroups those pairs into the two column sequences.
df['Text'], df['Nword'] = zip(*df['Text'].apply(process_text))

# Save the modified DataFrame to a new CSV file (the input file is left untouched)
output_path = '/content/wdata01.csv'
df.to_csv(output_path, index=False, encoding='utf-8')

# Report the file that was actually written (output_path, not the input file_path)
print(f"Processing complete. Data saved to {output_path}.")


## [3] Remove white space before punctuation

Step 1: Remove the space before punctuation and save the cleaned text in a new column, `text`

In [None]:
import pandas as pd
import re

# Load wdata01.csv into `df`; the 'Text' column of this frame is what the
# space-removal step below reads.
file_path = '/content/wdata01.csv'  # Update this to the path of your actual file
df = pd.read_csv(file_path, encoding='utf-8')

# Close up whitespace that precedes a punctuation mark
def remove_space_before_punctuation(text):
    """Return ``text`` with whitespace runs before ``? . ! " , : ;`` removed.

    Any run of whitespace (spaces, tabs, newlines) immediately followed by one
    of those characters is deleted and the character itself is kept. Note that
    this applies to every double quote, opening ones included.
    """
    return re.sub(r'\s+([?.!",:;])', r'\1', text)

# Build the cleaned column: 'Text' keeps the original, 'text' holds the version
# with whitespace before punctuation removed.
df['text'] = df['Text'].map(remove_space_before_punctuation)

# Persist the frame, now including the 'text' column, for Step 2
df.to_csv(
    '/content/wdata02_step1.csv',
    index=False,
    encoding='utf-8',
)

print("Step 1 complete. Data saved with cleaned text to 'wdata02_step1.csv'.")


Step 2: Split the text by sentences, count the number of sentences, and filter out sentences that are only symbols/punctuation

In [None]:
import nltk
# 'punkt' serves older NLTK releases; NLTK >= 3.9 reads the sentence tokenizer
# data from 'punkt_tab'. An unknown package id only logs an error, so requesting
# both is safe.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the CSV file with the cleaned text
df = pd.read_csv('/content/wdata02_step1.csv', encoding='utf-8')

# Sentence-split a text, keep only sentences that contain a word, and count them
def split_and_count_sentences(text):
    """Return ``(valid_sentences, sentence_count)`` for one document.

    The text is split with ``nltk.sent_tokenize``. A sentence is kept only if at
    least one of its ``nltk.word_tokenize`` tokens is purely alphanumeric, which
    drops sentences made up solely of symbols or punctuation.
    ``sentence_count`` is the number of sentences kept.
    """
    def has_word(sentence):
        # True as soon as one alphanumeric token is found
        for token in nltk.word_tokenize(sentence):
            if token.isalnum():
                return True
        return False

    kept = list(filter(has_word, nltk.sent_tokenize(text)))
    return kept, len(kept)

# Run the splitter on every cleaned text; each result is a
# (valid_sentences, sentence_count) pair.
split_results = df['text'].apply(split_and_count_sentences)

# Regroup the per-row pairs into one sequence per output column
sentence_lists, sentence_counts = zip(*split_results)
df['Sentences'] = sentence_lists
df['Nsent'] = sentence_counts

# Write the frame with the added 'Sentences' and 'Nsent' columns
df.to_csv(
    '/content/wdata02.csv',
    index=False,
    encoding='utf-8',
)

print("Step 2 complete. Data saved with sentences and sentence count to 'wdata02.csv'.")


## [4] Passive sentences: count and list

+ input: wdata02.csv
+ output: wdata03.csv

In [None]:
# Sanity check on the frame currently held in `df`.
# NOTE(review): len() is applied to each 'Sentences' value, so the total is a
# sentence count only while those values are lists (as produced in memory by the
# sentence-splitting step); after a reload from CSV they would be strings.
n_rows = len(df)
print(f"Number of rows: {n_rows}")
n_sentences = df['Sentences'].apply(len).sum()
print(f"Total number of sentences: {n_sentences}")


In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn

# Ensure that the necessary NLTK resources are downloaded.
# The first name of each pair serves older NLTK releases; NLTK >= 3.9 loads the
# tokenizer and tagger from the '_tab' / '_eng' packages instead. An unknown
# package id only logs an error, so requesting both variants is safe.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Load the dataset.
# Malformed rows are still skipped, but 'warn' reports each one so that rows
# missing from the passive counts do not go unnoticed.
file_path = '/content/wdata02.csv'  # Update this to the path of your actual file
df = pd.read_csv(file_path, on_bad_lines='warn', encoding='utf-8')

# Tag one word on its own and report whether the tagger calls it a past participle
def is_past_participle(word):
    """Return True if ``pos_tag`` labels the isolated ``word`` as ``'VBN'``.

    The word is tagged as a one-token sequence, so no sentence context is
    available to the tagger.
    """
    (_, tag), = pos_tag([word])
    return tag == 'VBN'

# Function to detect passive voice with 'be' verb
def contains_bepp(sentence, be_forms=('is', 'are', 'was', 'were', 'be', 'been', 'being')):
    """Return True if ``sentence`` contains a 'be' form directly followed by a past participle.

    Parameters
    ----------
    sentence : str
        The sentence to inspect.
    be_forms : collection of str, optional
        Lower-case auxiliary forms to match (case-insensitively). The default
        is the original hard-coded list; it does not include 'am' or
        contracted forms.

    Returns
    -------
    bool
        True when some token in ``be_forms`` is immediately followed by a token
        tagged ``'VBN'``; False otherwise, including for empty input.

    The participle check uses the tag assigned while tagging the whole
    sentence. Previously the following word was re-tagged on its own, which
    discarded the sentence context the tagger relies on and ran the tagger a
    second time for every candidate.
    """
    tagged = pos_tag(word_tokenize(sentence))

    # Walk adjacent (token, next token) pairs of the tagged sentence
    return any(
        word.lower() in be_forms and next_tag == 'VBN'
        for (word, _), (_, next_tag) in zip(tagged, tagged[1:])
    )

# Function to detect passive voice with 'get' verb
def contains_getpp(sentence, get_forms=('get',)):
    """Return True if ``sentence`` contains a 'get' form directly followed by a past participle.

    Parameters
    ----------
    sentence : str
        The sentence to inspect.
    get_forms : collection of str, optional
        Lower-case verb forms to match (case-insensitively). The default
        matches only the base form 'get', as before; pass e.g.
        ``('get', 'gets', 'got', 'getting', 'gotten')`` to cover inflections.

    Returns
    -------
    bool
        True when some token in ``get_forms`` is immediately followed by a
        token tagged ``'VBN'``; False otherwise, including for empty input.

    The participle check uses the tag assigned while tagging the whole
    sentence. Previously the following word was re-tagged on its own, which
    discarded the sentence context the tagger relies on and ran the tagger a
    second time for every candidate.
    """
    tagged = pos_tag(word_tokenize(sentence))

    # Walk adjacent (token, next token) pairs of the tagged sentence
    return any(
        word.lower() in get_forms and next_tag == 'VBN'
        for (word, _), (_, next_tag) in zip(tagged, tagged[1:])
    )

# Collect the passive sentences from a list and tally them by type
def find_passives(sentences):
    """Return ``(passive_sentences, bepp_count, getpp_count)`` for ``sentences``.

    Each sentence is tested with ``contains_bepp`` first and, only if that
    fails, with ``contains_getpp``; a sentence matching both is therefore
    counted as a be-passive only. ``passive_sentences`` lists every matching
    sentence in input order.
    """
    passive_sentences, bepp_count, getpp_count = [], 0, 0

    for sentence in sentences:
        if contains_bepp(sentence):
            bepp_count += 1
        elif contains_getpp(sentence):
            getpp_count += 1
        else:
            continue  # not passive: nothing to record
        passive_sentences.append(sentence)

    return passive_sentences, bepp_count, getpp_count

import ast

# Process in batches
batch_size = 100  # You can adjust the batch size according to your system's capacity

# Initialize a file for storing results
output_path = '/content/wdata03.csv'
header = True  # True only for the first batch: write the header and start a fresh file

for start in range(0, len(df), batch_size):
    # Clamp so the progress message reports the real last row of the final batch
    end = min(start + batch_size, len(df))
    df_batch = df.iloc[start:end].copy()  # Create a copy of the batch to work on

    # 'Sentences' was stored in the CSV as the text form of a Python list.
    # ast.literal_eval parses that literal without executing arbitrary code,
    # unlike eval() on file content.
    df_batch['Passives'], df_batch['Bepp'], df_batch['Getpp'] = zip(
        *df_batch['Sentences'].apply(lambda x: find_passives(ast.literal_eval(x)))
    )

    # The first batch overwrites any file left by an earlier run; later batches
    # append. Appending unconditionally made a re-run duplicate every row and
    # insert a second header line in the middle of the file.
    df_batch.to_csv(
        output_path,
        mode='w' if header else 'a',
        header=header,
        index=False,
        encoding='utf-8',
    )

    # After the first write, subsequent writes should not include the header
    header = False

    print(f"Processed rows {start} to {end}")

print("Processing complete and saved to 'wdata03.csv'.")


## [5] Results: Getting bepp and getpp list and counts.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn

# Ensure that the necessary NLTK resources are downloaded.
# The first name of each pair serves older NLTK releases; NLTK >= 3.9 loads the
# tokenizer and tagger from the '_tab' / '_eng' packages instead. An unknown
# package id only logs an error, so requesting both variants is safe.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Load the dataset.
# Malformed rows are still skipped, but 'warn' reports each one so that rows
# missing from the passive counts do not go unnoticed.
file_path = '/content/wdata02.csv'  # Update this to the path of your actual file
df = pd.read_csv(file_path, on_bad_lines='warn', encoding='utf-8')

# Tag one word on its own and report whether the tagger calls it a past participle
def is_past_participle(word):
    """Return True if ``pos_tag`` labels the isolated ``word`` as ``'VBN'``.

    The word is tagged as a one-token sequence, so no sentence context is
    available to the tagger.
    """
    (_, tag), = pos_tag([word])
    return tag == 'VBN'

# Function to detect passive voice with 'be' verb
def contains_bepp(sentence, be_forms=('is', 'are', 'was', 'were', 'be', 'been', 'being')):
    """Return True if ``sentence`` contains a 'be' form directly followed by a past participle.

    Parameters
    ----------
    sentence : str
        The sentence to inspect.
    be_forms : collection of str, optional
        Lower-case auxiliary forms to match (case-insensitively). The default
        is the original hard-coded list; it does not include 'am' or
        contracted forms.

    Returns
    -------
    bool
        True when some token in ``be_forms`` is immediately followed by a token
        tagged ``'VBN'``; False otherwise, including for empty input.

    The participle check uses the tag assigned while tagging the whole
    sentence. Previously the following word was re-tagged on its own, which
    discarded the sentence context the tagger relies on and ran the tagger a
    second time for every candidate.
    """
    tagged = pos_tag(word_tokenize(sentence))

    # Walk adjacent (token, next token) pairs of the tagged sentence
    return any(
        word.lower() in be_forms and next_tag == 'VBN'
        for (word, _), (_, next_tag) in zip(tagged, tagged[1:])
    )

# Function to detect passive voice with 'get' verb
def contains_getpp(sentence, get_forms=('get',)):
    """Return True if ``sentence`` contains a 'get' form directly followed by a past participle.

    Parameters
    ----------
    sentence : str
        The sentence to inspect.
    get_forms : collection of str, optional
        Lower-case verb forms to match (case-insensitively). The default
        matches only the base form 'get', as before; pass e.g.
        ``('get', 'gets', 'got', 'getting', 'gotten')`` to cover inflections.

    Returns
    -------
    bool
        True when some token in ``get_forms`` is immediately followed by a
        token tagged ``'VBN'``; False otherwise, including for empty input.

    The participle check uses the tag assigned while tagging the whole
    sentence. Previously the following word was re-tagged on its own, which
    discarded the sentence context the tagger relies on and ran the tagger a
    second time for every candidate.
    """
    tagged = pos_tag(word_tokenize(sentence))

    # Walk adjacent (token, next token) pairs of the tagged sentence
    return any(
        word.lower() in get_forms and next_tag == 'VBN'
        for (word, _), (_, next_tag) in zip(tagged, tagged[1:])
    )

# Sort the passive sentences of a list into be-passives and get-passives
def find_passives(sentences):
    """Return ``(be_passives, get_passives, bepp_count, getpp_count)``.

    Each sentence is tested with ``contains_bepp`` first and, only if that
    fails, with ``contains_getpp``; a sentence matching both therefore lands in
    the be-passive list only. Both lists keep input order, and each count is
    the length of the corresponding list.
    """
    be_passive_sentences = []
    get_passive_sentences = []

    for sentence in sentences:
        if contains_bepp(sentence):
            be_passive_sentences.append(sentence)
        elif contains_getpp(sentence):
            get_passive_sentences.append(sentence)

    return (
        be_passive_sentences,
        get_passive_sentences,
        len(be_passive_sentences),
        len(get_passive_sentences),
    )

import ast

# Process in batches
batch_size = 100  # You can adjust the batch size according to your system's capacity

# Initialize a file for storing results
output_path = '/content/passive-result-w01.csv'
header = True  # True only for the first batch: write the header and start a fresh file

for start in range(0, len(df), batch_size):
    # Clamp so the progress message reports the real last row of the final batch
    end = min(start + batch_size, len(df))
    df_batch = df.iloc[start:end].copy()  # Create a copy of the batch to work on

    # 'Sentences' was stored in the CSV as the text form of a Python list.
    # ast.literal_eval parses that literal without executing arbitrary code,
    # unlike eval() on file content.
    df_batch['BePassive'], df_batch['GetPassive'], df_batch['Bepp'], df_batch['Getpp'] = zip(
        *df_batch['Sentences'].apply(lambda x: find_passives(ast.literal_eval(x)))
    )

    # The first batch overwrites any file left by an earlier run; later batches
    # append. Appending unconditionally made a re-run duplicate every row and
    # insert a second header line, which corrupts the numeric count columns
    # when the file is read back.
    df_batch.to_csv(
        output_path,
        mode='w' if header else 'a',
        header=header,
        index=False,
        encoding='utf-8',
    )

    # After the first write, subsequent writes should not include the header
    header = False

    print(f"Processed rows {start} to {end}")

print("Processing complete and saved to 'passive-result-w01.csv'.")


## [6] Descriptive statistics

In [None]:
import pandas as pd

# Read back the per-document passive results
file_path = '/content/passive-result-w01.csv'  # Update this to the path of your actual file
df = pd.read_csv(file_path, encoding='utf-8')

# Corpus-wide totals: sum the per-document counts of each passive type
total_bepp = df['Bepp'].sum()
total_getpp = df['Getpp'].sum()

# One summary row per passive type
summary_df = pd.DataFrame(
    [('Bepp', total_bepp), ('Getpp', total_getpp)],
    columns=['Type', 'Total Count'],
)

# Display the summary
print(summary_df)

# Optionally, save the summary to a new CSV file
summary_output_path = '/content/passive_summary.csv'  # Update this to the desired output file path
summary_df.to_csv(
    summary_output_path,
    index=False,
    encoding='utf-8',
)

print(f"Summary saved to '{summary_output_path}'")


# Part II. Data analysis