In [None]:
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import PercentFormatter

# Characters that count as line-final punctuation.
# NOTE(review): only the straight double quote is listed; add curly quotes
# here if the corpus keeps them.
punctuation_marks = set('.;:"()!?—,')


def unpunctuated_ratio(line_texts, marks=None):
    """Return the share of non-blank lines that do not end in punctuation.

    Args:
        line_texts: Iterable of raw line strings. Leading and trailing
            whitespace is ignored, and lines that are blank after stripping
            are left out of both the numerator and the denominator.
        marks: Collection of single characters treated as punctuation.
            Defaults to the module-level ``punctuation_marks``.

    Returns:
        A float between 0 and 1, or ``None`` when there is no non-blank
        line (no ratio is defined in that case).
    """
    if marks is None:
        marks = punctuation_marks
    stripped_lines = (raw.strip() for raw in line_texts)
    final_chars = [text[-1] for text in stripped_lines if text]
    if not final_chars:
        return None
    unpunctuated = sum(1 for char in final_chars if char not in marks)
    return unpunctuated / len(final_chars)


# The guard keeps the file I/O and plotting from running on import; in a
# notebook cell or when run as a script, __name__ is "__main__" and
# everything below executes as before, leaving its variables at module level.
if __name__ == "__main__":
    first_year, last_year = 1800, 1900

    # Load the tagged XML
    with open('/content/Both Volumes tagged_normalized.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    soup = BeautifulSoup(text, 'html.parser')

    # One entry per poem: its year and its un-punctuated line-ending ratio
    poem_dates = []
    punctuation_ratios = []

    for poem in soup.find_all('poem'):
        date_tag = poem.find('date')
        if date_tag is None:
            continue
        try:
            year = int(date_tag.text.strip())
        except ValueError:
            # Dates that are not a plain integer year are skipped.
            continue
        if not (first_year <= year <= last_year):
            continue

        ratio = unpunctuated_ratio(line.text for line in poem.find_all('line'))
        if ratio is None:
            continue

        poem_dates.append(year)
        punctuation_ratios.append(ratio)

    # A trend needs at least two distinct x values; fail with a clear message
    # instead of an opaque error from inside the fitting routine.
    if len(set(poem_dates)) < 2:
        raise ValueError(
            f"Need poems from at least two distinct years between "
            f"{first_year} and {last_year} to fit a trend."
        )

    # Convert to NumPy arrays
    x = np.array(poem_dates)
    y = np.array(punctuation_ratios)

    # Polynomial.fit rescales the years to [-1, 1] internally. A raw-power
    # fit on values near 1850 is badly conditioned at degree 8.
    degree = 8
    poly_fit = np.polynomial.Polynomial.fit(x, y, degree)
    linear_fit = np.polynomial.Polynomial.fit(x, y, 1)

    # Evaluate only over the observed years: a high-degree polynomial is
    # meaningless outside the range it was fitted on.
    x_smooth = np.linspace(x.min(), x.max(), 500)
    y_smooth = poly_fit(x_smooth)
    y_linear = linear_fit(x_smooth)

    # Plotting
    plt.figure(figsize=(12, 6))
    plt.scatter(x, y, color='skyblue', alpha=0.4, label='Individual Poems')
    plt.plot(x_smooth, y_smooth, color='darkorange', linewidth=2, label=f'Polynomial Fit (deg {degree})')
    plt.plot(x_smooth, y_linear, color='green', linewidth=2, linestyle='--', label='Linear Fit')

    plt.title("Proportion of Un-Punctuated Line-Endings by Poem (1800–1900)")
    plt.xlabel("Year")
    # The plotted ratio is the share of lines WITHOUT final punctuation.
    plt.ylabel("Proportion of Lines Ending Without Punctuation")
    plt.ylim(0.1, 0.5)  # Only show the 0.1 to 0.5 range
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1.0))
    plt.grid(True, linestyle='--', alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import spacy
from matplotlib.ticker import PercentFormatter

# Characters that count as line-final punctuation.
punctuation_marks = set('.;:"()!?—,')

# Coarse spaCy part-of-speech tags counted as a "verb ending".
# NOTE(review): recent spaCy English models tag auxiliaries ("is", "have",
# "will") as AUX, so they are not counted here; add "AUX" to include them.
verb_pos_tags = {"VERB"}


def unpunctuated_lines(line_texts, marks=None):
    """Return the stripped, non-blank lines that do not end in punctuation.

    Args:
        line_texts: Iterable of raw line strings.
        marks: Collection of single characters treated as punctuation.
            Defaults to the module-level ``punctuation_marks``.

    Returns:
        A list of whitespace-stripped lines, in input order, whose last
        character is not in ``marks``. Blank lines are dropped.
    """
    if marks is None:
        marks = punctuation_marks
    stripped_lines = (raw.strip() for raw in line_texts)
    return [text for text in stripped_lines if text and text[-1] not in marks]


# The guard keeps the model load, file I/O and plotting from running on
# import; in a notebook cell or when run as a script, __name__ is "__main__"
# and everything below executes as before, leaving its variables at module
# level.
if __name__ == "__main__":
    # Load spaCy English model
    import en_core_web_sm
    nlp = en_core_web_sm.load()

    first_year, last_year = 1800, 1900

    # Load the tagged XML
    with open('/content/Both Volumes tagged_normalized.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    soup = BeautifulSoup(text, 'html.parser')

    # One entry per poem: its year and its verb-ending ratio
    poem_dates = []
    verb_ending_ratios = []

    for poem in soup.find_all('poem'):
        date_tag = poem.find('date')
        if date_tag is None:
            continue
        try:
            year = int(date_tag.text.strip())
        except ValueError:
            # Dates that are not a plain integer year are skipped.
            continue
        if not (first_year <= year <= last_year):
            continue

        # Only lines NOT ending in punctuation are considered
        candidates = unpunctuated_lines(line.text for line in poem.find_all('line'))

        total_valid = 0
        verb_ending_count = 0

        # nlp.pipe processes the poem's lines as a batch and yields one Doc
        # per input text, in order.
        for doc in nlp.pipe(candidates):
            tokens = [token for token in doc if not token.is_space]
            if not tokens:
                continue
            total_valid += 1
            if tokens[-1].pos_ in verb_pos_tags:
                verb_ending_count += 1

        if total_valid == 0:
            continue

        poem_dates.append(year)
        verb_ending_ratios.append(verb_ending_count / total_valid)

    # A trend needs at least two distinct x values; fail with a clear message
    # instead of an opaque error from inside the fitting routine.
    if len(set(poem_dates)) < 2:
        raise ValueError(
            f"Need poems from at least two distinct years between "
            f"{first_year} and {last_year} to fit a trend."
        )

    # Convert to NumPy arrays
    x = np.array(poem_dates)
    y = np.array(verb_ending_ratios)

    # Polynomial.fit rescales the years to [-1, 1] internally. A raw-power
    # fit on values near 1850 is badly conditioned at degree 8.
    degree = 8
    poly_fit = np.polynomial.Polynomial.fit(x, y, degree)
    linear_fit = np.polynomial.Polynomial.fit(x, y, 1)

    # Evaluate only over the observed years: a high-degree polynomial is
    # meaningless outside the range it was fitted on.
    x_smooth = np.linspace(x.min(), x.max(), 500)
    y_smooth = poly_fit(x_smooth)
    y_linear = linear_fit(x_smooth)

    # Plotting
    plt.figure(figsize=(12, 6))
    plt.scatter(x, y, color='skyblue', alpha=0.4, label='Individual Poems')
    plt.plot(x_smooth, y_smooth, color='darkorange', linewidth=2, label=f'Polynomial Fit (deg {degree})')
    plt.plot(x_smooth, y_linear, color='green', linewidth=2, linestyle='--', label='Linear Fit')

    plt.title("Proportion of Un-Punctuated Lines Ending in Verbs by Poem (1800–1900)")
    plt.xlabel("Year")
    plt.ylabel("Proportion of Verb-Endings")
    plt.ylim(0, 0.4)
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1.0))
    plt.grid(True, linestyle='--', alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.show()
