In [2]:
import pandas as pd
import numpy as np

# Load the new master dataset
df = pd.read_csv('master_dataset_final.csv')

print("="*100)
print(" "*30 + "MASTER DATASET EXPLORATION")
print("="*100)

print(f"\nTotal number of records: {len(df)}")
print(f"Total number of columns: {len(df.columns)}")

print("\n" + "="*100)
print("COLUMN NAMES AND TYPES")
print("="*100)
print(df.dtypes)

print("\n" + "="*100)
print("FIRST 5 ROWS")
print("="*100)
print(df.head())

print("\n" + "="*100)
print("KEY COLUMN VALUE COUNTS")
print("="*100)

# Check all object columns for their unique values
for col in df.columns:
    if df[col].dtype == 'object':
        unique_count = df[col].nunique()
        if unique_count < 100:  # Show columns with reasonable number of unique values
            print(f"\n{col} (unique values: {unique_count}):")
            print(df[col].value_counts().head(15))
        else:
            print(f"\n{col}: {unique_count} unique values (showing top 10):")
            print(df[col].value_counts().head(10))

print("\n" + "="*100)
print("MISSING VALUES")
print("="*100)
print(df.isnull().sum())

print("\n" + "="*100)
print("CHECKING FOR URL/LINK COLUMNS")
print("="*100)
url_cols = [col for col in df.columns if 'url' in col.lower() or 'link' in col.lower() or 'href' in col.lower()]
print(f"URL-related columns found: {url_cols}")
if url_cols:
    for col in url_cols:
        print(f"\nSample {col}:")
        print(df[col].head(3).to_string())

print("\n" + "="*100)
print("LANGUAGE DISTRIBUTION (if available)")
print("="*100)
lang_cols = [col for col in df.columns if 'lang' in col.lower()]
if lang_cols:
    for col in lang_cols:
        print(f"\n{col}:")
        print(df[col].value_counts())
else:
    print("No explicit language column found. Will need to check content or other indicators.")

print("\n" + "="*100)
print("CHECKING FOR DISCOURSE TYPE / CATEGORY")
print("="*100)
type_cols = [col for col in df.columns if 'type' in col.lower() or 'category' in col.lower() or 'genre' in col.lower()]
if type_cols:
    for col in type_cols:
        print(f"\n{col}:")
        print(df[col].value_counts())

print("\n" + "="*100)
print("SUMMARY")
print("="*100)
print(f"Total records: {len(df)}")
print(f"Columns available: {list(df.columns)}")
print("\nNext steps:")
print("1. Identify which column contains the URLs for scraping")
print("2. Filter for letters (both English and Bengali)")
print("3. Scrape HTML content from URLs")
print("4. Clean and extract text content")

                              MASTER DATASET EXPLORATION

Total number of records: 1466
Total number of columns: 19

COLUMN NAMES AND TYPES
Index            object
volume          float64
Name             object
Category         object
Type             object
Date             object
Place            object
Source           object
Audience         object
Language         object
section          object
genre            object
href             object
full_url         object
filename         object
has_html           bool
has_date           bool
has_place          bool
has_audience       bool
dtype: object

FIRST 5 ROWS
  Index  volume                                   Name  \
0     1     1.0                           Introduction   
1     2     1.0                    Response to Welcome   
2     3     1.0                        Why We Disagree   
3     4     1.0                      Paper on Hinduism   
4     5     1.0  Religion not the Crying Need of India   

                           

In [3]:
# Filter for all epistles (letters)
letters_df = df[df['genre'] == 'Letters'].copy()

print("="*100)
print(" "*30 + "LETTERS DATASET - INITIAL FILTERING")
print("="*100)

print(f"\nTotal letters extracted: {len(letters_df)}")
print(f"Original dataset size: {len(df)}")
print(f"Percentage of letters: {len(letters_df)/len(df)*100:.1f}%")

print("\n" + "="*100)
print("CHECKING FOR DUPLICATES (by full_url)")
print("="*100)

# Check for duplicates
duplicate_count = letters_df['full_url'].duplicated().sum()
print(f"Number of duplicate URLs: {duplicate_count}")

if duplicate_count > 0:
    print(f"\nDuplicate URLs found:")
    duplicate_urls = letters_df[letters_df['full_url'].duplicated(keep=False)].sort_values('full_url')
    print(duplicate_urls[['Index', 'Name', 'Audience', 'Date', 'Language', 'full_url']].to_string())

    # Remove duplicates, keeping first occurrence
    letters_df_deduped = letters_df.drop_duplicates(subset='full_url', keep='first')
    print(f"\nâœ“ Removed {duplicate_count} duplicates. New count: {len(letters_df_deduped)}")
else:
    letters_df_deduped = letters_df.copy()
    print("âœ“ No duplicates found!")

print("\n" + "="*100)
print("LANGUAGE DISTRIBUTION")
print("="*100)
print(letters_df_deduped['Language'].value_counts())
print(f"\nTotal languages: {letters_df_deduped['Language'].nunique()}")

print("\n" + "="*100)
print("TYPE DISTRIBUTION")
print("="*100)
print(letters_df_deduped['Type'].value_counts())

print("\n" + "="*100)
print("CATEGORY DISTRIBUTION")
print("="*100)
print(letters_df_deduped['Category'].value_counts())

print("\n" + "="*100)
print("SECTION DISTRIBUTION")
print("="*100)
print(letters_df_deduped['section'].value_counts())

print("\n" + "="*100)
print("MISSING VALUES IN LETTERS DATASET")
print("="*100)
print(letters_df_deduped.isnull().sum())

print("\n" + "="*100)
print("URL STATUS CHECK")
print("="*100)
missing_urls = letters_df_deduped['full_url'].isnull().sum()
print(f"Letters with missing URLs: {missing_urls}")
if missing_urls > 0:
    print("\nLetters without URLs:")
    print(letters_df_deduped[letters_df_deduped['full_url'].isnull()][['Index', 'Name', 'Audience', 'Date', 'Language']].to_string())

print("\n" + "="*100)
print("LANGUAGE BREAKDOWN BY TYPE")
print("="*100)
lang_type_crosstab = pd.crosstab(letters_df_deduped['Language'], letters_df_deduped['Type'])
print(lang_type_crosstab)

print("\n" + "="*100)
print("FINAL SUMMARY")
print("="*100)
print(f"""
ðŸ“Š Total unique letters (deduplicated): {len(letters_df_deduped)}
ðŸ“Š Duplicates removed: {duplicate_count}
ðŸ“Š Letters with valid URLs: {len(letters_df_deduped) - missing_urls}
ðŸ“Š Letters missing URLs: {missing_urls}

Language breakdown:
{letters_df_deduped['Language'].value_counts().to_string()}

Type breakdown:
{letters_df_deduped['Type'].value_counts().to_string()}
""")

# Store the deduplicated dataset
letters_final = letters_df_deduped.copy()
print(f"\nâœ“ Deduplicated letters dataset created: 'letters_final' with {len(letters_final)} records")
print("="*100)

                              LETTERS DATASET - INITIAL FILTERING

Total letters extracted: 774
Original dataset size: 1466
Percentage of letters: 52.8%

CHECKING FOR DUPLICATES (by full_url)
Number of duplicate URLs: 12

Duplicate URLs found:
     Index                    Name                    Audience         Date Language                                                                                                  full_url
502    486                XXII Sir           Mitra, Pramadadas   1890/02/19  Bengali           https://www.ramakrishnavivekananda.info/vivekananda/volume_6/epistles_second_series/022_sir.htm
503    487      XXIII Akhandananda         Akhandananda, Swami  1890/02/19A  Bengali           https://www.ramakrishnavivekananda.info/vivekananda/volume_6/epistles_second_series/022_sir.htm
741    724                    Note                         NaN          NaN      NaN               https://www.ramakrishnavivekananda.info/vivekananda/volume_7/epistles_third_series/n

In [4]:
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm

print("="*100)
print(" "*30 + "HTML CONTENT SCRAPING")
print("="*100)

# Initialize column for HTML content
letters_final['html_content'] = None

# Function to scrape HTML content
def scrape_html(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except Exception as e:
        return None

# Scrape HTML content for all letters
print(f"\nScraping HTML content from {len(letters_final)} URLs...")
print("This may take a few minutes...\n")

success_count = 0
fail_count = 0

for idx, row in tqdm(letters_final.iterrows(), total=len(letters_final)):
    html_content = scrape_html(row['full_url'])
    if html_content:
        letters_final.at[idx, 'html_content'] = html_content
        success_count += 1
    else:
        fail_count += 1

    # Add a small delay to be respectful to the server
    time.sleep(0.1)

print("\n" + "="*100)
print("SCRAPING RESULTS")
print("="*100)
print(f"âœ“ Successfully scraped: {success_count}")
print(f"âœ— Failed to scrape: {fail_count}")
print(f"Success rate: {success_count/len(letters_final)*100:.1f}%")

# Check for missing HTML content
missing_html = letters_final['html_content'].isnull().sum()
print(f"\nLetters with missing HTML content: {missing_html}")

print("\n" + "="*100)
print("SAMPLE HTML CONTENT - Example 1 (English Letter)")
print("="*100)
# Find first English letter
english_letter = letters_final[letters_final['Language'] == 'English'].iloc[0]
print(f"Letter: {english_letter['Name']}")
print(f"To: {english_letter['Audience']}")
print(f"Date: {english_letter['Date']}")
print(f"URL: {english_letter['full_url']}")
print(f"\nFirst 1500 characters of HTML:")
print(english_letter['html_content'][:1500] if english_letter['html_content'] else "No content")

print("\n" + "="*100)
print("SAMPLE HTML CONTENT - Example 2 (Bengali Letter)")
print("="*100)
# Find first Bengali letter
bengali_letter = letters_final[letters_final['Language'] == 'Bengali'].iloc[0]
print(f"Letter: {bengali_letter['Name']}")
print(f"To: {bengali_letter['Audience']}")
print(f"Date: {bengali_letter['Date']}")
print(f"URL: {bengali_letter['full_url']}")
print(f"\nFirst 1500 characters of HTML:")
print(bengali_letter['html_content'][:1500] if bengali_letter['html_content'] else "No content")

print("\n" + "="*100)
print("SAMPLE HTML CONTENT - Example 3 (Another English Letter)")
print("="*100)
# Find a different English letter
english_letter2 = letters_final[letters_final['Language'] == 'English'].iloc[10]
print(f"Letter: {english_letter2['Name']}")
print(f"To: {english_letter2['Audience']}")
print(f"Date: {english_letter2['Date']}")
print(f"URL: {english_letter2['full_url']}")
print(f"\nFirst 1500 characters of HTML:")
print(english_letter2['html_content'][:1500] if english_letter2['html_content'] else "No content")

print("\n" + "="*100)
print("HTML LENGTH STATISTICS")
print("="*100)
# Calculate HTML lengths
letters_final['html_length'] = letters_final['html_content'].apply(lambda x: len(x) if x else 0)
print(letters_final['html_length'].describe())

print("\nâœ“ HTML scraping complete!")
print("="*100)

                              HTML CONTENT SCRAPING

Scraping HTML content from 762 URLs...
This may take a few minutes...



100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 762/762 [03:19<00:00,  3.82it/s]


SCRAPING RESULTS
âœ“ Successfully scraped: 762
âœ— Failed to scrape: 0
Success rate: 100.0%

Letters with missing HTML content: 0

SAMPLE HTML CONTENT - Example 1 (English Letter)
Letter: II Panditji Maharaj
To: Shankarlal, Pandit
Date: 1892/09/20
URL: https://www.ramakrishnavivekananda.info/vivekananda/volume_5/epistles_first_series/002_panditji_maharaj.htm

First 1500 characters of HTML:
<!DOCTYPE html>
<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<link rel="stylesheet" type="text/css" href="../../../main.css"><title>II Panditji Maharaj</title>

<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>
<script src="/search.js"></script>
</head>
<body>
<div>
<p class="nav"><a href="001_fakir.htm" class="arrow">&#8592;</a>
<a href="../../../index.htm">Home</a>
/ <a href="../../complete_works.htm">Complete-Works</a>
/ <a href="../volume_5_contents.htm">Volume 5</a>
/&nbsp;<a href="epistles_first_series_contents.htm




In [19]:
from bs4 import BeautifulSoup
import re
import random

print("="*100)
print(" "*10 + "COMPLETE REWRITE - Proper extraction order: Sal â†’ Body â†’ Sig â†’ PS")
print("="*100)

def extract_letter_components_complete(html_content, language):
    """
    Complete rewrite with correct logic:
    1. Skip right-aligned at START (date/location)
    2. Extract salutation (first non-right paragraph)
    3. Find ALL right-aligned paragraphs at the END (working backwards, stopping at first left-aligned)
    4. Everything AFTER those right-aligned paragraphs = PS
    5. Everything BETWEEN salutation and signature = Body
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all paragraphs
    all_paragraphs = soup.find_all('p')

    # Filter out navigation and chronology
    content_paragraphs = []

    for p in all_paragraphs:
        if p.get('class') and 'nav' in p.get('class'):
            continue
        if p.get('class') and 'center' in p.get('class'):
            text = p.get_text(strip=True)
            if 'Chronology' in text or 'Addressee' in text:
                continue
        content_paragraphs.append(p)

    # Get text AND class info from paragraphs
    paragraph_data = []
    for p in content_paragraphs:
        text = p.get_text(strip=True)
        if text:
            has_right_class = p.get('class') and 'right' in p.get('class')
            paragraph_data.append({
                'text': text,
                'is_right_aligned': has_right_class
            })

    # Language-specific extraction (remove translation markers)
    if language == 'Sanskrit':
        translation_started = False
        filtered_data = []
        for item in paragraph_data:
            text = item['text']
            if 'TRANSLATION' in text or 'Translated from Sanskrit' in text:
                translation_started = True
                if text == 'TRANSLATION' or 'Translated from Sanskrit' in text:
                    continue
            if translation_started:
                filtered_data.append(item)
        paragraph_data = filtered_data

    elif language == 'Bengali':
        paragraph_data = [item for item in paragraph_data
                          if not ('Translated from' in item['text'] and 'Bengali' in item['text'])]

    elif language == 'French':
        filtered = []
        for item in paragraph_data:
            text = item['text']
            if 'Translated from' in text and 'French' in text:
                continue
            if text.startswith('To ') and len(text) < 50:
                continue
            filtered.append(item)
        paragraph_data = filtered

    # Clean up whitespace
    for item in paragraph_data:
        item['text'] = re.sub(r'\s+', ' ', item['text']).strip()

    if len(paragraph_data) == 0:
        return None, None, None, None

    # STEP 1: Skip right-aligned at BEGINNING (date/location metadata)
    start_idx = 0
    while start_idx < len(paragraph_data) and paragraph_data[start_idx]['is_right_aligned']:
        start_idx += 1

    if start_idx >= len(paragraph_data):
        return None, None, None, None

    # STEP 2: Extract salutation (first non-right paragraph if <= 50 words)
    salutation = None
    body_start_idx = start_idx

    first_para = paragraph_data[start_idx]
    if len(first_para['text'].split()) <= 50:
        salutation = first_para['text']
        body_start_idx = start_idx + 1

    if body_start_idx >= len(paragraph_data):
        return salutation, None, None, None

    # STEP 3: Find signature by working BACKWARDS from end
    # Collect ALL consecutive right-aligned paragraphs from the end
    signature_paragraphs = []
    signature_start_idx = None
    signature_end_idx = None

    # Work backwards from the end
    i = len(paragraph_data) - 1

    # First, skip any trailing left-aligned paragraphs (these might be PS)
    while i >= body_start_idx and not paragraph_data[i]['is_right_aligned']:
        i -= 1

    # Now collect consecutive right-aligned paragraphs
    while i >= body_start_idx and paragraph_data[i]['is_right_aligned']:
        signature_paragraphs.insert(0, paragraph_data[i]['text'])
        signature_end_idx = i
        if signature_start_idx is None:
            signature_start_idx = i
        else:
            signature_start_idx = i
        i -= 1

    # STEP 4: Extract PS - anything AFTER signature_end_idx that starts with PS/P.S.
    ps_paragraphs = []
    if signature_end_idx is not None:
        for j in range(signature_end_idx + 1, len(paragraph_data)):
            text = paragraph_data[j]['text']
            if text.upper().startswith('PS') or text.upper().startswith('P.S'):
                ps_paragraphs.append(text)

    # STEP 5: Body is everything between salutation and signature
    body_end_idx = signature_start_idx if signature_start_idx is not None else len(paragraph_data)
    body_paragraphs = [item['text'] for item in paragraph_data[body_start_idx:body_end_idx]]

    # Join components
    salutation_text = salutation if salutation else ""
    body_text = '\n\n'.join(body_paragraphs) if body_paragraphs else ""
    signature_text = '\n\n'.join(signature_paragraphs) if signature_paragraphs else ""
    ps_text = '\n\n'.join(ps_paragraphs) if ps_paragraphs else ""

    return salutation_text, body_text, signature_text, ps_text


# Test on same 300 letters
random.seed(42)
sample_indices = random.sample(range(len(letters_final)), min(300, len(letters_final)))
sample_letters = letters_final.iloc[sample_indices]

print(f"\nExtracting components from {len(sample_letters)} random letters...\n")

results = []

for idx, row in sample_letters.iterrows():
    salutation, body, signature, ps = extract_letter_components_complete(row['html_content'], row['Language'])

    sal_words = len(salutation.split()) if salutation else 0
    body_words = len(body.split()) if body else 0
    sig_words = len(signature.split()) if signature else 0
    ps_words = len(ps.split()) if ps else 0

    results.append({
        'index': idx,
        'name': row['Name'],
        'audience': row['Audience'],
        'language': row['Language'],
        'salutation': salutation,
        'body': body,
        'signature': signature,
        'ps': ps,
        'salutation_words': sal_words,
        'body_words': body_words,
        'signature_words': sig_words,
        'ps_words': ps_words,
        'total_words': sal_words + body_words + sig_words + ps_words,
        'full_url': row['full_url']
    })

results_df = pd.DataFrame(results)

print("="*100)
print("STATISTICS - COMPLETE REWRITE")
print("="*100)

print("\n--- SALUTATION ---")
print(results_df['salutation_words'].describe())
print(f"Missing: {(results_df['salutation_words'] == 0).sum()}")

print("\n--- BODY ---")
print(results_df['body_words'].describe())
print(f"Missing: {(results_df['body_words'] == 0).sum()}")

print("\n--- SIGNATURE ---")
print(results_df['signature_words'].describe())
print(f"Missing: {(results_df['signature_words'] == 0).sum()}")

print("\n--- PS ---")
print(results_df['ps_words'].describe())
print(f"Missing: {(results_df['ps_words'] == 0).sum()}")

print("\n" + "="*100)
print("CRITICAL TEST: MRS. HANSBROUGH LETTER")
print("="*100)

hansbrough = results_df[results_df['name'] == 'CCXVII Mrs. Hansbrough']
if len(hansbrough) > 0:
    row = hansbrough.iloc[0]
    print(f"\n{row['name']} â†’ {row['audience']}")
    print(f"URL: {row['full_url']}")
    print(f"\n[SALUTATION ({row['salutation_words']}w)] {row['salutation']}")
    print(f"\n[BODY ({row['body_words']}w) - first 200 chars] {row['body'][:200]}...")
    print(f"\n[SIGNATURE ({row['signature_words']}w)] {row['signature']}")
    print(f"\n[PS ({row['ps_words']}w)] {row['ps']}")

print("\n" + "="*100)
print("LETTERS WITH PS (showing all)")
print("="*100)

with_ps = results_df[results_df['ps_words'] > 0]
print(f"Total letters with PS: {len(with_ps)}")
for _, row in with_ps.head(20).iterrows():
    print(f"\n{row['name']} â†’ {row['audience']}")
    print(f"URL: {row['full_url']}")
    print(f"PS: {row['ps'][:150]}")

print("\n" + "="*100)
print("SAMPLE EXTRACTIONS FOR MANUAL VERIFICATION")
print("="*100)

# Sample different cases
print("\n--- LETTERS WITH ALL COMPONENTS (Sal + Body + Sig + PS) ---")
complete = results_df[(results_df['salutation_words'] > 0) &
                      (results_df['body_words'] > 0) &
                      (results_df['signature_words'] > 0) &
                      (results_df['ps_words'] > 0)]
print(f"Found {len(complete)} complete letters")
for _, row in complete.head(5).iterrows():
    print(f"\n{row['name']} â†’ {row['audience']}")
    print(f"URL: {row['full_url']}")
    print(f"Sal({row['salutation_words']}w) Body({row['body_words']}w) Sig({row['signature_words']}w) PS({row['ps_words']}w)")

print("\n--- LETTERS WITH NO SIGNATURE ---")
no_sig = results_df[results_df['signature_words'] == 0]
print(f"Found {len(no_sig)} letters without signature")
for _, row in no_sig.head(5).iterrows():
    print(f"\n{row['name']} â†’ {row['audience']}")
    print(f"URL: {row['full_url']}")
    print(f"Body ends: ...{row['body'][-100:] if row['body'] else 'None'}")

print("\n" + "="*100)
print("SIGNATURE SAMPLES (First 30)")
print("="*100)

sigs = results_df[results_df['signature_words'] > 0].head(30)
for _, row in sigs.iterrows():
    sig_display = row['signature'].replace('\n\n', ' | ')
    print(f"{row['name']} ({row['signature_words']}w): {sig_display}")

print("\n" + "="*100)

          COMPLETE REWRITE - Proper extraction order: Sal â†’ Body â†’ Sig â†’ PS

Extracting components from 300 random letters...

STATISTICS - COMPLETE REWRITE

--- SALUTATION ---
count    300.000000
mean       4.730000
std        4.734111
min        0.000000
25%        3.000000
50%        4.000000
75%        5.000000
max       48.000000
Name: salutation_words, dtype: float64
Missing: 1

--- BODY ---
count     300.000000
mean      333.756667
std       362.637686
min         0.000000
25%       118.750000
50%       220.000000
75%       402.250000
max      2704.000000
Name: body_words, dtype: float64
Missing: 5

--- SIGNATURE ---
count    300.000000
mean       3.486667
std        2.120884
min        0.000000
25%        2.000000
50%        3.000000
75%        5.000000
max       14.000000
Name: signature_words, dtype: float64
Missing: 1

--- PS ---
count    300.000000
mean       2.050000
std       11.250307
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
m

In [23]:
from bs4 import BeautifulSoup
import re
import random

print("="*100)
print(" "*20 + "SIMPLE CORRECT LOGIC - Following actual letter structure")
print("="*100)

def extract_letter_simple(html_content, language):
    """
    SIMPLE extraction following the actual structure:
    1. Skip RIGHT paragraphs at start (date/location)
    2. First LEFT = salutation (if short)
    3. All LEFT paragraphs = body
    4. RIGHT paragraphs at END = signature (if any exist at end)
    5. LEFT after signature starting with PS = PS
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all paragraphs
    all_paragraphs = soup.find_all('p')

    # Filter out navigation and chronology
    content_paragraphs = []
    for p in all_paragraphs:
        if p.get('class') and 'nav' in p.get('class'):
            continue
        if p.get('class') and 'center' in p.get('class'):
            text = p.get_text(strip=True)
            if 'Chronology' in text or 'Addressee' in text:
                continue
        content_paragraphs.append(p)

    # Build list with text and alignment
    paragraphs = []
    for p in content_paragraphs:
        text = p.get_text(strip=True)
        if text:
            is_right = p.get('class') and 'right' in p.get('class')
            paragraphs.append({'text': text, 'right': is_right})

    # Language-specific filtering
    if language == 'Sanskrit':
        translation_started = False
        filtered = []
        for item in paragraphs:
            if 'TRANSLATION' in item['text'] or 'Translated from Sanskrit' in item['text']:
                translation_started = True
                if item['text'] in ['TRANSLATION', 'Translated from Sanskrit']:
                    continue
            if translation_started:
                filtered.append(item)
        paragraphs = filtered
    elif language == 'Bengali':
        paragraphs = [p for p in paragraphs if not ('Translated from' in p['text'] and 'Bengali' in p['text'])]
    elif language == 'French':
        filtered = []
        for p in paragraphs:
            if 'Translated from' in p['text'] and 'French' in p['text']:
                continue
            if p['text'].startswith('To ') and len(p['text']) < 50:
                continue
            filtered.append(p)
        paragraphs = filtered

    # Clean whitespace
    for p in paragraphs:
        p['text'] = re.sub(r'\s+', ' ', p['text']).strip()

    if not paragraphs:
        return "", "", "", ""

    # STEP 1: Skip right-aligned at START (date/location metadata)
    idx = 0
    while idx < len(paragraphs) and paragraphs[idx]['right']:
        idx += 1

    if idx >= len(paragraphs):
        return "", "", "", ""

    # STEP 2: Salutation = first LEFT paragraph (if <= 50 words)
    salutation = ""
    if not paragraphs[idx]['right'] and len(paragraphs[idx]['text'].split()) <= 50:
        salutation = paragraphs[idx]['text']
        idx += 1

    # STEP 3: Find where signature starts (first RIGHT paragraph from the END, working backwards)
    sig_start = None
    for i in range(len(paragraphs) - 1, idx - 1, -1):
        if paragraphs[i]['right']:
            # Found a right-aligned paragraph
            # Check if there are consecutive right-aligned paragraphs before it
            sig_start = i
            while sig_start > idx and paragraphs[sig_start - 1]['right']:
                sig_start -= 1
            break

    # STEP 4: Extract components based on signature location
    body_paras = []
    sig_paras = []
    ps_paras = []

    if sig_start is not None:
        # Have signature
        # Body: LEFT paragraphs from idx to sig_start
        for i in range(idx, sig_start):
            if not paragraphs[i]['right']:
                body_paras.append(paragraphs[i]['text'])

        # Signature: RIGHT paragraphs starting at sig_start
        i = sig_start
        while i < len(paragraphs) and paragraphs[i]['right']:
            sig_paras.append(paragraphs[i]['text'])
            i += 1

        # PS: LEFT paragraphs after signature that start with PS
        while i < len(paragraphs):
            if not paragraphs[i]['right']:
                text = paragraphs[i]['text']
                if text.upper().startswith('PS') or text.upper().startswith('P.S'):
                    ps_paras.append(text)
            i += 1
    else:
        # No signature
        # Body: all remaining LEFT paragraphs (check for PS)
        for i in range(idx, len(paragraphs)):
            if not paragraphs[i]['right']:
                text = paragraphs[i]['text']
                if text.upper().startswith('PS') or text.upper().startswith('P.S'):
                    ps_paras.append(text)
                else:
                    body_paras.append(text)

    # Join components
    body = '\n\n'.join(body_paras)
    signature = '\n\n'.join(sig_paras)
    ps = '\n\n'.join(ps_paras)

    return salutation, body, signature, ps


# Test on same 300
random.seed(42)
sample_indices = random.sample(range(len(letters_final)), min(300, len(letters_final)))
sample_letters = letters_final.iloc[sample_indices]

print(f"\nTesting on {len(sample_letters)} letters...\n")

results = []
for idx, row in sample_letters.iterrows():
    sal, body, sig, ps = extract_letter_simple(row['html_content'], row['Language'])

    results.append({
        'index': idx,
        'name': row['Name'],
        'audience': row['Audience'],
        'language': row['Language'],
        'salutation': sal,
        'body': body,
        'signature': sig,
        'ps': ps,
        'salutation_words': len(sal.split()) if sal else 0,
        'body_words': len(body.split()) if body else 0,
        'signature_words': len(sig.split()) if sig else 0,
        'ps_words': len(ps.split()) if ps else 0,
        'full_url': row['full_url']
    })

results_df = pd.DataFrame(results)

print("="*100)
print("STATISTICS")
print("="*100)
for col in ['salutation_words', 'body_words', 'signature_words', 'ps_words']:
    print(f"\n{col}: mean={results_df[col].mean():.1f}, missing={len(results_df[results_df[col]==0])}")

print("\n" + "="*100)
print("MRS. HANSBROUGH")
print("="*100)
h = results_df[results_df['name'] == 'CCXVII Mrs. Hansbrough'].iloc[0]
print(f"Sal({h['salutation_words']}w): {h['salutation']}")
print(f"Body({h['body_words']}w): {h['body'][:150]}...")
print(f"Sig({h['signature_words']}w): {h['signature']}")
print(f"PS({h['ps_words']}w): {h['ps']}")

print("\n" + "="*100)
print("PROBLEM LETTERS")
print("="*100)
for name in ['CLXXII American Friend', 'LXXXIV Friend', 'CXLII Sister Christine']:
    matches = results_df[results_df['name'] == name]
    if len(matches) > 0:
        r = matches.iloc[0]
        print(f"\n{r['name']}: Sal({r['salutation_words']}w) Body({r['body_words']}w) Sig({r['signature_words']}w)")
        print(f"URL: {r['full_url']}")
        print(f"Signature: {r['signature'] if r['signature'] else '[NONE]'}")

print("\n" + "="*100)

                    SIMPLE CORRECT LOGIC - Following actual letter structure

Testing on 300 letters...

STATISTICS

salutation_words: mean=4.7, missing=1

body_words: mean=329.9, missing=5

signature_words: mean=3.5, missing=1

ps_words: mean=2.0, missing=280

MRS. HANSBROUGH
Sal(5w): To Mrs. Alice (Shanti) Hansbrough
Body(161w): MY DEAR MRS. HANSBROUGH,

I am eternally indebted to you for what you did for me in the past, and infinitely more now for what you are doing for Turiy...
Sig(6w): Ever yours in the Lord,

VIVEKANANDA
PS(21w): PS. Let Turiyananda take rest all the time now. He must not work at all till I reach Japan or America.

PROBLEM LETTERS

CLXXII American Friend: Sal(4w) Body(0w) Sig(4w)
URL: https://www.ramakrishnavivekananda.info/vivekananda/volume_8/epistles_fourth_series/172_american_friend.htm
Signature: SAN FRANCISCO,7th April, 1900.

LXXXIV Friend: Sal(5w) Body(0w) Sig(3w)
URL: https://www.ramakrishnavivekananda.info/vivekananda/volume_8/epistles_fourth_series/084

In [26]:
from bs4 import BeautifulSoup
import re
import random

print("="*100)
print(" "*20 + "FINAL EXTRACTION - Body includes salutation")
print("="*100)

def extract_letter_final(html_content, language):
    """
    Final extraction logic:
    - Body = ALL left-aligned text (includes salutation)
    - Signature = Consecutive RIGHT-aligned at the VERY END
    - PS = LEFT-aligned after signature starting with PS/P.S.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    all_paragraphs = soup.find_all('p')
    content_paragraphs = []

    for p in all_paragraphs:
        if p.get('class') and 'nav' in p.get('class'):
            continue
        if p.get('class') and 'center' in p.get('class'):
            text = p.get_text(strip=True)
            if 'Chronology' in text or 'Addressee' in text:
                continue
        content_paragraphs.append(p)

    paragraphs = []
    for p in content_paragraphs:
        text = p.get_text(strip=True)
        if text:
            is_right = p.get('class') and 'right' in p.get('class')
            paragraphs.append({'text': text, 'right': is_right})

    # Language-specific filtering
    if language == 'Sanskrit':
        translation_started = False
        filtered = []
        for item in paragraphs:
            if 'TRANSLATION' in item['text'] or 'Translated from Sanskrit' in item['text']:
                translation_started = True
                if item['text'] in ['TRANSLATION', 'Translated from Sanskrit']:
                    continue
            if translation_started:
                filtered.append(item)
        paragraphs = filtered
    elif language == 'Bengali':
        paragraphs = [p for p in paragraphs if not ('Translated from' in p['text'] and 'Bengali' in p['text'])]
    elif language == 'French':
        filtered = []
        for p in paragraphs:
            if 'Translated from' in p['text'] and 'French' in p['text']:
                continue
            if p['text'].startswith('To ') and len(p['text']) < 50:
                continue
            filtered.append(p)
        paragraphs = filtered

    # Clean whitespace
    for p in paragraphs:
        p['text'] = re.sub(r'\s+', ' ', p['text']).strip()

    if not paragraphs:
        return "", "", ""

    # STEP 1: Skip RIGHT-aligned at the VERY START (date/location before content)
    start_idx = 0
    while start_idx < len(paragraphs) and paragraphs[start_idx]['right']:
        start_idx += 1

    if start_idx >= len(paragraphs):
        return "", "", ""

    # STEP 2: Find signature - consecutive RIGHT paragraphs at the VERY END
    # Work backwards from end
    sig_start_idx = None
    i = len(paragraphs) - 1

    # Skip trailing LEFT paragraphs (could be PS or body)
    while i >= start_idx and not paragraphs[i]['right']:
        i -= 1

    # If we found RIGHT at/near the end, check if it's truly at the end
    if i >= start_idx and paragraphs[i]['right']:
        # Found RIGHT paragraph(s) near end
        # Check: are there ONLY PS paragraphs after this RIGHT section? (or nothing)
        end_of_right = i

        # Collect consecutive RIGHT paragraphs working backwards
        while i >= start_idx and paragraphs[i]['right']:
            i -= 1

        potential_sig_start = i + 1

        # Check what comes after this RIGHT section
        has_non_ps_left = False
        for j in range(end_of_right + 1, len(paragraphs)):
            if not paragraphs[j]['right']:
                text = paragraphs[j]['text']
                if not (text.upper().startswith('PS') or text.upper().startswith('P.S')):
                    has_non_ps_left = True
                    break

        # Only treat as signature if there's no non-PS left-aligned text after it
        if not has_non_ps_left:
            sig_start_idx = potential_sig_start

    # STEP 3: Extract body, signature, PS
    body_paras = []
    sig_paras = []
    ps_paras = []

    if sig_start_idx is not None:
        # Have signature
        # Body = all LEFT from start_idx to sig_start_idx
        for i in range(start_idx, sig_start_idx):
            if not paragraphs[i]['right']:
                body_paras.append(paragraphs[i]['text'])

        # Signature = all RIGHT from sig_start_idx
        i = sig_start_idx
        while i < len(paragraphs) and paragraphs[i]['right']:
            sig_paras.append(paragraphs[i]['text'])
            i += 1

        # PS = LEFT after signature starting with PS
        while i < len(paragraphs):
            if not paragraphs[i]['right']:
                text = paragraphs[i]['text']
                if text.upper().startswith('PS') or text.upper().startswith('P.S'):
                    ps_paras.append(text)
            i += 1
    else:
        # No signature
        # All LEFT = body or PS
        for i in range(start_idx, len(paragraphs)):
            if not paragraphs[i]['right']:
                text = paragraphs[i]['text']
                if text.upper().startswith('PS') or text.upper().startswith('P.S'):
                    ps_paras.append(text)
                else:
                    body_paras.append(text)

    body = '\n\n'.join(body_paras)
    signature = '\n\n'.join(sig_paras)
    ps = '\n\n'.join(ps_paras)

    return body, signature, ps


# Test on 300 sample
random.seed(42)
sample_indices = random.sample(range(len(letters_final)), min(300, len(letters_final)))
sample_letters = letters_final.iloc[sample_indices]

print(f"Testing on {len(sample_letters)} letters...\n")

results = []
for idx, row in sample_letters.iterrows():
    body, sig, ps = extract_letter_final(row['html_content'], row['Language'])

    results.append({
        'name': row['Name'],
        'audience': row['Audience'],
        'language': row['Language'],
        'body': body,
        'signature': sig,
        'ps': ps,
        'body_words': len(body.split()) if body else 0,
        'signature_words': len(sig.split()) if sig else 0,
        'ps_words': len(ps.split()) if ps else 0,
        'full_url': row['full_url']
    })

results_df = pd.DataFrame(results)

print("="*100)
print("STATISTICS")
print("="*100)
print(f"Body missing: {len(results_df[results_df['body_words']==0])}")
print(f"Signature missing: {len(results_df[results_df['signature_words']==0])}")
print(f"PS missing: {len(results_df[results_df['ps_words']==0])}")

print(f"\nBody words: mean={results_df['body_words'].mean():.1f}, median={results_df['body_words'].median():.1f}")
print(f"Signature words: mean={results_df['signature_words'].mean():.1f}, median={results_df['signature_words'].median():.1f}")
print(f"PS words: mean={results_df['ps_words'].mean():.1f}")

print("\n" + "="*100)
print("TEST CASES")
print("="*100)

# Mrs. Hansbrough
h = results_df[results_df['name'] == 'CCXVII Mrs. Hansbrough'].iloc[0]
print(f"\nMrs. Hansbrough:")
print(f"Body({h['body_words']}w): {h['body'][:150]}...")
print(f"Sig({h['signature_words']}w): {h['signature']}")
print(f"PS({h['ps_words']}w): {h['ps']}")

# Problem letters
print("\n" + "-"*100)
for name in ['CLXXII American Friend', 'LXXXIV Friend', 'CXLII Sister Christine']:
    matches = results_df[results_df['name'] == name]
    if len(matches) > 0:
        r = matches.iloc[0]
        print(f"\n{r['name']}:")
        print(f"Body({r['body_words']}w): {r['body'][:100] if r['body'] else '[NONE]'}...")
        print(f"Sig({r['signature_words']}w): {r['signature'] if r['signature'] else '[NONE]'}")

print("\n" + "="*100)
print("SIGNATURE SAMPLES (first 20)")
print("="*100)
sigs = results_df[results_df['signature_words'] > 0].head(20)
for _, r in sigs.iterrows():
    print(f"{r['name']} ({r['signature_words']}w): {r['signature'].replace(chr(10)+chr(10), ' | ')}")

print("\n" + "="*100)

                    FINAL EXTRACTION - Body includes salutation
Testing on 300 letters...

STATISTICS
Body missing: 0
Signature missing: 10
PS missing: 280

Body words: mean=338.6, median=223.0
Signature words: mean=3.3, median=3.0
PS words: mean=2.0

TEST CASES

Mrs. Hansbrough:
Body(166w): To Mrs. Alice (Shanti) Hansbrough

MY DEAR MRS. HANSBROUGH,

I am eternally indebted to you for what you did for me in the past, and infinitely more n...
Sig(6w): Ever yours in the Lord,

VIVEKANANDA
PS(21w): PS. Let Turiyananda take rest all the time now. He must not work at all till I reach Japan or America.

----------------------------------------------------------------------------------------------------

CLXXII American Friend:
Body(164w): To an American friend

. . . I am more calm and quiet now than I ever was. I am on my own feet, work...
Sig(0w): [NONE]

LXXXIV Friend:
Body(91w): To Mr. E. T. Sturdy

DEAR FRIEND,

I have at last seen Prof. Deussen. . . . The whole of yesterday w...
Sig(0

In [27]:
print("="*100)
print(" "*20 + "APPLYING EXTRACTION TO ALL 762 LETTERS")
print("="*100)

# Apply to ALL letters
print(f"\nProcessing {len(letters_final)} letters...\n")

all_results = []

for idx, row in letters_final.iterrows():
    body, sig, ps = extract_letter_final(row['html_content'], row['Language'])

    all_results.append({
        'index': idx,
        'name': row['Name'],
        'audience': row['Audience'],
        'language': row['Language'],
        'date': row['Date'],
        'body': body,
        'signature': sig,
        'ps': ps,
        'body_words': len(body.split()) if body else 0,
        'signature_words': len(sig.split()) if sig else 0,
        'ps_words': len(ps.split()) if ps else 0,
        'full_url': row['full_url']
    })

all_results_df = pd.DataFrame(all_results)

print("="*100)
print("FINAL STATISTICS - ALL 762 LETTERS")
print("="*100)

print(f"\nTotal letters processed: {len(all_results_df)}")

print("\n--- BODY ---")
print(all_results_df['body_words'].describe())
print(f"Missing: {len(all_results_df[all_results_df['body_words']==0])} ({len(all_results_df[all_results_df['body_words']==0])/len(all_results_df)*100:.1f}%)")

print("\n--- SIGNATURE ---")
print(all_results_df['signature_words'].describe())
print(f"Missing: {len(all_results_df[all_results_df['signature_words']==0])} ({len(all_results_df[all_results_df['signature_words']==0])/len(all_results_df)*100:.1f}%)")

print("\n--- PS ---")
print(all_results_df['ps_words'].describe())
print(f"Missing: {len(all_results_df[all_results_df['ps_words']==0])} ({len(all_results_df[all_results_df['ps_words']==0])/len(all_results_df)*100:.1f}%)")

print("\n" + "="*100)
print("LANGUAGE BREAKDOWN")
print("="*100)
print(all_results_df['language'].value_counts())

print("\n" + "="*100)
print("POTENTIAL ISSUES TO CHECK")
print("="*100)

# Very short bodies (might be extraction errors)
print("\n1. VERY SHORT BODIES (<30 words):")
short_bodies = all_results_df[all_results_df['body_words'] < 30].sort_values('body_words')
print(f"Found {len(short_bodies)} letters")
for _, r in short_bodies.head(10).iterrows():
    print(f"\n{r['name']} ({r['body_words']}w)")
    print(f"URL: {r['full_url']}")
    print(f"Body: {r['body'][:150]}")

# Very long signatures (might be wrong)
print("\n\n2. VERY LONG SIGNATURES (>15 words):")
long_sigs = all_results_df[all_results_df['signature_words'] > 15].sort_values('signature_words', ascending=False)
print(f"Found {len(long_sigs)} letters")
for _, r in long_sigs.head(10).iterrows():
    print(f"\n{r['name']} ({r['signature_words']}w)")
    print(f"URL: {r['full_url']}")
    print(f"Signature: {r['signature'][:200]}")

# Very long PS (might be wrong)
print("\n\n3. VERY LONG PS (>100 words):")
long_ps = all_results_df[all_results_df['ps_words'] > 100].sort_values('ps_words', ascending=False)
print(f"Found {len(long_ps)} letters")
for _, r in long_ps.head(5).iterrows():
    print(f"\n{r['name']} ({r['ps_words']}w)")
    print(f"URL: {r['full_url']}")
    print(f"PS (first 200 chars): {r['ps'][:200]}")

# Missing body (critical error)
print("\n\n4. MISSING BODY (CRITICAL ERROR):")
no_body = all_results_df[all_results_df['body_words'] == 0]
print(f"Found {len(no_body)} letters")
for _, r in no_body.iterrows():
    print(f"\n{r['name']}")
    print(f"URL: {r['full_url']}")
    print(f"Sig: {r['signature'][:100] if r['signature'] else '[NONE]'}")
    print(f"PS: {r['ps'][:100] if r['ps'] else '[NONE]'}")

print("\n" + "="*100)
print("UNIQUE SIGNATURES (showing 50 random ones)")
print("="*100)
unique_sigs = all_results_df[all_results_df['signature_words'] > 0]['signature'].unique()
print(f"Total unique signatures: {len(unique_sigs)}")
print("\nRandom sample of 50:")
import random
random.seed(42)
sample_sigs = random.sample(list(unique_sigs), min(50, len(unique_sigs)))
for i, sig in enumerate(sample_sigs, 1):
    display_sig = sig.replace('\n\n', ' | ')[:150]
    print(f"{i}. {display_sig}")

print("\n" + "="*100)
print("SUMMARY")
print("="*100)
print(f"""
âœ“ Total letters: {len(all_results_df)}
âœ“ Letters with body: {len(all_results_df[all_results_df['body_words'] > 0])} ({len(all_results_df[all_results_df['body_words'] > 0])/len(all_results_df)*100:.1f}%)
âœ“ Letters with signature: {len(all_results_df[all_results_df['signature_words'] > 0])} ({len(all_results_df[all_results_df['signature_words'] > 0])/len(all_results_df)*100:.1f}%)
âœ“ Letters with PS: {len(all_results_df[all_results_df['ps_words'] > 0])} ({len(all_results_df[all_results_df['ps_words'] > 0])/len(all_results_df)*100:.1f}%)

Average lengths:
- Body: {all_results_df['body_words'].mean():.1f} words (median: {all_results_df['body_words'].median():.1f})
- Signature: {all_results_df['signature_words'].mean():.1f} words (median: {all_results_df['signature_words'].median():.1f})
- PS: {all_results_df['ps_words'].mean():.1f} words
""")

print("="*100)

                    APPLYING EXTRACTION TO ALL 762 LETTERS

Processing 762 letters...

FINAL STATISTICS - ALL 762 LETTERS

Total letters processed: 762

--- BODY ---
count     762.000000
mean      332.551181
std       363.602143
min        16.000000
25%       115.000000
50%       222.000000
75%       410.750000
max      3147.000000
Name: body_words, dtype: float64
Missing: 0 (0.0%)

--- SIGNATURE ---
count    762.000000
mean       3.274278
std        1.973436
min        0.000000
25%        2.000000
50%        3.000000
75%        5.000000
max       14.000000
Name: signature_words, dtype: float64
Missing: 29 (3.8%)

--- PS ---
count    762.000000
mean       2.585302
std       14.851298
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max      263.000000
Name: ps_words, dtype: float64
Missing: 706 (92.7%)

LANGUAGE BREAKDOWN
language
English     604
Bengali     148
Sanskrit      3
French        3
 English      1
Name: count, dtype: int64

POTENTIAL ISSUES TO

In [28]:
print("="*100)
print(" "*25 + "CREATING FINAL DATASET WITH EXTRACTED COMPONENTS")
print("="*100)

# Merge the extracted components back into the original dataset
letters_final_with_extraction = letters_final.copy()

# Add the three new columns
letters_final_with_extraction['body_text'] = all_results_df['body'].values
letters_final_with_extraction['signature_text'] = all_results_df['signature'].values
letters_final_with_extraction['ps_text'] = all_results_df['ps'].values

# Also add word counts for convenience
letters_final_with_extraction['body_word_count'] = all_results_df['body_words'].values
letters_final_with_extraction['signature_word_count'] = all_results_df['signature_words'].values
letters_final_with_extraction['ps_word_count'] = all_results_df['ps_words'].values

print(f"\nOriginal columns: {len(letters_final.columns)}")
print(f"New columns added: 6 (body_text, signature_text, ps_text, body_word_count, signature_word_count, ps_word_count)")
print(f"Total columns now: {len(letters_final_with_extraction.columns)}")

print("\n" + "="*100)
print("COLUMN LIST")
print("="*100)
print(list(letters_final_with_extraction.columns))

print("\n" + "="*100)
print("SAMPLE RECORD")
print("="*100)
sample = letters_final_with_extraction[letters_final_with_extraction['Name'] == 'CCXVII Mrs. Hansbrough'].iloc[0]
print(f"\nName: {sample['Name']}")
print(f"Audience: {sample['Audience']}")
print(f"Date: {sample['Date']}")
print(f"Language: {sample['Language']}")
print(f"\nBody (first 200 chars): {sample['body_text'][:200]}...")
print(f"Body word count: {sample['body_word_count']}")
print(f"\nSignature: {sample['signature_text']}")
print(f"Signature word count: {sample['signature_word_count']}")
print(f"\nPS: {sample['ps_text']}")
print(f"PS word count: {sample['ps_word_count']}")

print("\n" + "="*100)
print("EXPORTING TO CSV")
print("="*100)

# Export to CSV
output_filename = 'swami_vivekananda_letters_complete.csv'
letters_final_with_extraction.to_csv(output_filename, index=False)

print(f"âœ“ Successfully exported to: {output_filename}")
print(f"âœ“ Total records: {len(letters_final_with_extraction)}")
print(f"âœ“ Total columns: {len(letters_final_with_extraction.columns)}")

# Show file size
import os
file_size = os.path.getsize(output_filename)
print(f"âœ“ File size: {file_size / (1024*1024):.2f} MB")

print("\n" + "="*100)
print("DATASET READY FOR ANALYSIS!")
print("="*100)
print(f"""
You can now analyze:
1. Signature patterns (247 unique signatures!)
2. How signatures vary by recipient
3. How signatures change over time (1889-1902)
4. Body content analysis
5. PS usage patterns
6. Language differences (English vs Bengali)

The dataset includes:
- All original metadata (Name, Date, Place, Audience, etc.)
- HTML content (for reference)
- Extracted body_text (includes salutation)
- Extracted signature_text
- Extracted ps_text
- Word counts for each component
""")

print("="*100)

                         CREATING FINAL DATASET WITH EXTRACTED COMPONENTS

Original columns: 21
New columns added: 6 (body_text, signature_text, ps_text, body_word_count, signature_word_count, ps_word_count)
Total columns now: 27

COLUMN LIST
['Index', 'volume', 'Name', 'Category', 'Type', 'Date', 'Place', 'Source', 'Audience', 'Language', 'section', 'genre', 'href', 'full_url', 'filename', 'has_html', 'has_date', 'has_place', 'has_audience', 'html_content', 'html_length', 'body_text', 'signature_text', 'ps_text', 'body_word_count', 'signature_word_count', 'ps_word_count']

SAMPLE RECORD

Name: CCXVII Mrs. Hansbrough
Audience: Hansbrough, Alice (Shanti)
Date: 1902/02/14
Language: English

Body (first 200 chars): To Mrs. Alice (Shanti) Hansbrough

MY DEAR MRS. HANSBROUGH,

I am eternally indebted to you for what you did for me in the past, and infinitely more now for what you are doing for Turiyananda.

A gloo...
Body word count: 166

Signature: Ever yours in the Lord,

VIVEKANANDA
Sign