# PDF/TXT_to_Sentence_Level_CSV

## Marker AI PDF to Markdown File

In [None]:
import subprocess

# Run the marker_single command-line tool on the PDF.
# check=True raises CalledProcessError when the tool exits with a non-zero
# status, so a failed conversion is not silently ignored; timeout=300 raises
# TimeoutExpired if the tool runs longer than 300 seconds.
subprocess.run(["marker_single", "netherland2011.pdf"], check=True, timeout=300)

## BeautifulSoup Markdown (HTML) to TXT

In [2]:
import markdown
from bs4 import BeautifulSoup

def markdown_to_text(md_path, txt_path):
    """Render a Markdown file to HTML and save the tag-free text.

    Args:
        md_path: Path of the UTF-8 Markdown file to read.
        txt_path: Path of the UTF-8 text file to write (opened in 'w' mode,
            so an existing file is overwritten).
    """
    with open(md_path, 'r', encoding='utf-8') as source:
        rendered_html = markdown.markdown(source.read())

    # Parsing the rendered HTML and asking for its text drops all tags.
    plain_text = BeautifulSoup(rendered_html, features="html.parser").get_text()

    with open(txt_path, 'w', encoding='utf-8') as target:
        target.write(plain_text)

# Batch conversion of every corpus (currently disabled):
# files = ['southafrica2015', 'china2016', 'india2013', 'netherland2011']
# for filename in files:
#     markdown_to_text(f"{filename}/{filename}.md", filename + ".txt")

# Convert only the "_clean" Markdown file of a single corpus.
filename = 'china2016'
markdown_to_text(f"{filename}/{filename}_clean.md", f"{filename}_cleaned.txt")

In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.2/509.2 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m347.8/347.8 kB[0m [31m8.5 MB/s[0m eta 

## Text Cleaning Using Regex (symbols, numbers) + TXT to CSV

In [4]:
from pathlib import Path
import re
import pandas as pd

# Base names (without ".txt") of the text files to process. Other corpora
# used in this notebook: 'india2013', 'china2016', 'southafrica2015',
# 'netherland2011'.
files = ['china2016_cleaned']

# Compile patterns once; they do not depend on the file being processed.
allowed_pattern = re.compile(r"[^A-Za-z\s.,]")  # matches everything except letters, whitespace, comma, and period
repeated_punct_pattern = re.compile(r"[.,]{2,}")  # runs of two or more periods/commas
hyphen_linebreak_pattern = re.compile(r"(\w)-\s+(\w)")  # hyphenated line breaks (substitution is disabled below)
link_pattern = re.compile(r'https?://\S+|www\.\S+')  # http(s):// and www. links

for file in files:
    # Input/output file paths: <file>.txt is read, <file>.csv is written
    input_path = Path(f'./{file}.txt')
    output_csv = input_path.with_suffix('.csv')

    # Read and clean the raw text
    text = input_path.read_text(encoding='utf-8')

    # Remove bullet points, list markers, etc. at the start of lines
    text = re.sub(r"^\s*[\d\.\-\–•]+", "", text, flags=re.MULTILINE)

    # Drop lines shorter than 5 characters after stripping (e.g., headings)
    lines = [line.strip() for line in text.splitlines() if len(line.strip()) >= 5]

    # Join lines into one string
    text_combined = " ".join(lines)
    text_combined = re.sub(r'\s+', ' ', text_combined)  # normalize whitespace

    # Split into sentences: break on spaces that follow '.', '!' or '?'
    sentences = re.split(r'(?<=[.!?]) +', text_combined)

    # Clean each sentence
    cleaned_sentences = []
    for sentence in sentences:
        cleaned = ' '.join(sentence.split())  # normalize whitespace
        cleaned = link_pattern.sub('', cleaned)  # remove URLs
        #cleaned = hyphen_linebreak_pattern.sub(r'\1\2', cleaned)  # join hyphenated words
        cleaned = repeated_punct_pattern.sub('', cleaned)  # remove .... or ,,,
        cleaned = allowed_pattern.sub('', cleaned)  # remove disallowed characters
        cleaned = ' '.join(cleaned.split())  # normalize whitespace again
        if len(cleaned) > 25:  # keep only results longer than 25 characters
            cleaned_sentences.append(cleaned)

    # Save as CSV with a single 'text' column, one cleaned sentence per row
    df = pd.DataFrame({'text': cleaned_sentences})
    df.to_csv(output_csv, index=False)

    print(f"CSV saved as: {output_csv.name}")


CSV saved as: china2016_cleaned.csv


# Guided Topic Modeling

In [6]:
from bertopic import BERTopic
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
import numpy as np
from bertopic import BERTopic

def create_guided_topic_model():
    """
    Build an unfitted BERTopic model seeded with internet-governance themes.

    Every theme contributes one keyword list to ``seed_topic_list``. The
    model is also given a CountVectorizer over 1- to 3-grams and a
    ClassTfidfTransformer.

    Returns:
        BERTopic: Configured topic model with seed topics
    """
    # Theme label -> seed keywords. Only the keyword lists are passed to
    # BERTopic, in the order written here; the labels are for the reader.
    seed_keywords_by_theme = {
        "Openness & Freedom": [
            "open", "openness", "free", "freedom", "free flow of information",
            "access", "accessibility", "transparency", "interoperable", "common", "sharing",
        ],
        "Human Rights & Democracy": [
            "freedom of expression", "privacy", "democratic", "human rights", "civil liberties",
        ],
        "Governance & Cooperation": [
            "international cooperation", "global governance", "shared responsibility",
            "inclusive", "collaboration",
        ],
        "Innovation & Economy": [
            "competition", "innovation", "entrepreneurship", "investment", "economic development",
        ],
        "Regulatory Norms": [
            "net neutrality", "no censorship", "device neutrality", "rule of law",
        ],
        "Sovereignty & Control": [
            "sovereignty", "cyberspace sovereignty", "national territory",
            "territorial jurisdiction", "control", "manage", "govern",
        ],
        "Security & Stability": [
            "national security", "cybersecurity", "information security",
            "public order", "regime stability", "cyber threats",
        ],
        "Legal & Regulatory Power": [
            "formulate laws", "legal measures", "constitutional authority",
            "censorship", "information management",
        ],
        "Protectionism & Defense": [
            "safeguard", "protect", "defend", "uphold", "counter threats", "prevent subversion",
        ],
        "Ideological & Nationalistic Framing": [
            "foreign interference", "ideological security", "online subversion",
            "cultural values", "strategic stability",
        ],
        "Exclusive Framing": [
            "within our borders", "according to national laws", "no foreign interference",
        ],
    }
    seed_topic_list = list(seed_keywords_by_theme.values())

    # Unigrams through trigrams so multi-word phrases can appear as terms.
    vectorizer_settings = {
        "ngram_range": (1, 3),
        "stop_words": "english",
        "min_df": 2,      # minimum document frequency
        "max_df": 0.95,   # maximum document frequency
    }
    vectorizer_model = CountVectorizer(**vectorizer_settings)

    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

    # Assemble the guided model; it is returned unfitted.
    return BERTopic(
        seed_topic_list=seed_topic_list,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        verbose=True,
        calculate_probabilities=True,
    )

def fit_and_analyze_topics(docs, topic_model):
    """
    Fit the topic model on ``docs`` and print a short summary.

    Args:
        docs (list): List of documents to analyze
        topic_model (BERTopic): Configured topic model

    Returns:
        tuple: (topics, probabilities, topic_model)
    """

    print("Fitting topic model...")
    topics, probs = topic_model.fit_transform(docs)

    # Label -1 is the outlier bucket, not a real topic, so it is left out of
    # the reported topic count.
    n_topics = len(set(topics) - {-1})
    print(f"Number of topics found: {n_topics}")
    print(f"Number of documents: {len(docs)}")

    return topics, probs, topic_model

def analyze_results(topic_model, topics, docs):
    """
    Print a summary of the fitted topics and build a per-document table.

    Args:
        topic_model (BERTopic): Fitted topic model
        topics (list): Topic assignments for each document
        docs (list): Original documents

    Returns:
        pd.DataFrame: One row per document with the columns 'Document',
        'Topic' and 'Topic_Name'. Topics that are missing from the model's
        topic info are named 'Outlier'.
    """

    # Fetched once and reused below; nothing in between modifies the model.
    topic_info = topic_model.get_topic_info()
    print("\nTopic Information:")
    print(topic_info)

    # Show top words for each topic
    print("\nTop words per topic:")
    for topic_id in topic_info['Topic']:
        if topic_id == -1:  # Skip outlier topic
            continue
        words = topic_model.get_topic(topic_id)
        print(f"\nTopic {topic_id}:")
        print([word for word, _ in words])

    print('******* Print topics: ')
    print(topic_info)

    # Build the topic id -> name lookup once instead of scanning the
    # DataFrame for every document. setdefault keeps the first name seen
    # for an id, matching a first-match lookup.
    name_by_topic = {}
    for topic_id, name in zip(topic_info['Topic'], topic_info['Name']):
        name_by_topic.setdefault(topic_id, name)
    topic_names = [name_by_topic.get(topic, 'Outlier') for topic in topics]

    # Create document-topic dataframe
    doc_topic_df = pd.DataFrame({
        'Document': docs,
        'Topic': topics,
        'Topic_Name': topic_names
    })

    # Show topic distribution
    topic_counts = doc_topic_df['Topic'].value_counts().sort_index()
    print("\nTopic Distribution:")
    print(topic_counts)

    return doc_topic_df

def save_results(topic_model, doc_topic_df, output_path='guided_topic_modeling'):
    """
    Save topic modeling results to files.

    Three files are written, each named with ``output_path`` as prefix:
    ``<output_path>_topic_info.csv``, ``<output_path>_doc_topics.csv`` and
    ``<output_path>_topic_words.txt``.

    Args:
        topic_model (BERTopic): Fitted topic model
        doc_topic_df (pd.DataFrame): Document-topic assignments
        output_path (str): Base path for output files; defaults to the
            previously hard-coded prefix 'guided_topic_modeling'
    """
    # Save topic information
    topic_info = topic_model.get_topic_info()
    topic_info.to_csv(f"{output_path}_topic_info.csv", index=False)

    # Save document-topic assignments
    doc_topic_df.to_csv(f"{output_path}_doc_topics.csv", index=False)

    # Save detailed topic words. The encoding is explicit so the file does
    # not depend on the platform's default encoding.
    with open(f"{output_path}_topic_words.txt", "w", encoding="utf-8") as f:
        for topic_id in topic_info['Topic']:
            if topic_id == -1:  # the outlier topic has no word list written
                continue
            words = topic_model.get_topic(topic_id)
            f.write(f"Topic {topic_id}:\n")
            f.write(", ".join(f"{word} ({score:.3f})" for word, score in words))
            f.write("\n\n")

    print(f"Results saved to files with prefix: {output_path}")

In [3]:
from bertopic import BERTopic

In [9]:
def main_pipeline(docs, save_outputs=True):
    """
    Main pipeline function to run guided topic modeling.

    Creates the guided model, fits it on ``docs``, prints the analysis and
    optionally writes the result files.

    Args:
        docs (list): List of documents to analyze
        save_outputs (bool): Whether to save results to files. Defaults to
            True, which matches the previous always-save behaviour.

    Returns:
        tuple: (topic_model, topics, probabilities, doc_topic_df)
    """

    # Create topic model
    topic_model = create_guided_topic_model()

    # Fit model and get results
    topics, probs, fitted_model = fit_and_analyze_topics(docs, topic_model)

    # Analyze results
    doc_topic_df = analyze_results(fitted_model, topics, docs)

    # Save results if requested
    if save_outputs:
        save_results(fitted_model, doc_topic_df)

    return fitted_model, topics, probs, doc_topic_df

In [10]:
# Corpora used in this notebook; the run below reads the South Africa CSV.
filenames = ['india2013', 'china2016', 'southafrica2015', 'netherland2011']

# Documents are taken from the 'text' column of the CSV.
docs = pd.read_csv('southafrica2015.csv').loc[:, 'text'].to_list()
topic_model, topics, probabilities, results_df = main_pipeline(docs)

# Optional: Generate visualizations
# topic_model.visualize_topics()
# topic_model.visualize_hierarchy()
# topic_model.visualize_barchart()

2025-07-04 18:13:35,615 - BERTopic - Embedding - Transforming documents to embeddings.


Fitting topic model...
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/wangcancan/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/2n/zg9tdyh15bq7k_499wgv2jgr0000gn/T/ipykernel_94692/3881583056.py", line 4, in <module>
    topic_model, topics, probabilities, results_df = main_pipeline(docs)
                                                     ^^^^^^^^^^^^^^^^^^^
  File "/var/folders/2n/zg9tdyh15bq7k_499wgv2jgr0000gn/T/ipykernel_94692/1942867647.py", line 17, in main_pipeline
    topics, probs, fitted_model = fit_and_analyze_topics(docs, topic_model)
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/2n/zg9tdyh15bq7k_499wgv2jgr0000gn/T/ipykernel_94692/3961726287.py", line 93, in fit_and_analyze_topics
    topics, probs = topic_model.fit_transform(docs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/wangcancan/anaconda3/li