In [1]:
from xml.sax.saxutils import escape

from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet

# Read the content of the text file. The encoding is given explicitly so the
# result does not depend on the platform's default locale encoding.
with open('ait_paragraphs.txt', 'r', encoding='utf-8') as file:
    text_content = file.read()

# Split the content into paragraphs: one paragraph per line of the file
paragraphs = text_content.split('\n')

# Create a PDF document (US letter pages, ReportLab's sample style sheet)
pdf_filename = 'ait_paragraphs.pdf'
doc = SimpleDocTemplate(pdf_filename, pagesize=letter)
styles = getSampleStyleSheet()
story = []

# Add each paragraph to the story
for paragraph in paragraphs:
    # Paragraph parses its text as XML-like markup, so a literal '&', '<' or
    # '>' in the plain-text input has to be escaped; otherwise it is read as
    # (possibly malformed) markup instead of being printed.
    p = Paragraph(escape(paragraph), styles['Normal'])
    story.append(p)

# Build the PDF document; this is the step that writes pdf_filename to disk
doc.build(story)

print(f"PDF file '{pdf_filename}' created successfully.")


PDF file 'ait_paragraphs.pdf' created successfully.


In [2]:
import requests
from bs4 import BeautifulSoup
import os
import re
from urllib.parse import urlparse, urljoin

# Function to extract only English words from text
def extract_english_words(text):
    """Return the ASCII-alphabetic words found in the text content of ``text``.

    Args:
        text: HTML markup (or plain text) as a string.

    Returns:
        list[str]: runs of the letters A-Z / a-z that are delimited by word
        boundaries, in document order. Tokens that touch a digit, an
        underscore or a non-ASCII letter are not matched, because there is
        no word boundary between those characters.
    """
    soup = BeautifulSoup(text, "html.parser")

    # The bodies of <script> and <style> are code, not page text; drop them so
    # JavaScript/CSS identifiers are not reported as words.
    for tag in soup(["script", "style"]):
        tag.decompose()

    # Without a separator, text from adjacent elements is concatenated
    # ("<p>one</p><p>two</p>" -> "onetwo") and would be matched as one word.
    # Trade-off: a word split across inline tags ("<b>Hel</b>lo") now yields
    # two tokens.
    visible_text = soup.get_text(separator=" ")

    # Tokenize and keep only purely alphabetic ASCII words
    return re.findall(r'\b[A-Za-z]+\b', visible_text)

# Function to convert words list into paragraphs
def convert_to_paragraphs(words, max_words_per_paragraph=10):
    """Group ``words`` into space-joined paragraphs of limited length.

    Words are accumulated in order; as soon as the accumulated text contains
    at least ``max_words_per_paragraph`` whitespace-separated tokens it is
    emitted as one paragraph. Whatever is left over at the end becomes the
    final (shorter) paragraph.

    Args:
        words: Iterable of strings.
        max_words_per_paragraph: Token count at which a paragraph is closed.

    Returns:
        list[str]: the paragraphs, each stripped of surrounding whitespace.
    """
    chunks = []
    pending = ""
    for token in words:
        pending = pending + (token + " ")
        # Keep collecting until the buffer holds enough tokens.
        if len(pending.split()) < max_words_per_paragraph:
            continue
        chunks.append(pending.strip())
        pending = ""

    # Flush the trailing, partially filled paragraph.
    if pending:
        chunks.append(pending.strip())
    return chunks

# Function for recursive crawling with depth and URL limits
def crawl(url, domain, visited, max_depth=3, max_urls=100, timeout=10):
    """Recursively crawl same-domain pages from ``url`` and save each page's words.

    Each fetched HTML page is reduced to its English words with
    ``extract_english_words`` and written out with ``save_document``. Links
    are then followed depth-first. Because ``visited`` is shared across the
    whole traversal, a page is fetched at most once, at whatever depth it is
    first reached.

    Args:
        url: Absolute URL of the page to fetch.
        domain: Network location (``urlparse(...).netloc``) a link must have
            to be followed.
        visited: Set of URLs already attempted; updated in place.
        max_depth: Remaining depth budget; links are followed only while
            this is greater than 1.
        max_urls: Crawling stops adding pages once ``visited`` holds this
            many URLs.
        timeout: Seconds to wait for the server before giving up on a page.

    Returns:
        None. Failures on a page are printed and do not stop the crawl.
    """
    if url in visited or len(visited) >= max_urls:
        return
    visited.add(url)

    print(f"Crawling: {url}")

    try:
        # A timeout keeps one unresponsive server from hanging the whole crawl.
        response = requests.get(url, timeout=timeout)
        # Do not store 4xx/5xx error pages as if they were documents.
        response.raise_for_status()

        # Binary resources (PDFs, images, ...) carry no page text to extract.
        content_type = response.headers.get('Content-Type', '')
        if content_type and 'html' not in content_type.lower():
            print(f"Skipping non-HTML content: {url}")
            return

        html_content = response.content
        # errors='replace' keeps a single bad byte from discarding the page.
        words = extract_english_words(html_content.decode('utf-8', errors='replace'))
        # Save document with source URL
        save_document(url, words)

        # Depth budget exhausted: no link on this page would be followed, so
        # skip parsing the page for links altogether.
        if max_depth <= 1:
            return

        soup = BeautifulSoup(html_content, 'html.parser')
        for link in soup.find_all('a', href=True):
            # Resolve relative links against the current page.
            target = urlparse(urljoin(url, link['href']))
            if target.scheme not in ('http', 'https') or target.netloc != domain:
                continue
            # Drop the fragment so "page#a" and "page#b" count as one URL.
            next_url = target._replace(fragment='').geturl()
            crawl(next_url, domain, visited, max_depth - 1, max_urls, timeout)
    except Exception as e:
        # Deliberately broad: crawling is best-effort, so any failure on one
        # page is reported and the traversal continues with the next link.
        print(f"Error crawling {url}: {e}")

# Function to save document with source URL
def save_document(url, words):
    """Write ``words``, one per line, to a text file named after ``url``.

    The file is placed in ``<netloc>_documents/`` relative to the current
    working directory (created on demand). Its name is ``url`` with every
    '/' replaced by '_' plus a ``.txt`` suffix; an existing file of that
    name is overwritten.

    Args:
        url: Source URL of the document.
        words: Iterable of strings to store.

    Raises:
        OSError: if the folder or file cannot be created or written.
    """
    # Extract domain name from URL and derive the per-domain folder
    folder_name = f"{urlparse(url).netloc}_documents"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(folder_name, exist_ok=True)

    # NOTE(review): only '/' is replaced, so characters such as ':' or '?'
    # from the URL remain in the file name; that is not portable to every
    # filesystem, and very long URLs can exceed the file-name length limit.
    # The naming scheme is kept as-is so previously written files still match.
    file_name = os.path.join(folder_name, f"{url.replace('/', '_')}.txt")

    # Explicit encoding so the output does not depend on the locale default.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write('\n'.join(words))

# Starting URL
start_url = 'https://ait.ac.th/'

# The crawl never leaves the network location of the seed page.
domain = urlparse(start_url).netloc

# Shared bookkeeping: every URL the crawler has attempted ends up in here,
# which is what prevents a page from being fetched twice.
visited = set()

# Kick off the recursive crawl, bounded both in depth and in page count.
crawl(
    url=start_url,
    domain=domain,
    visited=visited,
    max_depth=3,
    max_urls=100,
)

print("Documents saved in separate folders.")


Crawling: https://ait.ac.th/
Crawling: https://ait.ac.th/ait-boi-science-and-technology-park/
Crawling: https://ait.ac.th/contact/
Crawling: https://ait.ac.th/alumni/
Crawling: https://ait.ac.th/about/
Crawling: https://ait.ac.th/about/facts-and-figures/
Crawling: https://ait.ac.th/about/rankings/
Crawling: https://ait.ac.th/about/leadership/
Crawling: https://ait.ac.th/about/meet-our-faculty/
Crawling: https://ait.ac.th/about/meet-our-staff/
Crawling: https://ait.ac.th/about/location/
Crawling: https://ait.ac.th/academics/calendar/
Crawling: https://ait.ac.th/academics/programs/
Crawling: https://ait.ac.th/academics/study-options/
Crawling: https://ait.ac.th/academics/student-opportunities/
Crawling: https://ait.ac.th/academics/schools/
Crawling: https://ait.ac.th/admissions/
Crawling: https://ait.ac.th/eligibility/
Crawling: https://ait.ac.th/financial-aid/
Crawling: https://ait.ac.th/tuition-and-fees/
Crawling: https://ait.ac.th/student-housing/
Crawling: https://ait.ac.th/apply-onl