In [1]:
from bs4 import BeautifulSoup
import requests
import re
import random
import numpy as np

In [85]:
def translate_latex(latex):
    """Flatten simple LaTeX markup in ``latex`` into plain text.

    A fixed sequence of regular-expression substitutions is applied, each
    one operating on the result of the previous one:

    1. ``$...$``        -> the text between the dollar signs
    2. ``\\cmd{arg}``    -> ``arg``
    3. ``^{...}``       -> ``^...``
    4. ``_{...}``       -> ``_...``
    5. ``$...$``        -> (again) the text between the dollar signs
    6. ``\\cmd``         -> ``cmd``
    7. ``{...}``        -> ``...``
    8. ``^{...}``       -> (again) ``^...``
    9. ``_{...}``       -> (again) ``_...``

    Returns the rewritten string.
    """
    # Rules that appear more than once in the pipeline are named so the
    # repetition is explicit; every replacement keeps only capture group 1.
    strip_dollars = (r"\$(.*?)\$", r"\1")
    unwrap_superscript = (r"\^\{(.*?)\}", r"^\1")
    unwrap_subscript = (r"_\{(.*?)\}", r"_\1")

    # Order matters: later passes see the output of earlier ones, and the
    # repeated passes can pick up matches exposed by brace removal.
    passes = (
        strip_dollars,
        (r"\\[a-zA-Z]+\{([^\}]*)\}", r"\1"),
        unwrap_superscript,
        unwrap_subscript,
        strip_dollars,
        (r"\\([a-zA-Z]+)", r"\1"),
        (r"\{([^}]*)\}", r"\1"),
        unwrap_superscript,
        unwrap_subscript,
    )

    text = latex
    for regex, template in passes:
        text = re.sub(regex, template, text)
    return text

def cross_val_parser(URL):
    """Download a question page and extract its title, body and answers.

    Parameters
    ----------
    URL : str
        Address of the question page to fetch with ``requests.get``.

    Returns
    -------
    list
        ``[title, body, answers]``:

        * ``title``   - text of the first ``<h1>`` inside the
          ``div`` with id ``question-header``;
        * ``body``    - concatenated text of every ``<p>`` in the first
          ``div`` whose class is ``s-prose js-post-body``;
        * ``answers`` - list of answer texts, accepted answer first when
          one exists; empty when the page has no answers.

        All strings are passed through ``translate_latex``.

    Raises
    ------
    IndexError
        If the page has no ``question-header``/``<h1>`` or no
        ``s-prose js-post-body`` div (i.e. it is not a question page).
    """
    response = requests.get(url=URL)
    soup = BeautifulSoup(response.content, 'html.parser')

    title = soup.find_all('div', {'id': 'question-header'})[0].find_all('h1')[0].get_text()
    question_paragraphs = soup.find_all('div', {'class': 's-prose js-post-body'})[0].find_all('p')
    question_text = ''.join(paragraph.get_text() for paragraph in question_paragraphs)
    parsed = [translate_latex(title), translate_latex(question_text)]

    # Missing elements are handled with explicit None checks.  The lookups
    # here used to be ``find_all(...)[0]``, which raises IndexError (not
    # KeyError) when nothing matches, so a question without an accepted
    # answer used to crash instead of being skipped.
    answers = []
    answers_section = soup.find('div', {'id': 'answers'})
    if answers_section is not None:
        accepted = answers_section.find(
            'div', {'class': 'answer js-answer accepted-answer js-accepted-answer'})
        accepted_body = None
        if accepted is not None:
            accepted_body = accepted.find('div', {'class': 's-prose js-post-body'})
        first_paragraph = accepted_body.find('p') if accepted_body is not None else None
        # NOTE(review): only the first <p> of the accepted answer is kept,
        # whereas other answers keep their whole body - confirm this is intended.
        if first_paragraph is not None:
            answers.append(translate_latex(first_paragraph.get_text()))

        # The class string is matched exactly, so the accepted answer (which
        # carries extra classes) is not collected a second time here.
        for element in answers_section.find_all('div', {'class': 'answer js-answer'}):
            body = element.find('div', {'class': 's-prose js-post-body'})
            if body is None:
                continue
            # Text of the whole answer body (all nested strings joined).
            answers.append(translate_latex(body.get_text()))

    # Always appended, so the result has the same three-element shape
    # whether or not the question has answers.
    parsed.append(answers)
    return parsed

In [103]:
def cross_val_crawler(URL):
    """Collect the "related questions" links from a question page.

    Parameters
    ----------
    URL : str
        Address of the question page to fetch with ``requests.get``.

    Returns
    -------
    list of str
        Absolute URLs built by prefixing ``https://stats.stackexchange.com``
        to the ``href`` of the second ``<a>`` in each ``div.spacer`` found
        under ``div.module.sidebar-related`` ->
        ``div.related.js-gps-related-questions``.  Empty when the sidebar is
        absent; entries without a usable link are skipped.
    """
    domain = 'https://stats.stackexchange.com'
    response = requests.get(url=URL)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Guard each container explicitly: indexing an empty find_all() result
    # raises IndexError, which the previous ``except KeyError`` never caught.
    sidebar = soup.find('div', {'class': 'module sidebar-related'})
    if sidebar is None:
        return []
    related = sidebar.find('div', {'class': 'related js-gps-related-questions'})
    if related is None:
        return []

    urls = []
    for entry in related.find_all('div', {'class': 'spacer'}):
        anchors = entry.find_all('a')
        # The question link is taken from the second anchor of each entry
        # (presumably the first one is the score badge - TODO confirm).
        if len(anchors) < 2:
            continue
        href = anchors[1].get('href')
        if href is None:
            # Skip just this entry instead of abandoning the remaining links.
            continue
        urls.append(domain + str(href))
    return urls
    
def cross_val_surfer(init, trigger):
    """Crawl outward from ``init`` through related-question links.

    The start page is always parsed.  Links returned by
    ``cross_val_crawler`` are then visited breadth-first, each URL at most
    once, until ``trigger`` pages have been parsed or no unvisited link
    remains.

    Parameters
    ----------
    init : str
        URL of the first question page.
    trigger : int
        Stop once the output holds this many parsed pages.

    Returns
    -------
    list
        One ``cross_val_parser`` result per visited page, in visiting order.
    """
    output = [cross_val_parser(init)]
    visited = {init}   # every URL already parsed or already queued
    pending = []       # FIFO frontier of URLs still to parse

    def enqueue(links):
        # Queue only links not seen before so no page is parsed twice.
        for link in links:
            if link not in visited:
                visited.add(link)
                pending.append(link)

    enqueue(cross_val_crawler(init))
    while len(output) < trigger and pending:
        # Actually remove the URL from the frontier; popping from a
        # temporary copy left the frontier unchanged and re-parsed the same
        # page on every iteration.
        url = pending.pop(0)
        output.append(cross_val_parser(url))
        # Extend the frontier in place; set.union() returns a new set and
        # its result was being discarded.
        enqueue(cross_val_crawler(url))
    return output