# In [1]:
import warnings
warnings.filterwarnings('ignore')
import scrapy
from scrapy.crawler import CrawlerProcess
import re
import pandas as pd
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse


# In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import re


def parse_word(response):
    """Fill in lexicon and meaning fields for the word attached to a response.

    The dict stored under ``response.meta['word']`` is updated in place with
    the keys ``lexicon_0`` .. ``lexicon_4`` and ``meaning_0`` .. ``meaning_3``
    and is then yielded once.

    Lexicons:
        * ``lexicon_0`` is the first text node after the 2nd ``<br>`` child of
          ``<body>``; when that text is whitespace only, the text after the
          3rd ``<br>`` is used instead. It is stored unstripped.
        * ``lexicon_1`` .. ``lexicon_4`` are the stripped first text nodes
          after the 1st .. 4th ``<hr>`` child of ``<body>``, or ``None`` when
          missing or blank.

    Meanings:
        * If one of ``lexicon_0`` .. ``lexicon_3`` contains ``"verb"`` and has
          at least three space-separated tokens, the third token is used as
          the verb form. Meanings are then the ``<span class="mg1">`` siblings
          following the ``<span class="bin">`` whose text equals that form,
          skipping the first match and dropping blank or purely numeric
          entries.
        * Otherwise meanings are the stripped texts of every
          ``<span class="mgP">``.
        * At most four meanings are kept; unused slots are set to ``None``.

    Args:
        response: Object exposing ``meta`` (a dict holding the ``'word'``
            dict) and an ``xpath(query, **variables)`` method whose result
            offers ``get()`` and ``getall()``.

    Yields:
        dict: The same ``word`` dict, after it has been populated.
    """
    max_meanings = 4
    word = response.meta['word']

    # lexicon_0: fall back to the next <br> only when the first hit is blank.
    lexicon_0 = response.xpath('//body/br[2]/following-sibling::text()').get()
    if lexicon_0 and not lexicon_0.strip():
        lexicon_0 = response.xpath('//body/br[3]/following-sibling::text()').get()
    word['lexicon_0'] = lexicon_0

    for i in range(1, 5):
        text = response.xpath(f'//body/hr[{i}]/following-sibling::text()').get()
        stripped = text.strip() if text else ''
        word[f'lexicon_{i}'] = stripped or None

    # Look for the first lexicon that mentions "verb" and carries a third
    # token. Entries that are too short are skipped instead of raising
    # IndexError.
    # NOTE(review): the substring test also matches words such as "adverb";
    # confirm against the site's lexicon format.
    verb_form = None
    for key in ('lexicon_0', 'lexicon_1', 'lexicon_2', 'lexicon_3'):
        lexicon = word.get(key)
        if lexicon and 'verb' in lexicon:
            tokens = lexicon.strip().split(' ')
            if len(tokens) > 2:
                verb_form = tokens[2]
                break

    if verb_form:
        # The form is scraped text, so it is bound as an XPath variable
        # rather than spliced into the expression; a quote character in it
        # can no longer break the query.
        raw = response.xpath(
            '(//span[@class="bin" and text()=$form]'
            '/following-sibling::span[@class="mg1"])[position()>1]/text()',
            form=verb_form,
        ).getall()
        stripped_texts = (t.strip() for t in raw)
        # Drop blanks and pure numbers.
        meanings = [t for t in stripped_texts if t and not re.match(r'^\d+$', t)]
    else:
        meanings = [
            m.strip()
            for m in response.xpath('//span[@class="mgP"]/text()').getall()
        ]

    # Always emit exactly max_meanings columns, padding with None.
    for i in range(max_meanings):
        word[f'meaning_{i}'] = meanings[i] if i < len(meanings) else None

    yield word


class CalWordSpider(scrapy.Spider):
    """Spider that visits every word link found on one cal.huc.edu chapter page.

    Each link in the second column of the chapter table becomes a request
    whose response is handled by the module-level ``parse_word`` callback.
    """

    name = 'cal'
    start_urls = [
        'https://cal.huc.edu/get_a_chapter.php?file=71009'
    ]

    def parse(self, response):
        """Yield one request per word link in the chapter table.

        Args:
            response: The response for one of ``start_urls``.

        Yields:
            scrapy.Request: A request for the word's page. The word dict
            (``text`` = link text, ``url`` = the href exactly as written in
            the page) travels along in ``meta['word']``.
        """
        for el in response.css('tr > td:nth-child(2) > a'):
            href = el.xpath('@href').get()
            if not href:
                # Without an href there is no word page to fetch; building a
                # URL from None would only request a bogus ".../None" page.
                continue

            word = {
                'text': el.css('::text').get(),
                'url': href,
            }
            yield scrapy.Request(
                # Resolve against the page URL so relative, root-relative and
                # absolute hrefs are all handled correctly.
                url=response.urljoin(href),
                meta={'word': word},
                callback=parse_word,
            )


# Configure a single JSON feed export and launch the crawl.
# The output file is replaced on every run ('overwrite': True).
feeds = {
    'Masekhet_tan_checkk.json': {'format': 'json', 'overwrite': True},
}
process = CrawlerProcess(settings={'FEEDS': feeds})

# Schedule the spider, then start the crawl.
process.crawl(CalWordSpider)
process.start()