# In [None]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

# Path to the Edge WebDriver executable; adjust it to the local install location.
edge_service = Service(r"C:\Users\asus\Downloads\edgedriver_win64 (1)\msedgedriver.exe")

# URL of the news feed page to scrape.
base_url = "https://www.nur.kz/latest/"

# Maps the reaction image's file basename (as parsed from the <img> src in
# main) to the reaction label stored in the collected data.
reaction_types = {
    "thumbs-up": "Like",
    "red-heart": "Love",
    "grinning-face-with-smiling-eyes": "Joy",
    "face-with-open-mouth": "Surprise",
    "crying-face": "Sadness",
    "enraged-face": "Anger"
}

def filter_reactions(reactions):
    """
    Rank reaction labels by their total count, leaving out "Love".

    `reactions` is an iterable of (label, count) pairs. Counts belonging to
    the same label are added together, and "Love" entries are discarded.

    Returns a list of the remaining labels ordered from the highest total
    to the lowest; labels with equal totals are ordered alphabetically.
    The first element is therefore the dominant reaction.
    """
    totals = {}

    for label, amount in reactions:
        if label != "Love":
            totals[label] = totals.get(label, 0) + amount

    def rank(label):
        # Negated total gives descending order; the label itself breaks ties.
        return (-totals[label], label)

    return sorted(totals, key=rank)


def _extract_reactions(article_soup):
    """Return ``(reaction_label, count)`` pairs parsed from an article page.

    Looks up the reactions container and its buttons by their CSS classes.
    A button contributes a pair only when it contains an ``<img>``, its text
    is a positive integer, and the image name is a key of ``reaction_types``.
    Returns an empty list when the container is not present.
    """
    reactions_div = article_soup.find(
        'div',
        class_="article-reactions-list-module__list--3z2jO article-reactions-list-module__primary--3HbSH",
    )
    if not reactions_div:
        return []

    reactions = []
    for button in reactions_div.find_all('button', class_="article-reaction-module__button--1fFZ4"):
        img = button.find('img')
        count = button.text.strip()
        if not (img and count.isdigit() and int(count) > 0):
            continue
        # Last path segment of the image URL, cut at the first dot, names the emoji.
        reaction_img = img['src'].split("/")[-1].split(".")[0]
        reaction_type = reaction_types.get(reaction_img)
        if reaction_type is not None:
            reactions.append((reaction_type, int(count)))
    return reactions


def _close_article_tabs(driver, main_window):
    """Close every window except ``main_window`` and give it focus again."""
    for handle in driver.window_handles:
        if handle != main_window:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_window)


def main(base_url):
    """Scrape the news feed at ``base_url`` and append the results to a file.

    Opens the feed in Edge, scrolls to the bottom ``scroll_limit`` times, and
    then visits every news card (except those in the "калейдоскоп" category)
    in a separate tab to collect the article text and its reactions.

    Each record is a dict with the keys "Жаналык" (headline), "Категория"
    (category), "Текст" (article text) and, when any reactions were found,
    "Реакции" (labels ordered by ``filter_reactions``). Records are appended
    to ``dataset_news_reaction.txt`` as ``"<id>, <dict>"`` lines.

    A failure while processing one card is printed and does not stop the
    run. The browser is always shut down, even when an error escapes.
    """
    driver = webdriver.Edge(service=edge_service)
    try:
        driver.get(base_url)

        collected_data = {}
        record_id = 0
        scroll_pause_time = 2  # seconds to wait after each scroll
        scroll_limit = 1  # number of scroll-to-bottom passes

        # Scroll to the bottom of the page `scroll_limit` times.
        for _ in range(scroll_limit):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)

        # Extract the news cards from the feed page.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        cards = soup.find_all('li', class_="latest-news__article")

        if not cards:
            print("Данных для извлечения нет.")
            return

        # Remember the feed tab so focus can always be restored to it.
        main_window = driver.current_window_handle

        for card in cards:
            try:
                # Headline; cards without one are skipped.
                headline_element = card.find("a", class_="article-card__title js-article-link")
                if not headline_element:
                    continue
                headline = headline_element.text.strip().replace('(фото)', '').replace('(видео)', '').strip()

                # Skip cards in the "калейдоскоп" category.
                category_element = card.find("span", class_="article-card__category")
                category = category_element.text.strip() if category_element else "Неизвестно"
                if category.lower() == "калейдоскоп":
                    continue

                news_link = headline_element['href']
                record_id += 1
                record = {"Жаналык": headline, "Категория": category}
                collected_data[record_id] = record

                # Open the article in a new tab.
                driver.execute_script("window.open(arguments[0]);", news_link)
                try:
                    new_tabs = [h for h in driver.window_handles if h != main_window]
                    # IndexError here (no tab was opened) is reported by the
                    # per-card handler below.
                    driver.switch_to.window(new_tabs[-1])
                    time.sleep(3)

                    # Article text.
                    inner_soup = BeautifulSoup(driver.page_source, 'lxml')
                    news_text_elements = inner_soup.find_all('p', class_='align-left formatted-body__paragraph')
                    if news_text_elements:
                        news_text = ' '.join(p.get_text().strip() for p in news_text_elements)
                    else:
                        news_text = "Текст отсутствует"
                    record["Текст"] = news_text

                    # Reactions.
                    reactions = _extract_reactions(inner_soup)
                    if reactions:
                        record["Реакции"] = ", ".join(filter_reactions(reactions))
                finally:
                    # Close the article tab even when parsing failed; otherwise
                    # the driver stays focused on a stale tab and later cards
                    # would be scraped from the wrong page.
                    _close_article_tabs(driver, main_window)

                print(f"Собрано: {headline} с реакциями {record.get('Реакции', 'Нет реакций')}")

                time.sleep(2)

            except Exception as e:
                # Best effort: report the failing card and move on to the next.
                print(f"Ошибка при обработке карточки: {e}")

        # Append the collected records to the output file.
        with open('dataset_news_reaction.txt', 'a', encoding='utf-8') as file:
            for key, value in collected_data.items():
                file.write(f'{key}, {value}\n')
            print("Данные сохранены.")
    finally:
        # Always terminate the browser and driver process.
        driver.quit()

# Run the scraper against the configured feed URL.
main(base_url)
