In [1]:
import requests
import re
import csv
import time
import random as rd
from bs4 import BeautifulSoup
from tqdm import tqdm

In [13]:
def get_texts_by_read_link(link):
    """
    Download one discussion page and return the raw text of every post on it.

    link: path relative to https://www.e1.ru/talk/forum/, for example
          'read.php?f=58&i=657840&t=657840'.

    Returns a list of strings, one per table that contains a bold 'Автор:'
    label. Each string is the table text that follows the first 'Дата...'
    line; the texts are not preprocessed yet. Tables that have no 'Дата'
    line are skipped.

    Network errors raised by requests.get (timeouts, connection failures)
    are not caught here and propagate to the caller.
    """
    #getting html
    main_link = f'https://www.e1.ru/talk/forum/{link}'
    #setting sleep time so that it does not look like an attack
    time.sleep(rd.randint(3,8))

    html_response = requests.get(main_link,timeout=30)
    html_soup = BeautifulSoup(html_response.text,'html.parser')

    #the text discussions are included in tables marked with an author label
    discussion_tables = [table for table in html_soup.find_all('table')
                         if table.find_all('b',string = 'Автор:')]

    #getting the texts out of discussions
    the_texts_list = []
    for one_table in discussion_tables:
        the_text = one_table.text
        #the post body starts right after the line that holds the date
        date_line = re.search(r'Дата.*\n',the_text)
        if date_line is None:
            #no date line in this table: skip it instead of crashing the whole scrape
            continue
        the_texts_list.append(the_text[date_line.end():])

    return the_texts_list
    


In [12]:
#sanity check: fetch one known discussion page and look at the raw post texts
res = get_texts_by_read_link('read.php?f=58&i=657840&t=657840')
res

['            В России большинство, наверно, считают что работает "право сильного" в геополитике: если например Россия побеждает в войне Украину, то это является аргументом, что украинский майдан - дело дохлое. В этой связи хочу напомнить фрагмент фильма "Кин-Дза-Дза":\n\n\n\r\nЭто \n очень актуальная тема: в будущем наверняка будет придумано супероружие, с которым страна, которая нападает первой, всегда выигрывает. Вам не кажется, что человечество должно строго запретить большие войны, наказывать тех кто их начинает, а иначе оно просто не проживёт дольше 300 лет?        \n\n\n\n1/16\xa0|\xa0\n\n\xa0\xa0|\xa0\xa0Поделиться:\xa0\xa0\n\n\n\n\n\n\n\n                        Re: "Право сильного"\xa0\n                    \n\xa0#657843\xa0\n        \n\nнаверх\n\n\n\n\n\n\n\nАвтор:\xa0AKR_\xa0\xa0\xa0(О пользователе) \nДата:\xa0\xa0\xa011 марта 2024 12:08\n            Поддержу.\r\nСрочно запретите Америку! \r\nИ в Спортлото отпишитесь. \r\nТам входящие регистрируют.        \n\n\n\n16/0\xa0|\xa

In [18]:
def get_hrefs(link):
    """
    Collect the discussion links and the next-page link from one listing page.

    link: path relative to https://www.e1.ru/talk/forum/, for example
          'list.php?f=58&t=2'.

    Returns a dictionary with the keys 'read' and 'next':
      'read' - list of hrefs that start with 'read', in page order
      'next' - 'list.php' followed by the href of the anchor whose id is
               'next_page', or None when the page has no such anchor

    Network errors raised by requests.get are not caught here.
    """
    #same pause as for the discussion pages, so that it does not look like an attack
    time.sleep(rd.randint(2,5))

    #getting parsed html and a tags that contain hrefs
    main_link = requests.get(f'https://www.e1.ru/talk/forum/{link}',timeout=20)
    html_parsed = BeautifulSoup(main_link.text,'html.parser')

    #creating a dictionary
    main_dict = {'read':[],'next': None}
    #looping
    for one_a in html_parsed.find_all('a'):
        href_ = one_a.get('href')
        if href_ is None:
            #anchors without an href (e.g. named anchors) carry no link to follow
            continue

        if href_.startswith('read'):
            main_dict['read'].append(href_)

        if one_a.get('id') == 'next_page':
            main_dict['next'] = 'list.php' + href_

    return main_dict


In [17]:
#sanity check: discussion links and next-page link found on one listing page
get_hrefs('list.php?f=58&t=2')

{'read': ['read.php?f=58&i=603228&t=603228',
  'read.php?f=58&i=575001&t=575001',
  'read.php?f=58&i=575001&t=575001&page=0',
  'read.php?f=58&i=575001&t=575001&page=1',
  'read.php?f=58&i=575001&t=575001&page=2',
  'read.php?f=58&i=575001&t=575001&page=3',
  'read.php?f=58&i=575001&t=575001&page=4',
  'read.php?f=58&i=575001&t=575001&page=5',
  'read.php?f=58&i=575001&t=575001&page=6',
  'read.php?f=58&i=572148&t=572148',
  'read.php?f=58&i=572148&t=572148&page=0',
  'read.php?f=58&i=572148&t=572148&page=1',
  'read.php?f=58&i=572148&t=572148&page=2',
  'read.php?f=58&i=572148&t=572148&page=3',
  'read.php?f=58&i=572148&t=572148&page=4',
  'read.php?f=58&i=572148&t=572148&page=14',
  'read.php?f=58&i=572148&t=572148&page=15',
  'read.php?f=58&i=572148&t=572148&page=16',
  'read.php?f=58&i=572148&t=572148&page=17',
  'read.php?f=58&i=572148&t=572148&page=18',
  'read.php?f=58&i=546069&t=546069',
  'read.php?f=58&i=477132&t=477132',
  'read.php?f=58&i=477132&t=477132&page=0',
  'read.ph

In [19]:
def get_classes_with_links():
    """
    Return the main classes that we want to analyze along
    with the links to the first page with themes.

    The result is a list of tuples of the form (class, link_to_first_page),
    built from the first <a> of every <tr class="backgroundLight"> on the
    forum index page. Rows without a link are skipped.

    Network errors raised by requests.get are not caught here.
    """
    #get main page (we know already the link); timeout so a stalled server cannot hang the notebook
    main_html = requests.get('https://www.e1.ru/talk/forum/index.php?f=48',timeout=30)
    soup = BeautifulSoup(main_html.text,'html.parser')

    #get table rows that contain our classes
    list_of_trs = soup.find_all('tr',class_ = 'backgroundLight')

    #get hrefs and class names
    ret_list = []
    for one_tr in list_of_trs:
        the_a = one_tr.a
        if the_a is None:
            #a highlighted row without any link is not a class entry
            continue
        ret_list.append((the_a.text,the_a.get('href')))
    return ret_list

In [20]:
#starting point: (class name, first listing page) pairs; load_texts reads `start` as a global
start = get_classes_with_links()
start

[('Общество', 'list.php?f=60'),
 ('Растения', 'list.php?f=122'),
 ('Политика', 'list.php?f=58'),
 ('Семейные отношения', 'list.php?f=97'),
 ('Правовые вопросы', 'list.php?f=77'),
 ('Женский клуб', 'list.php?f=85'),
 ('Мужской клуб', 'list.php?f=87'),
 ('Дети и Родители', 'list.php?f=61'),
 ('Творчество', 'list.php?f=144')]

In [21]:
#creating a csv file, to which we want to load the data
#note: mode 'w' truncates data.csv, so re-running this cell discards rows scraped earlier
#newline='' is what the csv module expects for its files; utf-8 keeps the Cyrillic texts portable
with open('data.csv','w',newline='',encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file,delimiter=';')
    writer.writerow(['Text','Class'])

In [22]:
def load_texts(path = 'data.csv', classes = None, texts_limit = 5556):
    """
    Scrape the texts of every class and append them to a csv file.

    path:        csv file to append to; rows are written as [text, class]
                 with ';' as the delimiter.
    classes:     iterable of (class, link_to_first_page) pairs. When None,
                 the notebook-level variable `start` is used.
    texts_limit: maximum number of rows written per class.

    For each class the listing pages are followed through their 'next' link
    until either texts_limit rows were written or there is no next page.
    Returns None; the result is the content appended to `path`.
    """
    if classes is None:
        #default to the list produced by get_classes_with_links() in the notebook
        classes = start
    #looping through the main classes and their links
    for class_,enter_link in tqdm(classes,desc='Getting texts'):
        #needed parameters
        count = 0
        current_link = enter_link
        #adding texts of class until the limit, or until the listing has no further page
        while count < texts_limit and current_link is not None:
            #getting the dict with read and next pages
            path_dict = get_hrefs(current_link)
            #for each link that leads to discussions
            for read_link in path_dict['read']:
                if count >= texts_limit:
                    #limit reached in the middle of a listing page: no more requests needed
                    break
                all_texts = get_texts_by_read_link(read_link)
                #keep only as many texts as still fit under the limit
                all_texts = all_texts[:texts_limit - count]
                #reopen per discussion so finished pages are on disk if a later request fails
                with open(path,'a',newline='',encoding='utf-8') as csv_file:
                    writer = csv.writer(csv_file,delimiter=';')
                    #write each particular text data with its class
                    writer.writerows([one_text,class_] for one_text in all_texts)
                count += len(all_texts)
            #move to the next page (None when this was the last one)
            current_link = path_dict['next']

In [23]:
#getting data: runs the full scrape and appends to data.csv (the recorded run took over 6 hours)
load_texts()

Getting texts: 100%|██████████| 9/9 [6:14:57<00:00, 2499.71s/it]  
