In [None]:
import json
import re
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from fake_useragent import UserAgent
from tqdm.auto import tqdm


# Чтение файла со ссылками на объявления

In [None]:
import glob
import os

# Collect the regular files under the URL directory, ordered from the
# oldest to the most recently modified one.
search_dir = "urls/"
files = sorted(
    (path for path in glob.glob(search_dir + "*") if os.path.isfile(path)),
    key=os.path.getmtime,
)
files

In [None]:
# Load the newest URL file: `files` is sorted by modification time, so the
# last entry is the most recently written one.
if not files:
    # Fail with an actionable message instead of a bare IndexError on files[-1].
    raise FileNotFoundError(f"No URL files found in {search_dir!r}")
all_url = pd.read_parquet(files[-1])
all_url

# Запрос по ссылке и преобразование ответа в объект BeautifulSoup

In [None]:
def get_page(url, timeout=30):
    """Download a page and parse it with BeautifulSoup.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scraping run.

    Returns:
        A ``(page, soup_one)`` tuple: the raw ``requests`` response and the
        ``html.parser`` soup built from ``page.text``.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    ua = UserAgent()
    # Send a Safari User-Agent string supplied by fake_useragent.
    header = {'User-Agent': str(ua.safari)}
    page = requests.get(url, headers=header, timeout=timeout)
    soup_one = BS(page.text, "html.parser")
    return page, soup_one

# Получение фото из полного текста страницы с помощью регулярного выражения

In [None]:
def get_photos(page):
    """Extract the listing's photo URLs from the raw page text.

    The page text is searched for the first ``photos`` key followed by an
    array literal; every item of that array is expected to carry a ``url``
    field, which is appended to the image host prefix.

    Args:
        page: Response-like object exposing the page source as ``page.text``.

    Returns:
        A one-row ``pandas.DataFrame`` with columns ``img_url_0``,
        ``img_url_1``, ... holding the full image URLs.

    Raises:
        IndexError: if no ``photos`` array is present in the page text
            (same exception type the previous ``findall(...)[0]`` produced).
        json.JSONDecodeError: if the captured fragment is not valid JSON.
    """
    reg = r'photos..(\[.+?\])'
    match = re.search(reg, page.text)
    if match is None:
        raise IndexError('no "photos" array found in page text')

    # Parse the fragment as JSON data. The previous implementation eval()-ed
    # text downloaded from the network, which executes arbitrary code, breaks
    # on `null`, and its true/false replacement could corrupt URL strings.
    list_url_img = json.loads(match.group(1))

    list_img = ['https://img.dmclk.ru/s1200x800q80' + photo['url'] for photo in list_url_img]
    # Transpose so that each photo becomes a column of a single row.
    df_urls_imgs = pd.DataFrame(list_img).T
    df_urls_imgs.columns = [f'img_url_{i}' for i in df_urls_imgs.columns]
    return df_urls_imgs

# Получение основной информации из объявления


In [None]:
def _node_text(node):
    """Return ``node.text`` when the lookup found a node, else the falsy lookup result."""
    return node.text if node else node


def _to_number(raw, unit, cast):
    """Strip non-breaking spaces, the unit suffix and plain spaces from ``raw``, then cast it."""
    return cast(raw.replace('\xa0', '').replace(unit, '').replace(' ', ''))


def get_main_info(page, soup_one, url=None):
    """Collect the main fields of a listing into a flat dictionary.

    Args:
        page: Response-like object; ``page.text`` is searched for coordinates.
        soup_one: Parsed page supporting ``find`` / ``find_all`` lookups.
        url: Listing URL to store in the record. When omitted, the
            module-level ``url`` set by the scraping loop is used (the legacy
            behaviour of this function), falling back to ``page.url``.

    Returns:
        dict with the keys ``url``, ``title``, ``price``, ``m2_price``,
        ``agent``, ``name``, ``count_ord``, ``period``, ``confirmed_sber``,
        ``posted``, ``address``, ``description``, ``metro_name_dist``,
        ``coordinate``, ``unpublished`` and ``square``. Optional elements
        that are missing from the page are stored as ``None``.
        ``metro_name_dist`` and ``coordinate`` are dicts wrapped in a
        one-element list so that each one fits into a single cell of a
        one-row DataFrame.

    Raises:
        AttributeError: if a mandatory element (title, price, price per m2,
            area, sidebar, posting date, address, description) is missing.
        ValueError: if a numeric field cannot be converted.
    """
    if url is None:
        # Historically this function read the module-level `url` directly;
        # keep that working for callers that do not pass it explicitly.
        url = globals().get('url')
        if url is None:
            url = getattr(page, 'url', None)

    title = soup_one.find('h1', {'id': 'title'}).text
    price = _to_number(soup_one.find('div', {'class': 'fsnok'}).text, '₽', int)
    m2_price = _to_number(soup_one.find('div', {'class': 'dhZVF'}).text, '₽/м²', int)
    square = _to_number(soup_one.find('div', {'class': 'SHrv3'}).text, 'м²', float)

    side_bar = soup_one.find('aside', {'class': 'BstLn'})

    # Agency name (optional).
    agent = _node_text(side_bar.find('div', {'class': 'kkC9a'}))

    # Seller name: taken from the sidebar, otherwise from a page-level block.
    name_node = side_bar.find('div', {'class': 'aO9sI'})
    if name_node:
        name = name_node.text
    else:
        name = soup_one.find('div', {'class': 'ZZ8U1'}).text

    # Number of listings shown in the sidebar (optional).
    count_ord = _node_text(side_bar.find('span', {'class': 'w6bQG'}))

    period = _node_text(side_bar.find('div', {'class': 'mGakq s468K'}))

    # 1 when the icon inside the block is styled with the green colour,
    # -1 for any other style, None when the block is absent.
    confirmed_sber = side_bar.find('div', {'class': 'zrsCb'})
    if confirmed_sber:
        icon_style = confirmed_sber.find('div', {'class': 'icon-root-4-1-2'})['style']
        confirmed_sber = 1 if icon_style == 'color:#53B374' else -1

    posted = soup_one.find('div', {'class': 'hfJ0+'}).text
    address = soup_one.find('span', {'class': 'S8rOu'}).text

    # Map each metro station name to its "time on foot" text.
    transport_points = soup_one.find('div', {'data-e2e-id': 'transport-points'})
    metro_name_dist = {}
    if transport_points:
        metro_name = [m.text for m in transport_points.find_all('a', {'class': "qWfOS JWz6L"})]
        metro_dist = [m.text for m in transport_points.find_all('span', {'data-e2e-id': "time-on-foot"})]
        metro_name_dist = dict(zip(metro_name, metro_dist))

    # Take the FIRST match of each pattern. Concatenating all matches of both
    # patterns (as was done before) stored the second longitude as the
    # latitude whenever the page contained more than one longitude.
    # NOTE(review): assumes the first occurrences belong to the listing itself.
    coordinate = {}
    for key, pattern in (('longitude', r'"longitude":(.+?)}'),
                         ('latitude', r'"latitude":(.+?),')):
        found = re.search(pattern, page.text)
        if found:
            coordinate[key] = found.group(1)

    description = soup_one.find('div', {'id': 'description'}).text

    # NOTE: rating ("score") and price-history extraction were disabled in
    # this function and are not collected.

    unpublished = _node_text(soup_one.find('div', {'data-e2e-id': 'offer-unpublished'}))

    all_info = {'url': url,
                'title': title,
                'price': price,
                'm2_price': m2_price,
                'agent': agent,
                'name': name,
                'count_ord': count_ord,
                'period': period,
                'confirmed_sber': confirmed_sber,
                'posted': posted,
                'address': address,
                'description': description,
                'metro_name_dist': [metro_name_dist],
                'coordinate': [coordinate],
                'unpublished': unpublished,
                # Parsed area; previously computed but dropped. Appended last
                # so the position of the existing keys is unchanged.
                'square': square,
                }
    return all_info

In [None]:
def get_house_info(page, soup_one):
    """Collect the label/value pairs of the listing's characteristics list.

    Every ``<li class="ekQt7">`` element contributes one entry: the text of
    its first ``<span>`` (non-breaking spaces removed) is the key and the
    text of its last ``<span>`` (non-breaking spaces turned into regular
    spaces) is the value.

    Args:
        page: Unused; kept so the signature matches the other extractors.
        soup_one: Parsed page supporting ``find_all`` lookups.

    Returns:
        dict mapping characteristic labels to their values. Items without
        any ``<span>`` are skipped.
    """
    dict_info_dom = {}
    # `find_all` is the current BeautifulSoup name; `findAll` is a legacy alias.
    for elem in soup_one.find_all('li', {'class': 'ekQt7'}):
        spans = elem.find_all('span')
        if not spans:
            # Nothing to read from this item; skip it instead of raising IndexError.
            continue
        label = spans[0].text.replace('\xa0', '')
        value = spans[-1].text.replace('\xa0', ' ')
        dict_info_dom[label] = value
    return dict_info_dom

In [None]:
import linecache
import sys
def PrintException():
    """Print a one-line summary of the exception currently being handled.

    Must be called from inside an ``except`` block. The reported line is the
    one referenced by the first traceback entry from ``sys.exc_info()`` (the
    frame in which the exception is being handled), together with that
    line's source text and the exception message.
    """
    _, error, trace = sys.exc_info()
    frame = trace.tb_frame
    line_number = trace.tb_lineno
    source_file = frame.f_code.co_filename
    # Refresh the cache first so that edited sources are re-read.
    linecache.checkcache(source_file)
    source_line = linecache.getline(source_file, line_number, frame.f_globals).strip()
    print('EXCEPTION IN (LINE {} "{}"): {}'.format(line_number, source_line, error))
# Smoke test: trigger a ZeroDivisionError to show the PrintException output.
try:
    print(1/0)
except Exception:
    # `except Exception` (not a bare `except:`) so that KeyboardInterrupt
    # and SystemExit are never swallowed here.
    PrintException()

In [None]:
# Accumulator for all scraped listings; the scraping loop extends it.
df_all = pd.DataFrame()

In [None]:
# Make sure the output directory for the parquet files exists.
# exist_ok=True replaces the separate existence check and cannot fail if the
# directory appears between the check and the creation.
os.makedirs('data', exist_ok=True)

In [None]:
# Scrape every listing URL, retrying failed attempts, and periodically
# checkpoint the collected rows to parquet.
MAX_ATTEMPTS = 10      # tries per listing before giving up on it
SAVE_EVERY = 1000      # write a checkpoint after this many scraped listings
RETRY_DELAY_SEC = 1    # pause between failed attempts to avoid hammering the server

count_url = 0          # number of successfully scraped listings
failed_urls = []       # listings that still failed after MAX_ATTEMPTS tries
# Rows are gathered in a list and concatenated only at checkpoints; growing
# the DataFrame with pd.concat on every row copies all previous rows each time.
# If the loop is interrupted, everything scraped so far is in `scraped_frames`.
scraped_frames = [df_all] if len(df_all) else []

for _, row in tqdm(all_url.iterrows(), total=len(all_url)):
    # Assigned before the `try` so the error report below always shows the
    # current URL. It must stay a module-level name: get_main_info(page,
    # soup_one) picks the listing URL up from it.
    url = row.urls
    for count_err in range(1, MAX_ATTEMPTS + 1):
        try:
            page, soup_one = get_page(url)

            df_urls_imgs = get_photos(page)

            all_info = get_main_info(page, soup_one)
            all_info.update(get_house_info(page, soup_one))

            df_add = pd.concat([pd.DataFrame(all_info, index=[0]), df_urls_imgs], axis=1)
        except Exception:
            print('-'*10, 'error', '-'*10)
            print('url: ', url)
            PrintException()
            print('count_err:', count_err)
            print(datetime.now())
            print('-'*25)
            if count_err < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_SEC)
        else:
            scraped_frames.append(df_add)
            # Count first, then test: checkpoints are written after 1000,
            # 2000, ... listings (previously the first one was written after
            # a single listing because the test ran before the increment).
            count_url += 1
            if count_url % SAVE_EVERY == 0:
                # Saving happens outside the `try`, so a failed write is not
                # mistaken for a scraping error that re-scrapes the listing.
                df_all = pd.concat(scraped_frames, ignore_index=True)
                df_all.to_parquet(f'data/all_part_{count_url}_{datetime.now().date()}.parquet')
            break
    else:
        # No attempt succeeded; remember the URL so it can be retried later.
        failed_urls.append(url)

if scraped_frames:
    df_all = pd.concat(scraped_frames, ignore_index=True)
df_all.to_parquet(f'data/all_part_{count_url}_{datetime.now().date()}.parquet')

In [None]:
# Display the collected listings.
df_all