### All Data Scraper (BS4 and Selenium Combined)

In [11]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import pandas as pd 
import pickle
from time import time,sleep
from warnings import filterwarnings
filterwarnings("ignore")

In [40]:
class BinaAZScraper():
    """Scrape announcement links and announcement details from bina.az.

    Listing pages are fetched with ``requests`` and parsed with BeautifulSoup
    to collect announcement links. Individual announcement pages are opened
    with Selenium, because an element has to be clicked before the page
    source is captured.

    NOTE: change ``paths['driver_path']`` to the location of chromedriver on
    your machine after pulling the project.

    Attributes:
        url: Listing page where link collection starts.
        announcement_links: Every announcement link collected so far
            (may contain duplicates; ``getDataFromBina`` returns a
            de-duplicated list).
        data: Parsed announcement rows, written out by ``saveDataSet``.
    """

    headers={"User-Agent":"Chrome/100.0.4896.12"}
    # Seconds to wait for a listing page before requests gives up; without a
    # timeout a stalled connection would block the scrape forever.
    request_timeout=30
    paths={
    'driver_path':'C:/Users/Ali Kalbaliyev/Desktop/PMD/chromedriver.exe', #changeableside
    'number_xpath':'//*[@id="show-phones"]',
    'sale_xpath':'//*[@id="js-main-col"]/div[2]/div[1]/div/a[3]'
    }

    def __init__(self,url):
        """Remember the start URL and create empty result containers."""
        self.url=url
        self.announcement_links=[]
        self.data=[]

    def getDataFromBina(self,url=None):
        """Collect announcement links from the listing and all following pages.

        Pagination is followed through the ``rel="next"`` anchor until a page
        has no ``items_list`` blocks or no next link.

        Args:
            url: Listing page to start from. Defaults to ``self.url``.

        Returns:
            A de-duplicated list of absolute announcement URLs (unordered).
        """
        # The links are read from the third ``items_list`` block of each page.
        # NOTE(review): presumably the earlier blocks are promoted listings;
        # confirm against the live page layout.
        listing_block_index=2

        page_url=url or self.url
        while page_url:
            start=time()

            page=requests.get(page_url,headers=self.headers,timeout=self.request_timeout)
            print(page)
            soup=BeautifulSoup(page.text,"html.parser")
            announcements=soup.findAll('div',class_="items_list")
            if not announcements:
                break

            if len(announcements)>listing_block_index:
                block=announcements[listing_block_index]
                # De-duplicate by href within the page; anchors without an
                # href are skipped instead of raising TypeError.
                hrefs={link.get("href") for link in block.findAll("a",class_="item_link")}
                for href in hrefs:
                    if href and "items" in href:
                        self.announcement_links.append("https://bina.az"+href)
            else:
                print("Expected listing block not found on:",page_url)

            print(f'Second: {time()-start}\nTotal Links\' size:',len(self.announcement_links),"\nScraped Url:",page_url,"\n")

            next_button=soup.find('a',attrs={"rel":"next"})
            next_href=next_button.get("href") if next_button is not None else None
            # Iterate instead of recursing so long listings cannot hit the
            # recursion limit.
            page_url="https://bina.az"+next_href if next_href else None

        print("End of Scraping")
        return list(set(self.announcement_links))

    def saveLinkData(self,data,name):
        """Pickle ``data`` to ``<name>.pkl``."""
        with open(f"{name}.pkl", "wb") as pickle_file:
            pickle.dump(data, pickle_file)

    #Selenium Side
    def seleniumActivate(self,url,xpath_value):
        """Open ``url`` in Chrome, click the element at ``xpath_value`` and wait.

        The driver is stored on ``self.driver`` and returned; the caller is
        responsible for quitting it. If navigation or the click fails the
        browser is shut down before the exception propagates.
        """
        # NOTE(review): passing the driver path positionally depends on the
        # installed Selenium version; newer releases expect a Service object.
        self.driver = webdriver.Chrome(self.paths['driver_path'])
        try:
            self.driver.get(url)
            next_button = self.driver.find_element(by='xpath',value=xpath_value)
            next_button.click()
            # Give the page a moment to update after the click.
            sleep(1)
        except Exception:
            self.driver.quit()
            raise
        return self.driver

    def getSourceFromSelenium(self,url,xpath_value):
        """Return the parsed page source of ``url`` after clicking ``xpath_value``.

        The result is also stored on ``self.soup``. The browser is always shut
        down, even when parsing fails.
        """
        # Keep a local reference: a nested call made from parseData rebinds
        # self.driver.
        driver=self.seleniumActivate(url,xpath_value)
        try:
            self.soup = BeautifulSoup(driver.page_source, "html.parser")
        finally:
            driver.quit()
        return self.soup

    def parseData(self,src):
        """Extract one announcement's fields from its parsed page.

        Args:
            src: BeautifulSoup document of an announcement page.

        Returns:
            A dict holding every row of the ``parameters`` table plus price,
            description, location, seller and announcement info. Agency
            listings additionally get the agency name and the number read from
            the agency page.

        Raises:
            AttributeError: If an expected element is missing from the page.
        """
        table={}
        for row in src.find('table',class_='parameters').findAll('tr'):
            cells=row.findAll('td')
            table[cells[0].text]=cells[1].text

        # Agency
        agency_name=src.find('h1',class_='agency--title')
        if agency_name is not None:
            table['Agency Name']=agency_name.text
            agency_url="https://bina.az"+src.find('a',class_='agency--offers-count').get('href')
            # These must be reached through self; as bare names they raised
            # NameError.
            agency_page=self.getSourceFromSelenium(agency_url,self.paths['sale_xpath'])
            show_all=agency_page.find('a',class_='agencies-row--show-all js-agencies-row-show-group')
            table['Count of New buildings of Agency']=re.search(r'\d+',show_all.text).group(0)

        table['Price']=src.find('span',class_='price-val').text
        table['Currency']=src.find('span',class_='price-cur').text
        table['Unit Price']=src.find('div',class_='unit-price').text

        # Everything below is looked up inside the main content container.
        content=src.find('div',class_='item_show_content')

        table['Description']=content.find('article').text
        #Map And Locations
        table['Adress']=content.find('div',class_='map_address').text
        table['Locations']=[el.text for el in content.find('ul',class_='locations').findAll('li')]

        map_info=content.find('div',id='item_map')
        table['Latitute']=map_info.get('data-lat')
        table['Longitude']=map_info.get('data-lng')

        #Seller
        table['Seller Name']=content.find('div',class_='name').find(text=True)
        table['Ownership']=content.find('span',class_='ownership').text
        table['Phone Numbers']=[el.text for el in content.find('div',class_='js-phones').findAll('li')]

        #Announce Info
        item_info=content.find('div',class_='item_info').findAll('p')

        table['Announcement ID']=item_info[0].text
        table['Views']=item_info[1].text
        table['Update Time']=item_info[2].text
        return table

    def saveDataSet(self,name):
        """Write the collected rows in ``self.data`` to ``<name>.csv`` without an index column."""
        df=pd.DataFrame(self.data)
        df.to_csv(f'{name}.csv',index=False)

#### Create object

In [38]:
# Create the scraper; the URL passed here is the listing page that
# getDataFromBina() fetches when collecting announcement links.
binaSC=BinaAZScraper("https://bina.az/baki/alqi-satqi/menziller/yeni-tikili")

### Announcement URLs

In [None]:
# Collect the de-duplicated announcement links from the listing pages, then
# pickle them to announcement_links.pkl (saveLinkData appends the extension).
linkdata=binaSC.getDataFromBina()
binaSC.saveLinkData(linkdata,'announcement_links')

### Announcements' Data

In [None]:
# Load the announcement links saved by the link-collection cell.
# NOTE: pickle must only be used on files this notebook wrote itself;
# unpickling an untrusted file can execute arbitrary code.
with open("announcement_links.pkl",'rb') as links_file:
    urls=pickle.load(links_file)

# Open every announcement in the browser (clicking the element at
# 'number_xpath' first), parse it and collect the resulting row.
for counter,url in enumerate(urls,start=1):
    start=time()
    source=binaSC.getSourceFromSelenium(url,binaSC.paths['number_xpath'])
    binaSC.data.append(binaSC.parseData(source))
    print(f'Second: {time()-start}\nAnnouncement:{counter}\nScraped Url:{url}\n')

# Write all collected rows to binadata.csv.
binaSC.saveDataSet('binadata')