In [31]:
!pip install psycopg2-binary



In [32]:
import os
from urllib.parse import quote

import pandas as pd
import psycopg2
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

In [33]:
def grab_meshok_ru(start_url):
    """Collect picture records from every page of a meshok listing.

    Pages are requested one after another, starting with page 0, using the URL
    built by ``encode_page_number_meshok_ru``. Crawling stops when a response
    is not HTTP 200, when ``parse_page`` finds nothing on a page, or when a
    page contributes no record that was not already collected.

    Args:
        start_url: Listing URL to which the page offset is appended.

    Returns:
        set: Union of the records returned by ``parse_page`` for each page.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    extracted_pictures = set()
    page_number = 0
    while True:
        page_url = encode_page_number_meshok_ru(start_url, page_number)
        # requests applies no timeout by default; without one a stalled
        # connection would block this loop indefinitely.
        resp = requests.get(page_url, timeout=30)
        if resp.status_code != 200:
            break

        page_pictures = parse_page(BeautifulSoup(resp.content, 'html.parser'))
        # An empty page ends the crawl. So does a page that adds nothing new:
        # if the server keeps answering out-of-range offsets with an
        # already-seen page, the loop would otherwise never terminate.
        if not page_pictures or extracted_pictures.issuperset(page_pictures):
            break

        extracted_pictures.update(page_pictures)
        page_number += 1

    return extracted_pictures

def encode_page_number_meshok_ru(start_url, page_number, page_size=200):
    """Return ``start_url`` with a ``pN`` offset parameter for the given page.

    Args:
        start_url: Listing URL, with or without an existing query string.
        page_number: Zero-based page index.
        page_size: Multiplier applied to ``page_number`` to obtain the ``pN``
            value. Defaults to 200 (presumably the ``pp=`` page size used in
            the listing URL -- confirm if that parameter is changed).

    Returns:
        str: ``start_url`` followed by ``pN=<page_number * page_size>``.
    """
    # Continue an existing query string with '&', otherwise start one with '?'.
    separator = "&" if "?" in start_url else "?"
    return f"{start_url}{separator}pN={page_number * page_size}"

def parse_page(soup):
    """Extract (auction_url, picture_url, description) records from a page.

    Every ``<a>`` whose ``class`` contains ``clk`` is treated as one entry:

    * auction URL  -- the anchor's ``href`` up to (not including) its first
      ``_``; the whole ``href`` when it contains no underscore.
    * picture URL  -- the ``data-src`` attribute of the first node that has
      one, inside the anchor child whose ``class`` contains ``lId``.
    * description  -- the first child node of the anchor child whose ``class``
      contains ``liN``, converted to a plain ``str``.

    Children are scanned in document order and the scan stops at the
    description block. Entries lacking any of the three values are skipped.

    Args:
        soup: Parsed document exposing ``find_all('a')`` whose results expose
            ``attrs`` and ``children`` (e.g. a ``BeautifulSoup`` object).

    Returns:
        set[tuple[str, str, str]]: The unique records found on the page.
    """
    extracted_pictures = set()

    for anchor in soup.find_all('a'):
        anchor_attrs = getattr(anchor, 'attrs', None) or {}
        if 'clk' not in anchor_attrs.get('class', ()):
            continue

        # partition() yields the whole href when there is no underscore
        # (str.index() would raise ValueError); a missing href becomes ''.
        auction_url = anchor_attrs.get('href', '').partition('_')[0]

        picture_url = None
        picture_desc = None
        for child in anchor.children:
            # Text nodes between tags carry no attributes; treat as classless.
            child_classes = (getattr(child, 'attrs', None) or {}).get('class', ())

            if 'lId' in child_classes:
                for grandchild in child.children:
                    grandchild_attrs = getattr(grandchild, 'attrs', None) or {}
                    if 'data-src' in grandchild_attrs:
                        picture_url = grandchild_attrs['data-src']
                        break

            if 'liN' in child_classes:
                first_node = next(iter(child.children), None)
                if first_node is not None:
                    # A plain str does not keep a reference to the parse tree
                    # the way a bs4 NavigableString does.
                    picture_desc = str(first_node)
                # Nothing after the description block is inspected.
                break

        if auction_url and picture_url and picture_desc:
            extracted_pictures.add((auction_url, picture_url, picture_desc))

    return extracted_pictures


In [34]:
URL = "https://meshok.net/listing?good=760&pp=200&sort=end_date&user=589114&way=asc"
extracted_pictures = grab_meshok_ru(URL)
print(f"{len(extracted_pictures)} pictures extracted from the web-site")

# A set has no stable iteration order, so sort the records first: the row
# numbering written to the CSV is then reproducible from run to run.
# Note: the "id" column holds the auction URL part of each record.
grabbed_imgs_df = pd.DataFrame(
    data=sorted(extracted_pictures),
    columns=["id", "picture_url", "description"],
)
# Written without a header row, with the row index as the first column.
grabbed_imgs_df.to_csv(
    '~/work/data/grabbed_images.csv',
    header=False, index=True
)

934 pictures extracted from the web-site


In [37]:
# Connection settings are read from the environment so that no credentials
# are stored in the notebook itself.
DB_IP = os.environ.get('DB_IP')
DB_PORT = os.environ.get('DB_PORT')
DB_USER = os.environ.get('DB_USER')
DB_PWD = os.environ.get('DB_PWD')
DB_DBNAME = os.environ.get('DB_DBNAME')

# Percent-encode user and password: characters such as '@', ':' or '/' would
# otherwise be read as URL delimiters. str() keeps unset variables rendering
# as 'None', exactly as plain f-string interpolation would.
db_user_quoted = quote(str(DB_USER), safe='')
db_pwd_quoted = quote(str(DB_PWD), safe='')
conn_string = f"postgresql://{db_user_quoted}:{db_pwd_quoted}@{DB_IP}:{DB_PORT}/{DB_DBNAME}"
db_table_name = 'post_cards'

engine = create_engine(conn_string)
# if_exists='replace' drops and recreates the table on every run.
# The table name comes from db_table_name so the write and any later read
# that uses the same variable cannot drift apart.
number_appended_records = (
    grabbed_imgs_df.loc[:, ["picture_url", "description"]]
    .set_index("picture_url")
    .to_sql(
        db_table_name,
        engine,
        if_exists='replace',
        index_label="picture_url",
    )
)

print(f"{number_appended_records} records saved to database")

934 records saved to database


In [38]:
# Let's look at the data that was written.
# Keyword arguments avoid hand-building a DSN string, which breaks when a
# value (e.g. the password) contains spaces or quotes.
conn = psycopg2.connect(
    host=DB_IP,
    port=DB_PORT,
    dbname=DB_DBNAME,
    user=DB_USER,
    password=DB_PWD,
)
try:
    # The cursor context manager closes the cursor; the connection itself is
    # closed in the finally clause even if the query or printing fails.
    with conn.cursor() as cur:
        sel_query_sql = f'Select  *  from {db_table_name}'
        cur.execute(sel_query_sql)
        # Print at most 1000 rows.
        for record in cur.fetchmany(1000):
            print(record)
finally:
    conn.close()


('https://b.itemimg.com/i/289543135.0.jpg', 'Фотооткрытка. Русская авиация «Коломяги, ипподром. 1909 г. Альбер Гюйо на моноплане ‘’Блерио’’». RR.')
('https://b.itemimg.com/i/289361195.0.jpg', 'Открытка «Нерчинская каторга. Тюрьма на Карийских золотых промыслах». № 2. Изд. А. К. Кузнецова.')
('https://b.itemimg.com/i/289473252.0.jpg', 'Открытка «Ляоян». Изд. А. П. Самарина, Харбин, Манджурия.')
('https://b.itemimg.com/i/290854777.0.jpg', 'Фотооткрытка «Князь Майсурадзе. Борец».')
('https://b.itemimg.com/i/289862060.0.jpg', 'Открытка. Худ. Соломко С. «Трудный ответ». № 1634. Изд.  И. Лапин, Париж. Золотой обрез.')
('https://b.itemimg.com/i/289934341.0.jpg', 'Открытка «Москва Храм Христа Спасителя». № 125. Изд. П. Фон - Гиргенсон.')
('https://b.itemimg.com/i/290345293.0.jpg', 'Фотооткрытка «Ленинград. Нева и Исакиевский собор».')
('https://b.itemimg.com/i/289737424.0.jpg', 'Открытка «Данков. Базарная площадь и присутственные места». Липецкая губерния. Изд. Д. А. Иванова.')
('https://b.ite