In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from lxml import html
import re
import csv
import pandas as pd

In [2]:
def simple_get(url, timeout=10):
    """
    Download `url` with an HTTP GET request and return the response body.

    Parameters
    ----------
    url : str
        Absolute URL to fetch.
    timeout : float or tuple, optional
        Passed to `requests.get` as its `timeout` argument (seconds).
        Without it a host that never answers would block the notebook
        forever. Defaults to 10.

    Returns
    -------
    bytes or None
        The raw (undecoded) response body when `is_good_response` accepts
        the response, otherwise None. None is also returned, after the
        error has been passed to `log_error`, when the request raises a
        `RequestException` (which includes timeouts).
    """
    try:
        # closing() guarantees the streamed connection is released even if
        # the response is rejected before its body has been read.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
    
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response counts as HTML when its status code is 200 and its
    Content-Type header contains the substring 'html' (case-insensitive).
    A response without a Content-Type header is rejected instead of
    raising KeyError.
    """
    # .get() yields None for a missing header; the header must be checked
    # for None *before* lower-casing it.
    content_type = resp.headers.get('Content-Type')
    return (resp.status_code == 200
            and content_type is not None
            and 'html' in content_type.lower())

def log_error(e):
    """
    Report an error by printing it to standard output.

    Acts as the single place where errors are reported, so the print can
    later be swapped for another sink without touching the callers.
    """
    # print() would apply str() implicitly; doing it here is equivalent.
    print(str(e))

In [3]:
def get_all_links(soup, homepage):
    """
    Collect the distinct link targets of all <a> elements in `soup`.

    Parameters
    ----------
    soup : object
        Parsed document exposing `find_all('a')`; each returned element
        must support `.get('href')`.
    homepage : str
        Prefix used to turn hrefs that start with '../' into absolute
        URLs: the '../' is stripped and the remainder appended to it.

    Returns
    -------
    list of str
        Unique link targets in order of first appearance. Hrefs that do
        not start with '../' are returned unchanged. Anchors without an
        href attribute are skipped.
    """
    pages = []
    for item in soup.find_all('a'):
        page = item.get('href')
        if page is None:
            # Anchors without an href (e.g. named anchors) carry no target.
            continue
        if page.startswith('../'):
            page = homepage + page[3:]
        pages.append(page)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is the same on every run (a set has no stable order).
    return list(dict.fromkeys(pages))

In [4]:
def get_url_info(url):
    """
    Fetch the page at `url` and return the links found on it.

    Parameters
    ----------
    url : str
        Host/path without a scheme (e.g. "www.example.com/"), in which
        case 'http://' is prepended, or a full http(s) URL, which is used
        as given.

    Returns
    -------
    list of str
        The result of `get_all_links` for the downloaded page, or an
        empty list when `simple_get` returned no content.
    """
    # Do not prepend a second scheme when the caller already supplied one.
    if url.startswith(('http://', 'https://')):
        homepage = url
    else:
        homepage = 'http://' + url

    raw_html = simple_get(homepage)
    if raw_html is None:
        # simple_get returns None for failed requests and non-HTML
        # responses; there is nothing to parse in either case.
        return []

    soup = BeautifulSoup(raw_html, 'html.parser')
    return get_all_links(soup, homepage)

In [5]:
# Fetch the bakery's start page and collect the links found on it.
urls = get_url_info("www.baeckerei-grobe.de/")

# Print one link per line for a quick visual check.
for url in urls:
    print(url)

https://www.umwelt.nrw.de/landwirtschaft/ernaehrungswirtschaft/meisterwerknrw-ehrenpreis-des-landes-nrw/
http://www.baeckerei-grobe.de/pages/aktuelles.php?id=141
http://www.baeckerei-grobe.de/pages/aktuelles.php?id=143
http://www.baeckerei-grobe.de/pages/produkte.php
http://www.baeckerei-grobe.de/pages/grobe.php?id=2
http://www.baeckerei-grobe.de/pages/grobe.php?id=1
http://www.baeckerei-grobe.de/pages/kontakt.php
http://www.baeckerei-grobe.de/pages/impressum.php
http://www.baeckerei-grobe.de/pages/grobe.php?id=11
http://www.baeckerei-grobe.de/pages/aktuelles.php
http://www.baeckerei-grobe.de/pages/grobe.php?id=5
http://www.baeckerei-grobe.de/pages/aktuelles.php?id=113
https://www.facebook.com/Baeckermeister-Grobe-GmbH-CoKG-231016537300128/
http://www.baeckerei-grobe.de/pages/aktuelles.php?id=132
https://www.innungsbaecker.de/baeckerfinder/baecker/c2d3a2cd-915c-4f93-89ef-ae33c91d1e04/
/
http://www.baeckerei-grobe.de/pages/grobe.php?id=3
http://www.baeckerei-grobe.de/pages/grobe.php?id=

In [6]:
def get_company_urls(csv):
    """
    Read the company domains from a semicolon-separated CSV file.

    The file is read without a header row, with latin1 encoding, and its
    columns are named 'Domain', 'Section' and 'Code'.

    Parameters
    ----------
    csv : str or path-like or None
        Path of the CSV file to read. Previously this argument was
        ignored; passing None keeps the old behaviour of reading
        '20181018 Domain+WZ2008 Hackathon INOBAS.csv'.

    Returns
    -------
    list
        The values of the 'Domain' column, in file order.
    """
    # Fall back to the originally hard-coded dataset when no path is given.
    source = csv if csv is not None else '20181018 Domain+WZ2008 Hackathon INOBAS.csv'

    data = pd.read_csv(source, sep=';', encoding='latin1',
                       names=['Domain', 'Section', 'Code'])

    # Taking the column directly avoids a slow row-by-row iterrows() loop.
    return data['Domain'].tolist()

In [7]:
def check_string_in_list(string, arr):
    """
    Return the first item of `arr` that contains `string`.

    Parameters
    ----------
    string : str
        Substring to look for.
    arr : iterable
        Candidate items, typically URLs. None entries are skipped.

    Returns
    -------
    The first matching item in the order of `arr`, or None when no item
    contains `string`.
    """
    # Iterate in the given order: going through a set would make the
    # returned match arbitrary whenever several items contain `string`.
    for item in arr:
        if item is not None and string in item:
            return item
    return None

In [8]:
def get_company_sites(str_list, pages):
    """
    Look up one page per keyword.

    For every keyword in `str_list`, `check_string_in_list` is asked for
    a page from `pages` containing that keyword. The results are returned
    as a list in the same order as `str_list`; an entry is None when the
    lookup found nothing.
    """
    # TODO (carried over from the original author): "Problem iterating 3 times"
    return [check_string_in_list(keyword, pages) for keyword in str_list]

In [9]:
# For each keyword, pick one of the collected links that contains it.
baeckerei_grobe = get_company_sites(['facebook', 'impressum'], urls)
print(baeckerei_grobe)

['https://www.facebook.com/Baeckermeister-Grobe-GmbH-CoKG-231016537300128/', 'http://www.baeckerei-grobe.de/pages/impressum.php']
