# In [None]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

def extract_text(tag):
    """Return the whitespace-stripped ``.text`` of *tag*, or ``''`` if *tag* is falsy.

    Args:
        tag: An object exposing a ``text`` string attribute, or ``None``.

    Returns:
        str: ``tag.text`` with leading/trailing whitespace removed, or an
        empty string when *tag* is ``None`` (or otherwise falsy).
    """
    if not tag:
        return ''
    raw_text = tag.text
    return raw_text.strip()

def scrape_accessories(url):
    """Collect the text of every ``div.acctitel.noLeaf`` element on a page.

    A Chrome WebDriver session is started, the page is loaded, and the
    matching elements' trimmed ``innerText`` is read via injected JavaScript.

    Args:
        url (str): The URL of the webpage to scrape.

    Returns:
        list: The scraped strings. If loading the page or running the script
        raises, the error is printed and whatever was collected so far
        (an empty list) is returned.
    """
    # JavaScript executed inside the page; returns one trimmed string per match.
    js_snippet = (
        "return [...document.querySelectorAll('div.acctitel.noLeaf')].map(e => e.innerText.trim());"
    )
    collected = []
    # Started outside the try block, so a failure to launch Chrome propagates.
    browser = webdriver.Chrome()

    try:
        browser.get(url)
        found = browser.execute_script(js_snippet)
        collected.extend(found)
    except Exception as err:
        message = str(err)
        print(f"Error encountered during scraping: {message}")
    finally:
        # Always shut the browser down, even when scraping failed.
        browser.quit()

    return collected

def scrape_child_urls(parent_url, timeout=30):
    """Collect the exhibitor detail-page URLs linked from one listing page.

    Args:
        parent_url (str): URL of the listing page to fetch.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        list[str]: One absolute URL for each ``div`` with class
        ``col col1ergebnis`` that contains an ``a`` tag carrying an ``href``.
        An empty list is returned (after printing a message) when the request
        fails or the response status is not 200.
    """
    base_url = 'https://www.imm-cologne.com'  # Base URL of the website

    try:
        # Without a timeout a single stalled connection blocks the crawl forever.
        response = requests.get(parent_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the webpage. Error: {exc}")
        return []

    if response.status_code != 200:
        print("Failed to retrieve the webpage. Status code:", response.status_code)
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    child_urls = []
    for div in soup.find_all('div', class_='col col1ergebnis'):
        anchor = div.find('a')
        # Skip result boxes without a link, and anchors lacking an href.
        href = anchor.get('href') if anchor is not None else None
        if href:
            # urljoin prefixes site-relative paths with base_url and leaves
            # hrefs that are already absolute untouched.
            child_urls.append(urljoin(base_url, href))

    return child_urls


def _child_text(soup, css_class, child_name):
    """Return the stripped text of the first *child_name* tag inside the first
    ``div`` whose class is *css_class*, or ``''`` if either is missing."""
    container = soup.find('div', class_=css_class)
    if container is None:
        return ''
    return extract_text(container.find(child_name))


def scrape_data_from_child_url(child_url, timeout=30):
    """Scrape one exhibitor detail page into a dict suitable for ``pd.DataFrame``.

    Args:
        child_url (str): URL of the exhibitor detail page.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict: Empty when the request fails or the status is not 200.
        Otherwise it has the keys ``'Exhibitor Name'``, ``'Exhibitor Address'``,
        ``'Exhibitor Booth'``, ``'Exhibitor Website'``, ``'Exhibitor Email'``,
        ``'Exhibitor Phone'``, ``'Product Groups'`` and ``'Targeted Countries'``
        (each a one-element list) plus ``'Location'`` (a plain string: the last
        whitespace-separated word of the address). Fields that are missing on
        the page are empty strings.
    """
    try:
        # Without a timeout a single stalled connection blocks the crawl forever.
        response = requests.get(child_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the webpage. Error: {exc}")
        return {}

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return {}

    soup = BeautifulSoup(response.content, 'html.parser')

    company_name = _child_text(soup, 'headline-title', 'span')

    # Address: all text fragments of the location box joined with ", ".
    company_address_div = soup.find('div', class_='location-info')
    if company_address_div is not None:
        company_address = ', '.join(company_address_div.stripped_strings)
    else:
        company_address = ''
    words = company_address.split()
    location = words[-1] if words else ''

    # Booth text contains layout newlines/tabs that are removed entirely.
    company_booth = (
        _child_text(soup, 'asdb54-rawTextHallenStand', 'div')
        .replace('\n', '')
        .replace('\t', '')
    )

    company_website = _child_text(soup, 'sico ico_link linkellipsis', 'span')
    company_email = _child_text(soup, 'sico ico_email', 'span')
    # The phone number is read from the div itself, not from a nested span.
    company_phone = extract_text(soup.find('div', class_='sico ico_phone'))

    # Product groups are read through a separate Selenium session.
    scraped_accessories = scrape_accessories(child_url)

    targeted_countries = [
        extract_text(div)
        for div in soup.find_all('div', class_='asdb54-singleInfo-gruppierung')
    ]
    # NOTE(review): the first two and the last entry are dropped; presumably
    # they are not countries - confirm against the page markup.
    targeted_countries_filtered = targeted_countries[2:-1]

    return {
        'Exhibitor Name': [company_name],
        'Exhibitor Address': [company_address],
        'Exhibitor Booth': [company_booth],
        'Exhibitor Website': [company_website],
        'Exhibitor Email': [company_email],
        'Exhibitor Phone': [company_phone],
        'Product Groups': [scraped_accessories],
        'Targeted Countries': [targeted_countries_filtered],
        # Kept as a scalar (pandas broadcasts it) so the returned shape is
        # unchanged for existing callers.
        'Location': location,
    }

def main(*, num_pages=37, page_size=20):
    """Crawl the exhibitor listing and return all exhibitor details as a DataFrame.

    Every listing page is fetched to collect detail-page URLs, then each
    detail page is scraped into one row.

    Args:
        num_pages (int): Number of listing pages to walk through.
        page_size (int): Step of the ``start`` offset between listing pages.

    Returns:
        pandas.DataFrame: One row per successfully scraped exhibitor. An empty
        DataFrame is returned when nothing could be scraped.
    """
    parent_url = 'https://www.imm-cologne.com/imm-cologne-exhibitors/list-of-exhibitors/'
    child_urls = []

    for page_index in range(num_pages):
        # Listing pages are addressed by result offset: 0, page_size, 2*page_size, ...
        start_value = page_index * page_size
        page_url = f"{parent_url}?route=aussteller/blaettern&&start={start_value}&paginatevalues=%7B%22stichwort%22%3A%22%22%2C%22suchart%22%3A%22alle%22%7D"
        child_urls.extend(scrape_child_urls(page_url))

    frames = []
    for child_url in child_urls:
        data = scrape_data_from_child_url(child_url)
        # A failed scrape yields an empty dict; skip it rather than adding an
        # empty frame to the concatenation.
        if data:
            frames.append(pd.DataFrame(data))

    # pd.concat raises ValueError on an empty list, so handle that explicitly.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Example usage: calling main() here runs the whole crawl as soon as this
# code executes; the DataFrame it returns is kept in `result_df`.
result_df = main()
# Uncomment the next line to display the scraped table.
#print(result_df)
