In [None]:
# Libraries
from bs4 import BeautifulSoup
import bs4 as bs4
import urllib.parse
from urllib.parse import urlparse
import requests
import pandas as pd
import re
import en_core_web_sm
import time
from concurrent.futures import ThreadPoolExecutor
import math

# Load the English spaCy pipeline once, up front; clean_text() calls this shared object for every document.
nlp = en_core_web_sm.load()

## 1. Website Scraping

In [None]:
class ScrapTool:
    '''
    Download a web page and reduce it to the text used for further processing.

    visit_url() is the entry point; each get_* helper extracts one kind of
    text from an already parsed BeautifulSoup document (or from the URL).
    '''

    def visit_url(self, website_url):
        '''
        Visit URL. Download the content. Initialize the BeautifulSoup object. Call parsing methods.

        Returns a pandas Series with the keys "website_url", "website_name" and
        "website_text", or None when the request fails (the error is printed).
        Exceptions that are not requests.RequestException propagate to the caller.
        '''
        try:
            # Set a reasonable timeout for the request
            response = requests.get(website_url, timeout=10)
            response.raise_for_status()  # Raise an HTTPError for bad responses

            soup = BeautifulSoup(response.content, "lxml")

            text_parts = [
                self.get_html_title_tag(soup),
                self.get_html_meta_tags(soup),
                self.get_html_heading_tags(soup),
                self.get_text_content(soup),
            ]

            result = {
                "website_url": website_url,
                "website_name": self.get_website_name(website_url),
                # Join with a space: plain concatenation glued the last word of
                # one part to the first word of the next one.
                "website_text": " ".join(part for part in text_parts if part),
            }

            return pd.Series(result)

        except requests.RequestException as e:
            print(f"Error accessing {website_url}: {e}")
            return None

    def get_website_name(self, website_url):
        '''
        Example: returns "google" from "http://www.google.com"

        The name is the second-to-last dot-separated label of the URL's network
        location, so "http://bbc.co.uk" yields "co". A network location with a
        single label (e.g. "localhost") is returned as is, and a URL without a
        network location (for instance one lacking "http://") yields "".
        '''
        labels = urlparse(website_url).netloc.split(".")
        # A single-label host has no [-2] element; indexing it raised IndexError.
        return labels[-2] if len(labels) >= 2 else labels[0]

    def get_html_title_tag(self, soup):
        '''Return the text content of <title> tag from a webpage ("" when there is no <title>).'''
        title_tag = soup.title
        if not title_tag:
            return ""
        # .strings yields only text nodes; joining .contents fails with a
        # TypeError as soon as the title holds a nested tag.
        return '. '.join(title_tag.strings)

    def get_html_meta_tags(self, soup):
        '''Returns the text content of <meta> tags related to keywords and description from a webpage'''
        def is_named_meta(tag):
            return tag.name == "meta" and tag.has_attr('name') and tag.has_attr('content')

        tags = soup.find_all(is_named_meta)
        content = [str(tag["content"]) for tag in tags if tag["name"] in ('keywords', 'description')]
        return ' '.join(content)

    def get_html_heading_tags(self, soup):
        '''Returns the text content of heading tags. The assumption is that headings might contain relatively important text.'''
        tags = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
        content = [" ".join(tag.stripped_strings) for tag in tags]
        return ' '.join(content)

    def get_text_content(self, soup):
        '''
        Returns the text content of the whole page with some exception to tags. See tags_to_ignore.

        Headings, the title and meta tags are skipped here because the other
        get_* helpers collect them. HTML comments, purely numeric strings and
        whitespace-only strings are dropped as well.
        '''
        tags_to_ignore = ['style', 'script', 'head', 'title', 'meta', '[document]',
                          "h1", "h2", "h3", "h4", "h5", "h6", "noscript"]
        result = []
        for text_node in soup.find_all(string=True):
            if isinstance(text_node, bs4.element.Comment):
                continue
            if text_node.parent.name in tags_to_ignore:
                continue
            stripped = text_node.strip()
            if stripped and not stripped.isnumeric():
                result.append(stripped)
        return ' '.join(result)


def clean_text(doc):
    '''
    Normalise a document into a space-separated string of lowercase lemmas.

    Tokens are dropped when they are stop words, punctuation, numeric,
    not purely alphanumeric, or listed in the exclusion list ("nan").
    Every remaining token contributes its lemma, lowercased and stripped.
    '''
    excluded_words = ["nan"]

    def should_drop(tok):
        return (
            tok.is_stop
            or tok.is_punct
            or tok.text.isnumeric()
            or not tok.text.isalnum()
            or tok.text in excluded_words
        )

    lemmas = [str(tok.lemma_.lower().strip()) for tok in nlp(doc) if not should_drop(tok)]
    return " ".join(lemmas)

In [None]:
# Function to perform web scraping
def scrape_website(website):
    '''
    Download `website` and return its cleaned text.

    Returns None when the page could not be fetched or when scraping or
    cleaning raises; such unexpected errors are printed instead of propagated.
    '''
    scraper = ScrapTool()

    try:
        page = scraper.visit_url(website)
        if page is None:
            return None
        return clean_text(dict(page)['website_text'])
    except Exception as e:
        print(f"An unexpected error occurred for {website}: {e}")
        return None

# Function to add 'http://' to the website URLs
def add_http(url):
    '''
    Return `url` with 'http://' prepended unless it already starts with
    'http://' or 'https://'.

    Leading and trailing whitespace is removed first. The scheme check ignores
    case, so 'HTTPS://example.com' is returned unchanged instead of being
    turned into 'http://HTTPS://example.com'.
    '''
    url = url.strip()
    if url.lower().startswith(('http://', 'https://')):
        return url
    return 'http://' + url

## 2. Loading Dataset

In [None]:
# Read the URL list from sheet 'Sheet2' of the workbook.
# NOTE(review): "file_path.xlsx" is presumably a placeholder — point it at the real file.
df = pd.read_excel("file_path.xlsx", sheet_name='Sheet2')

In [None]:
# Drop every row that contains a missing value; the index is not reset, so it can have gaps afterwards.
df = df.dropna()

In [None]:
# Preview the loaded URLs
df

Unnamed: 0,URL
0,fh-mittelstand.de
1,cpmc.frankfurt-school.de
2,hwg-lu.de
3,hs-mittweida.de
4,rwth-aachen.de
...,...
1955,maxfunsports.com
1956,partner.fr.de
1957,hausundgarten-profi.de
1958,firmen.faz.net


In [None]:
# Add 'http://' to the entries of the 'URL' column that do not start with http:// or https://
df['URL'] = df['URL'].apply(add_http)

In [None]:
# Apply the web scraping function and create a new 'Web_Text' column.
# The addresses of this frame live in the 'URL' column; it has no 'Website' column.
df['Web_Text'] = df['URL'].apply(scrape_website)

## 3. Scraping with Multi-threading

In [None]:
# Assuming your original DataFrame is named df
# NOTE(review): this cell reads the 'Domains' column, while the frame displayed
# in section 2 has a 'URL' column — confirm which dataset/column is intended.

# Step 1: Split DataFrame into at most `num_splits` smaller DataFrames
num_splits = 6
# max(1, ...) keeps the step non-zero for an empty frame; range() rejects a step of 0
chunk_size = max(1, math.ceil(len(df) / num_splits))

# .copy() gives each worker its own frame, so adding a column below does not
# write into a slice of df (which triggers pandas' SettingWithCopyWarning)
dfs = [df.iloc[i:i + chunk_size].copy() for i in range(0, len(df), chunk_size)]


# Step 2: Define a helper function to apply scrape_website to each row of one chunk
def apply_scrape_website(sub_df):
    '''Add a 'Web_Text' column with scrape_website(url) for every entry of sub_df['Domains'] and return sub_df.'''
    sub_df['Web_Text'] = sub_df['Domains'].apply(scrape_website)
    return sub_df


# Step 3: Apply the scrape_website function to each DataFrame using ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=num_splits) as executor:
    # Submit tasks for each smaller DataFrame
    futures = [executor.submit(apply_scrape_website, sub_df) for sub_df in dfs]

    # Wait for all tasks to complete (results keep the order of `dfs`)
    results = [future.result() for future in futures]

# Step 4: Combine the results into a single DataFrame
if results:
    final_df = pd.concat(results, ignore_index=True)
else:
    # Nothing to scrape: keep the columns and add an empty 'Web_Text' column
    final_df = df.reset_index(drop=True).assign(Web_Text=None)

# Now final_df contains the original DataFrame with 'Web_Text' column populated by scrape_website function


In [None]:
# Preview the combined scraping result
final_df

Unnamed: 0,Domains,Web_Text
0,http://news.yahoo.com,yahoo news latest breaking news headlines live...
1,http://internationalinvestment.net,computing uk lead source analysis business tec...
2,http://cysec.gov.cy,επιτροπή κεφαλαιαγοράς κύπρου η αποστολή μας ε...
3,http://find-and-update.company-information.ser...,find update company information company inform...
4,http://laverdadpa.com,la verdad panamá es una página de noticiassala...
...,...,...
13384,http://mdzol.com,mdz onlineel diario más importante de la argen...
13385,http://tycsports.com,tyc sports las noticia de deportes del canal e...
13386,http://a24films.com,a24the company talk uncut gems midsommar lady ...
13387,http://losandesri.com,peruvian bolivian cuisine los andes restaurant...


In [None]:
# Count the rows without scraped text (scrape_website returns None when a page could not be fetched or processed)
final_df['Web_Text'].isna().sum()

2987

In [None]:
def extract_domain(url):
    '''
    Return the network location (netloc) of `url`.

    Works for URLs with a scheme ("http://news.yahoo.com/x" -> "news.yahoo.com")
    and for bare addresses without one ("news.yahoo.com/x" -> "news.yahoo.com").
    Returns "" when no network location can be found.
    '''
    parsed_url = urllib.parse.urlparse(url)
    if parsed_url.netloc or parsed_url.scheme:
        return parsed_url.netloc
    # Without a scheme urlparse() leaves .netloc empty and puts the host into
    # .path; a leading "//" makes it read the same text as a network location.
    return urllib.parse.urlparse('//' + url).netloc

# Replace the full URLs in 'Domains' with their bare network locations.
# Assign by position (.to_numpy()): final_df was rebuilt with a fresh 0..n-1
# index, while df may still carry the index gaps left by dropna(), so a
# label-aligned assignment could pair rows with the wrong domain or insert NaN.
final_df['Domains'] = df['Domains'].apply(extract_domain).to_numpy()

final_df.to_excel("Domains_Text.xlsx", index=False)

In [None]:
# Also save the result as CSV, without writing the index column
final_df.to_csv("Domains_Text.csv", index=False)