# Script to scrape websites, given an Excel workbook (.xlsx) with URLs

## Special requirements for the environment

# Install requirements via pip etc.

In [None]:
run_in = 'local'
#run_in = 'colab'

if run_in == 'local':
    import pandas as pd
    import os
    import bs4 as BeautifulSoup
    import requests
    import numpy as np
    from urllib.parse import urlparse
    import pprint
    pp = pprint.PrettyPrinter(indent=4)
    import html2text as html2text
    import datetime
    import time
    import re
    from lxml import html
    #from bs4 import BeautifulSoup
    import json
    
    #TF-IDF stuff:
    import string
    import nltk
    #nltk.download("popular")
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
      
if run_in == 'colab':
    !pip install beautifulsoup4
    !pip install requests
    !pip install html2text  
    import pandas as pd
    import os
    import bs4 as BeautifulSoup
    #from bs4 import BeautifulSoup
    import requests
    import numpy as np
    from urllib.parse import urlparse
    from urllib.parse import urljoin
    from urllib.parse import urlunsplit
    from urllib.parse import urlsplit
    import pprint
    pp = pprint.PrettyPrinter(indent=4)
    import html2text as html2text
    import datetime
    import time
    import re
    from lxml import html
    import json
    
    #TF-IDF stuff:
    import string
    import nltk
    nltk.download("popular")
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer

Define base paths:

In [None]:
# Pick the project root folder that matches the active environment.
if run_in == 'colab':
    base_path = "/content/drive/My Drive/Mark/"
    # On Colab the Google Drive has to be mounted before base_path is reachable.
    from google.colab import drive
    drive.mount('/content/drive')
elif run_in == 'local':
    base_path = "/users/USERNAME/Google Drive/USERNAME/"

print('base_path set to:')
print(base_path)

<p><font color="red">Define Version number!</font></p>

In [None]:
# Version tag of the input data; the cell that reads the workbook uses it to
# build the folder name, the file name and the sheet name.
version_id = 'V14'

<p><font color="red">Define whether first_time_run!</font></p>

In [None]:
# True  -> the "run from scratch" cell checks every domain.
# False -> the later cell first reads list_of_broken_domains.txt and only
#          checks the remaining domains.
first_time_run = False

# Read all start-ups and their website URLs

In [None]:
# Location of the workbook and name of the sheet with the company/URL pairs.
workbook_path = f"{base_path}Pitchbook_Crunchbase_Tracxn_Raw/Don't touch {version_id}/{version_id}.xlsx"
workbook_sheet = f"{version_id}website_url_scrape"

# Columns A and B hold company name and domain; the first row is skipped and
# the column names are supplied here instead.
df = pd.read_excel(
    workbook_path,
    workbook_sheet,
    header=None,
    skiprows=1,
    names=['company_name', 'domain'],
    usecols="A,B",
)

# Order by domain, drop incomplete rows, lower-case the domains and renumber
# the rows from zero.
df = df.sort_values(by=['domain']).dropna()
df['domain'] = df['domain'].str.lower()
df = df.reset_index(drop=True)

print(f"We have found {len(df)} startups")
#df = df.loc[df['company_name'] == "9fin"]
df.head()

## Check for which startups we already have all the information

Startups for which we already have a website saved

In [None]:
# Folders that hold the results of earlier runs. The trailing slash matters:
# website_contents_path is later concatenated directly with a file name.
website_contents_path = os.path.join(base_path + "/outputs/Scraping Websites/Website Contents/")
manual_webscrape_contents_path = os.path.join(base_path + "/outputs/Manual Webscrape/Raw/")
#os.listdir(website_contents_path)

# Names (file name without extension) of every company with a saved scrape:
# automatic scrapes are stored as .json, manual ones as .txt.
already_scraped_websites = []  # to be dropped once populated
for contents_path, extension in (
    (website_contents_path, ".json"),
    (manual_webscrape_contents_path, ".txt"),
):
    for root, dirs, files in os.walk(contents_path):
        for file in files:
            if file.endswith(extension):
                already_scraped_websites.append(os.path.splitext(file)[0])

# Count the rows of df without a saved scrape. Subtracting the number of files
# from len(df) is wrong as soon as a file belongs to a company that is not in
# df (the result can even become negative).
still_to_scrape = int((~df['company_name'].isin(already_scraped_websites)).sum())

#print(already_scraped_websites)
print("We already scraped the websites of", str(len(already_scraped_websites)), "startups.")
print("We still have to scrape the websites of", str(still_to_scrape), "startups.")

Remove the already scraped websites from the dataframe of websites which are to be scraped

In [None]:
# Keep only the companies for which no saved scrape was found.
has_saved_scrape = df['company_name'].isin(already_scraped_websites)
df = df[~has_saved_scrape]
print(df.shape)
df.head()

In [None]:
# Report how many companies remain in the work list.
print(f"We will scrape URLs of {len(df)} startups")

In [None]:
#df = df.loc[df['domain'] != "xrgenomics.com"]
#df = df.loc[df['company_name'] == "reinfer"]
#df

In [None]:
# Cap the size of this run: only the last `limit_num_of_scrapes` rows are kept.
limit_num_of_scrapes = 100
df = df.iloc[-limit_num_of_scrapes:]
print("We limited the number of websites to be scraped to", limit_num_of_scrapes)
df.head()

Create the company2domain dictionary (despite its name, it maps each domain to its company name)

In [None]:
# NOTE: despite its name, this lookup is keyed by domain and yields the
# company name. If a domain occurs more than once, the last row wins.
company2domain = {
    domain: company_name
    for domain, company_name in zip(df['domain'], df['company_name'])
}
print(company2domain)

In [None]:
#print(company2domain["techspert.io"])
#print("I think this should yield the company name Techspert...")
#print("I want to save the websites in a file with, startup, website")
#print("next have to change how the files are read in. They are read in by line but now have to be split and taken line[0]")

In [None]:
# Pretty-print the lookup table (keys are domains, values are company names).
pp.pprint(company2domain)

### Check for duplicates

In [None]:
# How often does each domain occur among the websites to be scraped?
url_occurrences = df['domain'].value_counts()

# Show only the domains that are listed more than once.
repeated_urls = url_occurrences[url_occurrences > 1]
if repeated_urls.empty:
    print("There were no duplicate URLs for startups")
else:
    print(repeated_urls)

In [None]:
# Plain Python list of the domains in df; the cells below iterate over it to
# check which domains respond.
list_of_domains = df['domain'].tolist()
#print(list_of_domains)

# Define functions:

Check if url responds

In [None]:
def try_url(url, timeout=30):
    """Report whether ``url`` answers an HTTP GET with status code 200.

    Args:
        url: Absolute URL including its scheme, e.g. "https://example.com".
        timeout: Seconds to wait for the server. Without a timeout a single
            unresponsive host would block the whole run indefinitely.

    Returns:
        True if the request completed with status 200, otherwise False.
        A non-200 status is printed together with the URL.
    """
    try:
        parsed_url = urlparse(url).geturl()
        result = requests.get(parsed_url, timeout=timeout)
    except Exception:
        # Best effort: any failure to reach the URL (DNS, TLS, timeout,
        # malformed URL, ...) means "does not respond". Exception rather than
        # a bare except, so that interrupting the notebook is not mistaken
        # for a broken domain.
        return False

    if result.status_code == 200:
        return True

    print(url, result.status_code)
    return False

Check if url scheme is correct, if not, repair

In [None]:
def url_check(no_scheme_url):
    """Find a working http(s) form of a domain or URL.

    A leading "www." is removed, "https://" is prepended when the value does
    not start with a scheme, and the URL is requested. If that fails, the same
    address is tried once more over plain "http://".

    Args:
        no_scheme_url: Domain or URL, with or without scheme.

    Returns:
        (True, url) with the first variant that responded, or (False, None)
        when neither variant responded.
    """
    if no_scheme_url.startswith("www."):
        print("no_scheme_url starts with www ->", no_scheme_url)
        no_scheme_url = no_scheme_url[4:]
        print("no_scheme_url is now->", no_scheme_url)

    # Only a scheme at the very start counts. A substring test would treat
    # "example.com/?next=http://other.org" as already having a scheme.
    if not no_scheme_url.startswith(("http://", "https://")):
        no_scheme_url = "https://" + no_scheme_url

    if try_url(no_scheme_url):
        return (True, no_scheme_url)

    # Retry over plain http. Split only at the scheme separator so that a
    # later "//" in the path or query does not cut the URL short.
    http_url = "http://" + no_scheme_url.split("//", 1)[1]
    #print(http_url)
    if try_url(http_url):
        return (True, http_url)

    return (False, None)

Text cleaning

In [None]:
def text_cleaner(text):
    """Turn raw text into a list of stemmed, lower-case English word tokens.

    Em and en dashes are replaced by spaces, the text is tokenized, ASCII
    punctuation is stripped from every token, and tokens that are a single
    character, not purely alphabetic, or English stop words are discarded.
    The remaining words are Porter-stemmed.

    Args:
        text: The text to clean.

    Returns:
        List of stems in their order of appearance (duplicates are kept).
    """
    # 1 Replace all dash variants apart from the plain hyphen U+002D
    without_dashes = str.replace(str.replace(text, "—", " "), "–", " ")
    # 2 Tokenize
    raw_tokens = word_tokenize(without_dashes)

    punctuation_table = str.maketrans('', '', string.punctuation)
    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    cleaned_words = []
    for raw_token in raw_tokens:
        # 3 Remove punctuation
        word = raw_token.replace("—", " ").translate(punctuation_table)
        # Skip lonely letters (and tokens that were punctuation only)
        if len(word) <= 1:
            continue
        # 4 Lower case
        word = word.lower()
        # 5 Keep purely alphabetic words only
        if not word.isalpha():
            continue
        # 6 Drop stop words
        if word in english_stop_words:
            continue
        # 7 Stem
        cleaned_words.append(stemmer.stem(word))
    return cleaned_words

In [None]:
# TEST APPEND WRITING
# list_of_broken_domains = ["A","D"]
# list_of_domains_test = ["A","B","C","D"]

# for url in list_of_domains_test:
#     if url not in list_of_broken_domains:
#         print(url)

# with open(base_path + 'outputs/Scraping Websites/list_of_broken_domains.txt', 'a+') as f:
#     for item in list_of_broken_domains:
#         f.write("%s\n" % item)

In [None]:
# TEST FILE READING
# with open(base_path + 'outputs/Scraping Websites/list_of_broken_domains.txt') as broken_domain_file:
#   list_of_broken_domains = [word for line in broken_domain_file for word in line.split()]
# list_of_broken_domains

If run from scratch

In [None]:
if first_time_run:
    # Check every domain from scratch and sort it into "responds" / "broken".
    list_of_scrapable_domains = []
    list_of_broken_domains = []

    for url in list_of_domains:
        worked, correct_url = url_check(url)

        if worked:
            print(url, "worked")
            list_of_scrapable_domains.append(correct_url)
        else:
            list_of_broken_domains.append(url)

    print("works ", list_of_scrapable_domains)
    print("")
    print("not ", list_of_broken_domains)

    # Remember the broken domains, one "<domain> | <company name>" per line.
    # This branch collects them in list_of_broken_domains; the name
    # list_new_of_broken_domains only exists in the not-first-run cell and
    # raised a NameError here.
    with open(base_path + 'outputs/Scraping Websites/list_of_broken_domains.txt', 'a+') as f:
        for item in list_of_broken_domains:
            company = company2domain[item]  # keyed by domain
            problem_company = str(item) + " | " + str(company)
            print(problem_company)
            f.write("%s\n" % problem_company)

If broken links should be omitted

In [None]:
# with open(base_path + 'outputs/Scraping Websites/list_of_broken_domains.txt') as broken_domain_file:
#   list_of_broken_domains = []
#   for line in broken_domain_file:
#     #print(line)
#     url = line.strip().split(' | ')[1]
#     list_of_broken_domains.append(url)
# print(list_of_broken_domains)

In [None]:
if not first_time_run:
    list_of_scrapable_domains = []
    list_new_of_broken_domains = []
    list_of_broken_domains = []

    # Obtain the existing list of broken domains from the .txt file. Lines are
    # written as "<domain> | <company name>", so the domain is the first
    # field; reading the second field compared company names against domains
    # and never skipped anything.
    with open(base_path + 'outputs/Scraping Websites/list_of_broken_domains.txt') as broken_domain_file:
        for line in broken_domain_file:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines
            list_of_broken_domains.append(line.split(' | ')[0])

    known_broken_domains = set(list_of_broken_domains)

    for url in list_of_domains:
        if url in known_broken_domains:
            print(url, "was sorted out because it is known to be broken")
            continue

        worked, correct_url = url_check(url)
        if worked:
            list_of_scrapable_domains.append(correct_url)
            print("SUCCESS!", correct_url, "saved to", "list_of_scrapable_domains")
        else:
            list_new_of_broken_domains.append(url)
            print(url, "saved to", "list_of_broken_domains")

    # Append only the newly found broken domains to the file.
    with open(base_path + 'outputs/Scraping Websites/list_of_broken_domains.txt', 'a+') as f:
        for item in list_new_of_broken_domains:
            company = company2domain[item]  # keyed by domain
            problem_company = str(item) + " | " + str(company)
            print(problem_company)
            f.write("%s\n" % problem_company)

In [None]:
#list_of_broken_domains

## Obtain list of all working domains

In [None]:
# Number of domains whose main page answered the availability check.
print(f"We have found {len(list_of_scrapable_domains)} urls where the main website responded")

## Find all sub websites on all main websites and save them in a list

In [None]:
# urljoin is only imported by the colab branch of the import cell, so bring it
# into scope here as well.
from urllib.parse import urljoin

# One entry per company: main URL, netloc, company name and the sub-pages found.
list_of_results = []

# Seconds to wait for a homepage before giving up on that company.
homepage_timeout = 30
# Substrings that mark a link as not worth scraping (documents, images, mail, social media).
rule_outs = [".pdf", ".jpg", ".jpeg", ".png", "mailto", "twitter"]

for k in list_of_scrapable_domains[:50]:
    try:
        print(k)

        if "http://" in k or "https://" in k:
            domain = k.split("//")[1]
        else:
            domain = k

        netloc = urlparse(k).netloc

        # Look the company up by domain. url_check() strips a leading "www.",
        # so also try the "www." form and the full URL before falling back to
        # the placeholder name "error".
        company_name = "error"
        for candidate in (domain, "www." + domain, k):
            if candidate in company2domain:
                company_name = company2domain[candidate]
                break
        else:
            print("this one had an error", company_name, k)

        result = {
            'domain': k,
            'netloc': netloc,
            'company_name': company_name,
        }

        # create three lists which will contain the chosen and dropped urls
        list_of_additionally_found_but_irrelevant_urls = []
        list_of_additionally_found_urls = []
        list_of_unusable_additionally_found_urls = []

        # obtain all links from the main website
        r = requests.get(k, timeout=homepage_timeout)
        html_content = r.text
        soup = BeautifulSoup.BeautifulSoup(html_content, 'lxml')
        temp_links = [a.get('href') for a in soup.find_all('a', href=True)]

        # normalise the links: drop "www.", one trailing '#' or '/', empty
        # links and duplicates
        temp_links_no_endchar = set()
        for t in temp_links:
            t = t.strip()
            if "www." in t:
                t = t.replace("www.", "")
            if t.endswith(('#', '/')):
                t = t[:-1]
            if t:
                temp_links_no_endchar.add(t)
        print("temp_links_no_endchar", temp_links_no_endchar)

        # Make every link absolute. urljoin resolves relative links against
        # the main page ("about.html", "/team", "//cdn.example.com/x"), which
        # plain string concatenation of domain and link got wrong. Links with
        # a non-http(s) scheme (mailto:, tel:, javascript:) cannot be scraped
        # and are dropped without issuing a request.
        links = []
        for link in temp_links_no_endchar:
            absolute_link = urljoin(k, link)
            if urlparse(absolute_link).scheme in ('http', 'https'):
                links.append(absolute_link)
        print("\033[1m" + "links" + "\033[0m", links)

        # keep links on the company's own host, record the others as irrelevant
        processed_links = []
        for found_link in links:
            print("processed_link", found_link)
            print("netloc is:", netloc, "urlparse is", urlparse(found_link).netloc)
            if netloc in urlparse(found_link).netloc:
                processed_links.append(found_link)
            else:
                print("bad", found_link)
                list_of_additionally_found_but_irrelevant_urls.append("netloc not in url: " + found_link)

        processed_links = [
            found_link for found_link in processed_links
            if not any(out in found_link for out in rule_outs)
        ]
        # add main page to the links in case it did not link to itself
        processed_links.append(k)
        # drop duplicates (order preserved) so that no URL is requested twice
        processed_links = list(dict.fromkeys(processed_links))
        print("processed_links", processed_links)

        print("\033[1m" + "No more duplicates in Links" + "\033[0m")
        # check if all links work
        for link in processed_links:
            print(link)
            worked, correct_link = url_check(link)
            if worked:
                list_of_additionally_found_urls.append(correct_link)
            else:
                list_of_unusable_additionally_found_urls.append(("unsuccessful url check", link))

        list_of_additionally_found_urls = list(set(list_of_additionally_found_urls))
        #list_of_unusable_additionally_found_urls = list(set(list_of_unusable_additionally_found_urls))

        result['list_of_additionally_found_urls'] = list_of_additionally_found_urls
        #result['list_of_unusable_additionally_found_urls']  = list_of_unusable_additionally_found_urls
        result['list_of_additionally_found_but_irrelevant_urls'] = list_of_additionally_found_but_irrelevant_urls

        list_of_results.append(result)

        # create a dataframe with all additionally_found_but_irrelevant_urls
        # (rebuilt for every company, so it only reflects the latest one)
        list_company_name_for_df_of_additionally_found_but_irrelevant_urls = (
            [company_name] * len(list_of_additionally_found_but_irrelevant_urls)
        )
        df_of_additionally_found_but_irrelevant_urls = pd.DataFrame()
        df_of_additionally_found_but_irrelevant_urls["company_name"] = list_company_name_for_df_of_additionally_found_but_irrelevant_urls
        df_of_additionally_found_but_irrelevant_urls["domain"] = list_of_additionally_found_but_irrelevant_urls

    except Exception as err:
        # Best effort per company: report which site failed and why, then
        # continue with the next one. Exception (not a bare except) keeps the
        # loop interruptible.
        print("Exception next ", k, repr(err))

## Iterate over all sub-websites and apply functions

In [None]:
#print(len(list_of_results[:40])

## Scrape all websites

In [None]:
all_company_dict = {}
list_of_accessible_but_unscrapable_domains = []

# Seconds to wait for a single page before the company counts as unscrapable.
page_timeout = 30

for result in list_of_results:
    # URL that is reported if this company fails. It starts as the main URL
    # and follows the page loop, so it is never undefined and never left over
    # from the previous company.
    current_url = result.get("domain")
    try:
        company_name = result["company_name"]
        result["raw_web_content"] = {}
        result["stemmed_web_content"] = {}
        result["set_stemmed_web_content"] = {}
        result["set_stemmed_entire_web_content"] = []
        stems_of_entire_website = set()

        for res in result['list_of_additionally_found_urls']:
            current_url = res
            print(res)
            source = requests.get(res, timeout=page_timeout)
            html_content = source.text
            # cut <head>, <footer> and <span> elements before converting to text
            html_content = re.sub("<head>.*?</head>", "", html_content, flags=re.DOTALL)
            html_content = re.sub("<footer>.*?</footer>", "", html_content, flags=re.DOTALL)
            html_content = re.sub("<span.*?</span>", "", html_content, flags=re.DOTALL)
            raw_website_content = html2text.html2text(html_content)
            raw_website_content = re.sub(r'\([^)]*\)', '', raw_website_content) # get rid of stuff in round brackets ( )
            raw_website_content = re.sub(r'!?\[[^\]]*\]', '', raw_website_content) # get rid of stuff in brackets []
            raw_website_content = re.sub(r'<[^\>]*>',' ',raw_website_content) # get rid of stuff in brackets <>
            #raw_website_content = re.sub(r'\d', '', raw_website_content) # get rid of all digits
            raw_website_content = re.sub(r'[^a-zA-Z\n\s\d\.\,\!\?\'\:]+', '', raw_website_content) # get rid of special characters apart from \n space , . ! ? ' :
            raw_website_content = re.sub(r'(?<!\n)\n{1,}', ' ', raw_website_content) # replace every run of newlines by a single space
            raw_website_content = re.sub(' +',' ',raw_website_content) # collapse runs of spaces into one space
            raw_website_content = raw_website_content.lstrip()

            # put content of the raw sub-website in dictionary
            result["raw_web_content"][res] = raw_website_content

            # stem the raw sub-website and put in dictionary
            stemmed = text_cleaner(raw_website_content)
            result["stemmed_web_content"][res] = stemmed

            # set of stems for this sub-website, stored as its string form
            stems_of_page = set(stemmed)
            result["set_stemmed_web_content"][res] = str(stems_of_page)

            # Union over all sub-websites, built from the real set. Parsing
            # the string form back injected the bogus token "set()" for every
            # page without any stems.
            stems_of_entire_website.update(stems_of_page)

        result["set_stemmed_entire_web_content"] = str(stems_of_entire_website)

        with open(website_contents_path + company_name + '.json', 'w') as website_contents:
            json.dump(result, website_contents)
        print(company_name, "website saved")
    except Exception as err:
        # Best effort per company: report what failed and why, remember the
        # URL and continue with the next company.
        print("Did not work", current_url, repr(err))
        list_of_accessible_but_unscrapable_domains.append(current_url)

# Remember the URLs that responded but could not be scraped.
with open(base_path + 'outputs/Scraping Websites/list_of_accessible_but_unscrapable_domains.txt', 'a+') as f:
    for item in list_of_accessible_but_unscrapable_domains:
        f.write("%s\n" % item)

## Saved to:

In [None]:
# Tell the user in which folder the per-company .json files were written.
print(f"saved to: {website_contents_path}")