In [1]:
from bs4 import BeautifulSoup, Tag
import requests
import re
import pandas as pd
import numpy as np
import time 
from selenium import webdriver
from selenium.webdriver.firefox.options import Options 
from selenium.webdriver.common.by import By
import glob

## Scrapping the data from SeekingAlpha
We are interested in the earning call transcripts SeekingAlpha provides. There is no way to easily export those transcripts, even less so without paying for membership. As such, our best option is scrapping fast enough before we're blocked by the paywall.
There is a lot of JS and browser shenanigans happening that prevent us from fetching the data by simple HTTP GETs, to overcome this, we use Selenium to automate the fetching process. 
For instance, we have to scroll down when scrapping transcripts in order for the browser to fetch the entirety of the transcript. However, doing so, we raise an automated failsafe that blocks us from interacting with the page anymore. We're fast enough to get the texts before we're blocked however. 

In [2]:
# Get all links to transcripts from an index page
def get_transcripts_links(driver, page_idx):
    driver.get(f'https://www.seekingalpha.com/earnings/earnings-call-transcripts?page={page_idx}')
    time.sleep(0.1)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    return [a['href'] for a in soup.find_all('a', class_='km-v')]


In [3]:
# Gets the transcript from the link before the paywall appears
def grab_transcript(driver, link):
    
    driver.get("https://www.seekingalpha.com" + link)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    time.sleep(0.1)

    soup = BeautifulSoup(driver.page_source, 'lxml')
    
    article = soup.find_all('div', attrs={'data-test-id':'article-selection'})
    article.extend(soup.find_all(class_='paywall-full-content'))
    time.sleep(1)
    return article, []
    # return article, soup

In [4]:
# Check if the tag has bold content in it (i.e. the speaker's name)
def has_bold_content(tag):
    for child in tag.children:
        if isinstance(child, Tag):
            if child.name =='strong':
                return True
    return False

In [5]:
# Checks that we are not scrapping something we've acquired already
def is_already_scrapped(link, glob):
    return link in glob

In [6]:
options = Options()

driver = webdriver.Firefox(options=options)


# Scraps the list of links to the transcripts 
transcripts_links = []
for i in range(50, 150):
   transcripts_links.extend(get_transcripts_links(driver, i))




# Scraps the transcripts
transcripts = {}

for i, transcript in enumerate(transcripts_links):
    print(f'{i}/{len(transcripts_links)}')
    transcripts[transcript] = grab_transcript(driver, transcript) 


driver.quit()
# Saves the transcripts
transcripts_glob = glob.glob("transcripts/*.txt")
for transcript_link in transcripts:
    if transcript_link.split('/')[-1][8:]+".txt" in transcripts_glob:
        pass
    with open("transcripts/" + transcript_link.split('/')[-1][8:]+".txt", 'w') as f:
        
        f.write(
            "\n"
            .join([
                f"[{p.text}]" if has_bold_content(p) else p.text 
                for p in transcripts[transcript_link][0]]))
        



0/3960
1/3960
2/3960
3/3960
4/3960
5/3960
6/3960
7/3960
8/3960
9/3960
10/3960
11/3960
12/3960
13/3960
14/3960
15/3960
16/3960
17/3960
18/3960
19/3960
20/3960
21/3960
22/3960
23/3960
24/3960
25/3960
26/3960
27/3960
28/3960
29/3960
30/3960
31/3960
32/3960
33/3960
34/3960
35/3960
36/3960
37/3960
38/3960
39/3960
40/3960
41/3960
42/3960
43/3960
44/3960
45/3960
46/3960
47/3960
48/3960
49/3960
50/3960
51/3960
52/3960
53/3960
54/3960
55/3960
56/3960
57/3960
58/3960
59/3960
60/3960
61/3960
62/3960
63/3960
64/3960
65/3960
66/3960
67/3960
68/3960
69/3960
70/3960
71/3960
72/3960
73/3960
74/3960
75/3960
76/3960
77/3960
78/3960
79/3960
80/3960
81/3960
82/3960
83/3960
84/3960
85/3960
86/3960
87/3960
88/3960
89/3960
90/3960
91/3960
92/3960
93/3960
94/3960
95/3960
96/3960
97/3960
98/3960
99/3960
100/3960
101/3960
102/3960
103/3960
104/3960
105/3960
106/3960
107/3960
108/3960
109/3960
110/3960
111/3960
112/3960
113/3960
114/3960
115/3960
116/3960
117/3960
118/3960
119/3960
120/3960
121/3960
122/3960
123

In [7]:
# In case we ever get blocked by SeekingAlpha when they'll increase their robustness, we can use a multitude of different agents to avoid being blocked
import random
user_agent_list = [
   #Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win  64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    #Firefox
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]
user_agent = random.choice(user_agent_list)



In [8]:

parse_site = 'https://seekingalpha.com/article/4333315-swiss-re-ag-ssref-2019-annual-report-conference-call-transcript?part=single'
html_content = requests.get(parse_site, headers={'User-Agent': user_agent}).text
print (BeautifulSoup(html_content, 'lxml'))

<!DOCTYPE html>
<html lang="en" prefix="og: https://ogp.me/ns#"><head><meta charset="utf-8"/><title data-static-tag-from="prerender">Swiss Re AG (SSREF) 2019 Annual Report Conference Call (Transcript) | Seeking Alpha</title><link href="https://static.seekingalpha.com" rel="preconnect"/><link href="https://static1.seekingalpha.com" rel="preconnect"/><link href="https://static2.seekingalpha.com" rel="preconnect"/><link href="https://static3.seekingalpha.com" rel="preconnect"/><link crossorigin="anonymous" href="https://www.googletagmanager.com" rel="preconnect"/><link crossorigin="anonymous" href="https://www.google-analytics.com" rel="preconnect"/><meta content="width=device-width,initial-scale=1,viewport-fit=cover" name="viewport"/><meta content="ie=edge" http-equiv="X-UA-Compatible"/><meta content="Seeking Alpha" name="application-name"/><meta content="True" name="HandheldFriendly"/><meta content="yes" name="apple-mobile-web-app-capable"/><meta content="SeekingAlpha" name="apple-mobil

In [9]:
with requests.Session() as s:

    req = s.get(base_url, headers=headers)

NameError: name 'base_url' is not defined

In [None]:
req

<Response [200]>

In [None]:
## Get the list of links to the transcripts
soup = BeautifulSoup(req.content, 'html.parser')
links = soup.find_all('a', {'class': 'dashboard-article-link'})
links = [link.get('href') for link in links]
links

[]

In [None]:
# Legacy code
def get_date(c):
    end = c.find('|')
    return c[0:end-1]

def get_ticker(c):
    beg = c.find('(')
    end = c.find(')')
    return c[beg+1:end]

def grab_page(url):
    print("attempting to grab page: " + url)
    page = requests.get(url)
    page_html = page.text
    soup = BeautifulSoup(page_html, 'html.parser')

    meta = soup.find("div",{'class':'a-info get-alerts'})
    content = soup.find(id="a-body")

    if type(meta) or type(content) == "NoneType":
        print("skipping this link, no content here")
        return
    else:
        text = content.text
        mtext = meta.text

        filename = get_ticker(mtext) + "_" + get_date(mtext)
        file = open(filename.lower() + ".txt", 'w')
        file.write(text)
        file.close
        print(filename.lower()+ " sucessfully saved")

def process_list_page(i):
    origin_page = "https://seekingalpha.com/earnings/earnings-call-transcripts" + "/" + str(i)
    print("getting page " + origin_page)
    page = requests.get(origin_page)
    page_html = page.text
    #print(page_html)
    soup = BeautifulSoup(page_html, 'html.parser')
    alist = soup.find_all("li",{'class':'list-group-item article'})
    for i in range(0,len(alist)):
        url_ending = alist[i].find_all("a")[0].attrs['href']
        url = "https://seekingalpha.com" + url_ending
        grab_page(url)
        time.sleep(.5)