In [1]:
### import required libraries

import re
from datetime import datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
### set date range for weekly report
# Read the clock once and derive the lower bound from it, so the window is
# always exactly seven days even if the cell runs across midnight.
date_range_high = datetime.today().date()
date_range_low = date_range_high - timedelta(days=7)

In [3]:
# User-Agent header value sent with the requests-based scrapers below.
user_agent = (
    'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) '
    'AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405'
)

#### Reuters M&A News

In [None]:
### webscrape Reuters M&A News
# Pages through the Reuters mergers archive and records headline -> date in
# reuters_news_dict. Paging stops once the last story on a page is older than
# date_range_low, or when a page yields nothing to parse.
date_filter = date_range_high
page_number = 1
reuters_news_dict = {}
reuters_news_url = "https://www.reuters.com/news/archive/mergersnews?view=page&page={}&pageSize=10"

while date_filter >= date_range_low:

    # A timeout keeps a stalled connection from hanging the notebook.
    reuters_raw = requests.get(
        reuters_news_url.format(page_number),
        headers={'User-Agent': user_agent},
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of parsing an error page.
    reuters_raw.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    reuters_bs4 = BeautifulSoup(reuters_raw.content, 'html.parser')

    reuters_news_block = reuters_bs4.find_all('div', class_ = 'column1 col col-10')
    if not reuters_news_block:
        # No story container on this page: nothing more to read.
        break
    reuters_news_list  = reuters_news_block[0].find_all('h3', class_ = 'story-title')
    reuters_timestamp_list = reuters_news_block[0].find_all('span', class_ = 'timestamp')

    page_dates = []
    # zip() pairs each headline with its timestamp and stops at the shorter
    # list, so a missing timestamp cannot raise IndexError.
    for headline, timestamp in zip(reuters_news_list, reuters_timestamp_list):
        # strip() removes the surrounding newline/tab padding without relying
        # on an exact number of tab characters.
        title = headline.get_text().strip()

        a_date = timestamp.get_text().strip()
        if 'am' in a_date or 'pm' in a_date:
            # Presumably a time-of-day stamp shown for stories published today.
            a_date = datetime.today().date()
        else:
            a_date = datetime.strptime(a_date, "%b %d %Y").date()

        reuters_news_dict[title] = a_date
        page_dates.append(a_date)

    if not page_dates:
        # Without a parsed date the loop condition could never change.
        break

    # The last story on the page decides whether to fetch the next page.
    date_filter = page_dates[-1]
    page_number += 1

In [None]:
# Keep only the Reuters stories dated on or after the start of the report window.
reuters_news_dict = dict(
    filter(lambda story: story[1] >= date_range_low, reuters_news_dict.items())
)

#### WSJ Deals News

In [None]:
# Pages through the WSJ deals listing and accumulates headlines and their
# dates in title_list / date_list (same position = same article). Paging stops
# once the last timestamp on a page is older than date_range_low, or when a
# page has no timestamps.
wsj_deals_url = "https://www.wsj.com/news/types/deals-deal-makers?page={}"
page_number = 1
date_filter = date_range_high

# Created once, outside the loop, so earlier pages are not discarded when the
# next page is fetched.
title_list = []
date_list = []

while date_filter >= date_range_low:
    # A timeout keeps a stalled connection from hanging the notebook.
    wsj_raw = requests.get(
        wsj_deals_url.format(page_number),
        headers={'User-Agent': user_agent},
        timeout=30,
    )
    wsj_raw.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    wsj_bs4 = BeautifulSoup(wsj_raw.content, 'html.parser')

    page_titles = [
        article.get_text()
        for article in wsj_bs4.select('h2[class*="headline"]')
    ]
    page_dates = [
        datetime.strptime(timestamp.get_text().strip(), "%B %d, %Y").date()
        for timestamp in wsj_bs4.select('div[class*="timestamp"]')
    ]

    if not page_dates:
        # Without a parsed date the loop condition could never change.
        break

    # Pair per page so a count mismatch on one page cannot shift the
    # title/date alignment of later pages.
    for headline, a_date in zip(page_titles, page_dates):
        title_list.append(headline)
        date_list.append(a_date)

    # The last timestamp on the page decides whether to fetch the next page.
    date_filter = page_dates[-1]
    page_number += 1

In [None]:
# Pair each WSJ headline with its date, then drop anything older than the
# start of the report window.
wsj_pairs = zip(title_list, date_list)
wsj_news_dict = dict(wsj_pairs)
wsj_news_dict = dict(
    story for story in wsj_news_dict.items() if story[1] >= date_range_low
)

In [None]:
# Display the WSJ headline -> date mapping for a quick visual check.
wsj_news_dict

In [1]:
# Configure and start the Chrome session used by the Selenium-based scrapers.
options = Options()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
s = Service('chromedriver/chromedriver')
# Pass the options object to the driver; without it the two experimental
# options set above are never applied.
driver = webdriver.Chrome(service=s, options=options)

#### theMiddleMarket M&A News

In [2]:
# Load theMiddleMarket's latest-news page and split the text of the first
# matching container into lines.
ma_news_url = 'https://www.themiddlemarket.com/latest-news'
driver.get(ma_news_url)

latest_news = driver.find_elements(By.XPATH,"/html/body/main/div/div/div/div[1]/div/div")
latest_news = latest_news[0].text.split('\n')

# Even-indexed lines are taken as headlines, odd-indexed lines as their dates.
ma_article_list = latest_news[0::2]
ma_date_list = latest_news[1::2]
theMiddleMarket_news_dict = {
    headline: raw_date for headline, raw_date in zip(ma_article_list, ma_date_list)
}

In [None]:
# Convert each scraped date string to a date (title-casing it first so the
# month name matches %B) and keep only stories inside the report window.
recent_middle_market = {}
for headline, raw_date in theMiddleMarket_news_dict.items():
    published = datetime.strptime(raw_date.title(), '%B %d, %Y').date()
    if published >= date_range_low:
        recent_middle_market[headline] = published
theMiddleMarket_news_dict = recent_middle_market

In [None]:
# Display the theMiddleMarket headline -> date mapping for a quick visual check.
theMiddleMarket_news_dict

#### New York Times Mergers News

In [None]:
# Load the NYT mergers topic page and split the text of the first matching
# collection container into lines.
nyt_url = "https://www.nytimes.com/topic/subject/mergers-acquisitions-and-divestitures"
driver.get(nyt_url)
nyt_collection_xpath = '//*[@id="collection-Mergers, Acquisitions and Divestitures"]/div[1]/div'
nyt_collection = driver.find_elements(By.XPATH, nyt_collection_xpath)
nyt_news_raw = nyt_collection[0].text.split('\n')

In [None]:
def _parse_nyt_date(text):
    """Return the date encoded in ``text``, or None if it is not a date line.

    Accepts abbreviated months with a trailing period ("Jan. 5, 2022") and
    fully spelled months ("March 5, 2022").
    NOTE(review): NYT appears to spell out some months and to use "Sept."
    for September; confirm against the live page.
    """
    cleaned = text.strip().replace('Sept.', 'Sep.')
    for date_format in ("%b. %d, %Y", "%B %d, %Y"):
        try:
            return datetime.strptime(cleaned, date_format).date()
        except ValueError:
            # Not in this format; try the next one.
            continue
    return None


# Scan the page text line by line: every line that parses as a date marks an
# article, whose headline is taken from two lines above it.
index_list = []
date_list = []
for count, text in enumerate(nyt_news_raw):
    a_date = _parse_nyt_date(text)
    if a_date is None:
        continue

    title_index = count - 2
    if title_index < 0:
        # A negative index would silently pick a line from the end of the list.
        continue

    if a_date < date_range_low:
        # Same weekly window as the other sources.
        continue

    date_list.append(a_date)
    index_list.append(title_index)

title_list = [nyt_news_raw[i] for i in index_list]
nyt_ma_news_dict = dict(zip(title_list, date_list))

In [None]:
# Close the browser window used by the Selenium scrapers above.
# NOTE(review): any later cell that needs a browser must start its own
# session after this point — confirm cell ordering on a Restart & Run All.
driver.close()

In [None]:
# Merge the per-source headline -> date mappings into one dict; when the same
# headline appears in more than one source, the later source's date wins.
# NOTE(review): wsj_news_dict is not part of this merge — confirm whether
# that is intentional.
combined_dict = {}
for source_news in (nyt_ma_news_dict, theMiddleMarket_news_dict, reuters_news_dict):
    combined_dict.update(source_news)

In [None]:
# Number of distinct headlines collected across the merged sources.
len(combined_dict)

In [None]:
# Load the client list and index it by client name, giving
# {client: {column: value, ...}} for the remaining columns.
elephants = pd.read_excel('elephants.xlsx')
elephants_by_client = elephants.set_index('Client')
elephants_dict = elephants_by_client.to_dict(orient='index')

In [None]:
# Add (or overwrite) three clients by hand, each with placeholder broker details.
for extra_client in ("BNP Paribas", "DemandDrive", 'Intel'):
    elephants_dict[extra_client] = {'Broker': 'xxx', 'Broker Email': 'xxx.xxx@cbre.com'}

In [None]:
# For every client, collect the headlines that mention the client by name and
# attach the broker contact details from elephants_dict.
results = {}

for client, client_info in elephants_dict.items():
    # Skip blank or non-text keys (e.g. an empty spreadsheet cell); they
    # cannot be matched meaningfully against headlines.
    if not isinstance(client, str) or not client.strip():
        continue

    # Match the name as a whole term, case-insensitively, so a short name such
    # as "Intel" does not match inside a longer word like "intelligence".
    client_pattern = re.compile(
        r'(?<!\w)' + re.escape(client.strip()) + r'(?!\w)',
        re.IGNORECASE,
    )
    client_res = [title for title in combined_dict if client_pattern.search(title)]

    if client_res:
        results[client] = {'news': client_res, 'broker': client_info['Broker'], 'broker_email': client_info['Broker Email']}

In [None]:
# One row per matched client: transpose so clients become rows, then turn the
# client index into a regular 'company' column.
results_by_client = pd.DataFrame(results).T
results_df = results_by_client.reset_index().rename(columns={'index': 'company'})
results_df

#### Seeking Alpha M&A News

In [None]:
def _parse_seeking_alpha_date(text, today):
    """Convert a Seeking Alpha footer date string into a ``date``.

    text  -- raw footer text; handled forms are strings containing "Today" or
             "Yesterday", a three-token weekday/month/day string without a
             year, and a weekday/month/day/year string.
    today -- the reference date for relative and year-less strings.

    Periods are removed before parsing so month abbreviations parse with or
    without a trailing period. Raises ValueError for any other layout.
    """
    if 'Today' in text:
        return today
    if 'Yesterday' in text:
        return today - timedelta(days=1)

    cleaned = text.replace('.', '')
    if len(text.split(' ')) == 3:
        # No year on the page: assume the current year, and fall back to the
        # previous year when that would put the article in the future
        # (e.g. a December story read in January).
        for year in (today.year, today.year - 1):
            try:
                candidate = datetime.strptime(f"{cleaned} {year}", "%a, %b %d %Y").date()
            except ValueError:
                continue
            if candidate <= today:
                return candidate
        raise ValueError(f"unrecognised Seeking Alpha date: {text!r}")

    return datetime.strptime(cleaned, "%a, %b %d, %Y").date()


# The shared browser window is closed in an earlier cell, so this cell starts
# its own session and always shuts it down again.
driver = webdriver.Chrome(service=Service('chromedriver/chromedriver'), options=options)
try:
    driver.get('https://seekingalpha.com/market-news/m-a?page=1')

    article_elements = driver.find_elements(By.XPATH, "//*[@id='content']/div/div[2]/div/div[2]/section/div/div/div/div[2]")
    # Guard against an empty match instead of failing with IndexError.
    article_block = article_elements[0].text.split('\n') if article_elements else []

    # NOTE(review): this XPath uses div[3] where the one above uses div[2], and
    # it is pinned to article[1], so it likely matches only the first
    # article's date — confirm against the live page.
    date_list = driver.find_elements(By.XPATH, "//*[@id='content']/div/div[3]/div/div[2]/section/div/div/div/div[2]/article[1]/div/div/footer/span[2]")
    date_list = [i.text for i in date_list]
finally:
    driver.quit()

# Every other line of the block text (even positions) is kept as a headline.
article_list = article_block[0::2]

# Read the clock once so every relative date uses the same reference day.
today = datetime.today().date()
formatted_date_list = [_parse_seeking_alpha_date(a_date, today) for a_date in date_list]

In [None]:
# Display date_list as last assigned (the Seeking Alpha cell above sets it to
# the raw footer text strings).
date_list

In [None]:
# Pair each Seeking Alpha headline with its parsed date (pairing stops at the
# shorter of the two lists).
seeking_alpha_news_dict = {
    headline: published
    for headline, published in zip(article_list, formatted_date_list)
}

In [None]:
# Display the Seeking Alpha headline -> date mapping for a quick visual check.
seeking_alpha_news_dict