### 0. Crawl the suspended stock list from NASDAQ

In [1]:
import json
import re
import urllib
import urllib.parse
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download NASDAQ's "issuers pending suspension/delisting" page and parse it.
url = 'https://listingcenter.nasdaq.com/IssuersPendingSuspensionDelisting.aspx'
# The context manager closes the HTTP connection as soon as the body has been
# parsed, instead of leaving the socket open for the lifetime of the kernel.
with urllib.request.urlopen(url) as response:
    soup = BeautifulSoup(response, 'html.parser')

In [3]:
# Locate the <table> element carrying the 'rgMasterTable' CSS class.
table = soup.find('table', attrs={'class': 'rgMasterTable'})

In [4]:
# One inner list per <tr> of the table body, one string per <td>.
# A cell whose text is exactly a non-breaking space is stored as the string 'None'.
stock_list = [
    [
        'None' if cell_text == '\xa0' else cell_text
        for cell_text in (cell.getText() for cell in row_tag.find_all('td'))
    ]
    for row_tag in table.tbody.find_all('tr')
]

In [5]:
# Column labels, in the order the cells appear in each scraped row.
# They are passed to the constructor (rather than assigned to df.columns
# afterwards) so that an empty scrape yields an empty frame with the right
# columns instead of a "Length mismatch" ValueError.
df = pd.DataFrame(
    stock_list,
    columns=['Issuer Name','Symbol','Reason','Status','Effective Date','Form 25 Date'],
)

In [6]:
# Display the scraped table as a quick visual check of the crawl.
df

Unnamed: 0,Issuer Name,Symbol,Reason,Status,Effective Date,Form 25 Date
0,Altus Midstream Company,ALTMW,Regulatory/Non Compliance,Suspended,12/20/2018,2/26/2019
1,"Beneficial Bancorp, Inc.",BNCL,Acquisition/Merger,Suspended,3/1/2019,2/28/2019
2,"Concrete Pumping Holdings, Inc.",BBCPW,Regulatory/Non Compliance,Suspended,1/17/2019,2/26/2019
3,"Eco-Stim Energy Solutions, Inc.",ESES,Regulatory/Non Compliance,Suspended,1/2/2019,2/26/2019
4,Elbit Imaging Ltd.,EMITF,Regulatory/Non Compliance,Suspended,2/11/2019,
5,Helios and Matheson Analytics Inc,HMNY,Regulatory/Non Compliance,Suspended,2/13/2019,
6,Invesco Actively Managed Exchange-Traded Fund ...,LALT,Liquidation,Suspended,2/21/2019,2/28/2019
7,Invesco Exchange-Traded Fund Trust II,DWLV,Liquidation,Suspended,2/21/2019,2/28/2019
8,Invesco Exchange-Traded Fund Trust II,PAGG,Liquidation,Suspended,2/21/2019,2/28/2019
9,Invesco Exchange-Traded Fund Trust II,PSAU,Liquidation,Suspended,2/21/2019,2/28/2019


#### Filter the results by status, since some issuers may resume trading.

In [7]:
# Keep only the rows whose 'Status' column equals "Suspended", then show them.
suspended_df = df.loc[df['Status'].eq("Suspended")]
suspended_df

Unnamed: 0,Issuer Name,Symbol,Reason,Status,Effective Date,Form 25 Date
0,Altus Midstream Company,ALTMW,Regulatory/Non Compliance,Suspended,12/20/2018,2/26/2019
1,"Beneficial Bancorp, Inc.",BNCL,Acquisition/Merger,Suspended,3/1/2019,2/28/2019
2,"Concrete Pumping Holdings, Inc.",BBCPW,Regulatory/Non Compliance,Suspended,1/17/2019,2/26/2019
3,"Eco-Stim Energy Solutions, Inc.",ESES,Regulatory/Non Compliance,Suspended,1/2/2019,2/26/2019
4,Elbit Imaging Ltd.,EMITF,Regulatory/Non Compliance,Suspended,2/11/2019,
5,Helios and Matheson Analytics Inc,HMNY,Regulatory/Non Compliance,Suspended,2/13/2019,
6,Invesco Actively Managed Exchange-Traded Fund ...,LALT,Liquidation,Suspended,2/21/2019,2/28/2019
7,Invesco Exchange-Traded Fund Trust II,DWLV,Liquidation,Suspended,2/21/2019,2/28/2019
8,Invesco Exchange-Traded Fund Trust II,PAGG,Liquidation,Suspended,2/21/2019,2/28/2019
9,Invesco Exchange-Traded Fund Trust II,PSAU,Liquidation,Suspended,2/21/2019,2/28/2019


### 1. Build query structure
#### query_list0 is for Google News; query_list1 is for Globenewswire

In [8]:
# Build {row_label: {column: value}} from the *filtered* frame, so issuers whose
# status is no longer "Suspended" are not queried for news. (The original used
# the unfiltered `df`, which made the filtering step above a no-op.)
ticker = suspended_df.to_dict('index')

In [9]:
def build_query_list(tickers=None):
    """Build the search strings for both news sources.

    For every issuer, commas and periods in the issuer name are replaced by
    spaces, the name is split into words, and the ticker symbol is appended.

    Parameters
    ----------
    tickers : dict, optional
        Mapping of row label -> record, where each record has the keys
        'Issuer Name' and 'Symbol'. Defaults to the notebook-level `ticker`.

    Returns
    -------
    tuple of (list of str, list of str)
        ``query_list0`` with the words joined by '+' (Google News) and
        ``query_list1`` with the words joined by ' ' (Globenewswire), in the
        same order as the input records.
    """
    if tickers is None:
        tickers = ticker  # fall back to the notebook-level mapping

    query_list0 = []
    query_list1 = []
    for record in tickers.values():
        # Punctuation becomes a separator so "Bancorp, Inc." -> ["Bancorp", "Inc"].
        words = re.sub(r'[,\.]', ' ', record['Issuer Name']).split()
        words.append(record['Symbol'])
        query_list0.append('+'.join(words))
        query_list1.append(' '.join(words))
    return (query_list0, query_list1)

In [10]:
# Unpack the two parallel lists: '+'-joined queries and space-joined queries.
query_list0, query_list1 = build_query_list()

#### After testing different combinations, we found that company_name+ticker works best for Globenewswire, while company_name+ticker+"stock"+"suspend" works best for Google News.

#### After testing different values of k for selecting the top-k retrieved results, we found that simply taking the top 1 result from each news source gives good performance.

### 2.1 Google News Crawler

In [11]:
# return the top results (top 1 by default)
def parseSingleNews(query, top_k=1):
    """Search the Google News RSS feed and return the first ``top_k`` items.

    Parameters
    ----------
    query : str
        Search words joined by '+', not yet percent-encoded.
        '+stock+suspend' is appended to it before the request is sent.
    top_k : int, optional
        Maximum number of feed items to return (default 1).

    Returns
    -------
    list of dict
        One dict per item with the keys 'title', 'pubdate', 'link' and
        'abstract'. A field that cannot be found is an empty string.

    Raises
    ------
    urllib.error.URLError
        If the feed cannot be fetched.
    """
    # Percent-encode everything except '+', which is kept as the word separator.
    # Without this, characters such as '&' or non-ASCII letters in an issuer
    # name would corrupt the query string or make urlopen fail.
    encoded_query = urllib.parse.quote(query, safe='+')
    url = 'https://news.google.com/rss/search?q='+encoded_query+'+stock+suspend&hl=en-US&gl=US&ceid=US:en'
    # Close the connection as soon as the feed has been parsed.
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    def _text(item, tag_name):
        """Return the text of the first ``tag_name`` child, or '' if absent."""
        tag = item.find(tag_name)
        return tag.get_text() if tag is not None else ''

    news_list = []
    for item in soup.find_all('item')[:top_k]:
        description = _text(item, 'description')
        # Non-greedy groups: stop at the FIRST link / paragraph even when the
        # description embeds several of them.
        link_match = re.search(r'href="(.*?)" target', description)
        abstract_match = re.search(r'<p>(.*?)</p>', description)
        news_list.append({
            'title': _text(item, 'title'),
            # html.parser lower-cases tag names, hence 'pubdate' rather than 'pubDate'.
            'pubdate': _text(item, 'pubdate'),
            'link': link_match.group(1) if link_match else '',
            'abstract': abstract_match.group(1) if abstract_match else '',
        })
    return news_list

In [12]:
def allQuery(query_list):
    """Run `parseSingleNews` for every '+'-joined query in ``query_list``.

    Returns a dict keyed by the query with each '+' replaced by a space; the
    value is the list returned by `parseSingleNews` for that query.
    """
    return {
        query.replace('+', ' '): parseSingleNews(query)
        for query in query_list
    }

In [13]:
# Query Google News for every company; each query triggers one HTTP request.
google_news = allQuery(query_list0)

### 2.2 Globenewswire Crawler

In [14]:
# Wrap each space-joined query in the params dict that requests.get receives.
query_list2 = [{'keyword': q} for q in query_list1]

In [15]:
# return the top results (top 1 by default)
def parseSingleNews1(query, top_k=1):
    """Search Globenewswire and return the first ``top_k`` results.

    Parameters
    ----------
    query : dict
        Query-string parameters passed to ``requests.get``; the notebook
        passes ``{'keyword': <search text>}``.
    top_k : int, optional
        Maximum number of search results to inspect (default 1).

    Returns
    -------
    list of dict
        One dict per result with the keys 'title', 'pubdate', 'link' and
        'abstract'. Results without a title link are skipped; a missing date
        or abstract is an empty string.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or does not complete within the timeout.
    """
    base_url = 'https://globenewswire.com/'
    # A timeout keeps one unresponsive request from hanging the whole notebook.
    r = requests.get('https://globenewswire.com/Search', params=query, timeout=30)
    result = BeautifulSoup(r.text,'html.parser')
    contents = result.find_all("div", attrs = {"class" : "results-link"})
    news_list = []
    for content in contents[:top_k]:
        heading = content.find("h1", attrs = {"class" : "post-title16px"})
        anchor = heading.find("a") if heading is not None else None
        if anchor is None:
            # No title link to report; skip instead of raising AttributeError.
            continue
        date_tag = content.find("span", attrs = {"class" : "dt-green"})
        paragraphs = content.find_all("p")
        news_list.append({
            'title': anchor.text,
            'pubdate': date_tag.get_text() if date_tag is not None else '',
            # urljoin avoids the doubled slash ("globenewswire.com//news-release")
            # that plain concatenation produced for hrefs starting with '/'.
            'link': urllib.parse.urljoin(base_url, anchor.get('href', '')),
            # The abstract is taken from the second <p> of the result block.
            'abstract': paragraphs[1].text if len(paragraphs) > 1 else '',
        })
    return news_list

In [19]:
def allQuery1(query_list):
    """Run `parseSingleNews1` for every params dict in ``query_list``.

    Returns a dict mapping each entry's 'keyword' value to the list returned
    by `parseSingleNews1` for that entry.
    """
    results_by_keyword = {}
    for params in query_list:
        results_by_keyword[params['keyword']] = parseSingleNews1(params)
    return results_by_keyword

In [21]:
# Query Globenewswire for every company; each query triggers one HTTP request.
globe_news = allQuery1(query_list2)

# Merge the results from the two sources

In [22]:
# Combine both sources per company. `+` builds a NEW list for every key, so the
# lists stored in google_news are left untouched; the original `+=` extended
# them in place, which duplicated the Globenewswire entries each time the cell
# was re-run. `.get` tolerates a company that is missing from globe_news.
all_news = {
    key: google_news[key] + globe_news.get(key, [])
    for key in google_news
}

In [23]:
# Print each company, the articles retrieved for it, and a separator line.
for company, articles in all_news.items():
    lines = ["Company name: {}".format(company)]
    lines.extend("\tRetrieved result:\t{}".format(article) for article in articles)
    lines.append("------------------------------------\n")
    print("\n".join(lines))

Company name: Altus Midstream Company ALTMW
	Retrieved result:	{'title': 'Form PREM14A Kayne Anderson Acquisiti For: Aug 27 - StreetInsider.com', 'pubdate': 'Mon, 27 Aug 2018 07:00:00 GMT', 'link': 'https://www.streetinsider.com/SEC+Filings/Form+PREM14A+Kayne+Anderson+Acquisiti+For%3A+Aug+27/14551530.html', 'abstract': 'Filed by the Registrant ☒ Filed by a Party other than the Registrant ☐. Check the appropriate box: ☒, Preliminary Proxy Statement. ☐, Confidential, for Use of the ...'}
	Retrieved result:	{'title': 'Altus Midstream Company Acquires 15 Percent of Gulf Coast Express Pipeline', 'pubdate': 'December 19, 2018', 'link': 'https://globenewswire.com//news-release/2018/12/19/1669403/0/en/Altus-Midstream-Company-Acquires-15-Percent-of-Gulf-Coast-Express-Pipeline.html', 'abstract': 'HOUSTON, Dec.  19, 2018  (GLOBE NEWSWIRE) -- Altus Midstream Company (Nasdaq: ALTM, ALTMW) today announced its subsidiary, Altus Midstream LP, has exercised and closed its option with Kinder Morgan ...'

## Save to a JSON file

In [31]:
# Container for the export: a single "news" key holding a list of entries.
formatted_results = {"news": []}

In [33]:
# Rebuild the "news" list from scratch on every run. Appending to the existing
# list (as before) added a duplicate set of entries each time the cell was
# re-executed.
formatted_results["news"] = [
    {"company_name": company, "company_news": articles}
    for company, articles in all_news.items()
]

In [36]:
# Write the collected results to result.json in the current working directory.
# `json` is already imported in the notebook's first cell, so the redundant
# mid-notebook import has been dropped.
with open('result.json', 'w') as fp:
    json.dump(formatted_results, fp)