In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [2]:
import xml.etree.ElementTree as ET

In [3]:
from dateutil import parser

In [25]:
import csv

In [5]:
class Article:
    
    def __init__(self, company, timestamp, headline, article_text):
        self.company = company
        self.timestamp = parser.parse(timestamp)
        self.headline = headline
        self.article_text = article_text
    
    def __repr__(self):
        return self.company + "; " + str(self.timestamp) + "; " + self.headline
    
    def __eq__(self, other):
        if isinstance(other, Article):
            return (self.company == other.company and self.timestamp == other.timestamp and
                self.headline == other.headline)
        return False
    
    def __lt__(self, other):
        """
        Comparison operator for sorting Articles by timestamp. 
        """
        return self.timestamp < other.timestamp
    
    def __hash__(self):
        return hash(self.__repr__())
    
    def elapsed_time_between(self, other):
        """
        Returns the amount of time, in seconds, between the publishing 
        of this Article and another Article, other. 
        """
        elapsed_time = self.timestamp - other.timestamp
        return abs(elapsed_time.total_seconds())

~~Parsing works, but is slow for the large file. Need to implement filtering out articles that are older than 72 hours whenever a new article is added, and also put some stuff in functions.~~

In [57]:
def create_article_map(filename, output_csv_name, k_hours=72):
    """
    Main parsing function. Takes in a file with .nml extension
    and returns a dictionary with keys that correspond to company symbols,
    and values that are sets of Article objects whose articles are about
    that company. 
    
    Arguments:
        filename: .nml file to parse
        output_csv_name: Name of .csv file to output with articles and similarity
            information. 
        k_hours: Argument passed in to filter_old_articles(). Determines
            article filtering. Default is 72, so articles mapped to any
            given company which are at least 72 hours older than the
            current article being parsed will be filtered from the map.
    
    Returns:
        company_article_map: Map from companies to sets of Articles about
            those companies. The set for each company will contain articles that
            were published within k_hours hours of each other. 
    """
    company_article_map = {}
    curr_file_str = ""
    header_df = pd.DataFrame(columns=["company", "headline", "time", 
                                     "old_score", "is_reprint", "is_recombination"])
    header_df.to_csv(output_csv_name, index = False)
    

    with open(filename) as myfile:
        for next_line in myfile:
            curr_file_str += next_line
            if next_line == "</doc>\n":

                xml_elem = ET.fromstring(curr_file_str)
                company = xml_elem.find(".//djn-company-sig")
                if company is None:
                    curr_file_str = ""
                    continue
                if company[0].attrib.get('about', False) != 'Y':
                    curr_file_str = ""
                    continue
                company = company[0].text
                timestamp = xml_elem.find(".//djn-mdata").attrib['display-date']
                headline = xml_elem.find(".//headline").text
                all_text = xml_elem.find(".//text")
                article_text = "".join(all_text.itertext())

                new_article = Article(company, timestamp, headline, article_text)
                company_articles = filter_old_articles(company_article_map, new_article, k_hours)
                
                if len(company_articles) == 0:
                    company_articles.add(new_article)
                    company_article_map[company] = company_articles
                    curr_file_str = ""
                    continue
                else:
                    old, closest_neighbor = old_and_closest_neighbor_score(new_article, company_articles)
                    new_row = [new_article.company, new_article.headline, new_article.timestamp,
                              old, is_reprint(old, closest_neighbor),
                              is_recombination(old, closest_neighbor)]
                    with open(output_csv_name, "a") as f:
                        csv_writer = csv.writer(f)  
                        csv_writer.writerow(new_row)
                    company_articles.add(new_article)
                    company_article_map[company] = company_articles
                    
                curr_file_str = ""
                num_articles_read += 1

    return company_article_map

In [58]:
full_sample = create_article_map("2001_sample_10M.nml", "bow_similarity.csv")

Just parsed article 2000
Just parsed article 4000
Just parsed article 6000
Just parsed article 8000
Just parsed article 10000
Just parsed article 12000
Just parsed article 14000
Just parsed article 16000
Just parsed article 18000
Just parsed article 20000
Just parsed article 22000
Just parsed article 24000
Just parsed article 26000
Just parsed article 28000
Just parsed article 30000
Just parsed article 32000
Just parsed article 34000
Just parsed article 36000
Just parsed article 38000
Just parsed article 40000
Just parsed article 42000
Just parsed article 44000
Just parsed article 46000
Just parsed article 48000
Just parsed article 50000
Just parsed article 52000
Just parsed article 54000
Just parsed article 56000


In [10]:
sorted(full_sample['AAPL'], reverse=True)

[AAPL; 2001-01-26 13:00:00+00:00; 
 SMARTMONEY.COM: Strategies For Survival In PCs,
 AAPL; 2001-01-26 01:08:00+00:00; 
 SMARTMONEY.COM: Strategies For Survival In PCs,
 AAPL; 2001-01-25 04:49:00+00:00; 
 WSJ(1/25): Latest Apple Laptop Offers A New Look And Feel]

In [49]:
def filter_old_articles(company_article_map, curr_article, k_hours=72):
    """
    Check the set of articles mapped to curr_article.company, and 
    filter out articles that are at least k_hours older than curr_article. 
    
    Arguments:
        curr_article: An Article object, used to compare time stamps with all other
            articles in company_article_map[curr_article.company]. 
        k_hours: An int, default 72. All articles in company_article_map[curr_article.company] 
            that have at least k_hours elapsed time between that article and 
            curr_article.timestamp will be filtered from the map.
            
    Returns:
        company_article_set: A new set to map to curr_article.company, with 
            old articles filtered out, with "old" specified as k_hours.
    """
    
    k_seconds = k_hours * 60 * 60
    curr_company = curr_article.company
    curr_timestamp = curr_article.timestamp
    
    company_article_set = company_article_map.get(curr_company, False)
    
    if company_article_set == False:
        return set()
    
    articles_to_remove = {article for article in company_article_set 
                         if curr_article.elapsed_time_between(article) >= k_seconds}
    company_article_set.difference_update(articles_to_remove)
    
    return company_article_set  

In [31]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [32]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

In [43]:
def stem_and_filter(article):
    """
    Takes an Article object and tokenizes the article text into a list, then
    removes stop words from the list and stems all the remaining words. Returns
    a set of stemmed words in the article.
    """
    text = article.article_text
    tokenized = set(word_tokenize(text))
    tokenized.difference_update(stop_words) # Remove stop words from tokenized text
    stemmed = {ps.stem(w) for w in tokenized}
    return stemmed

In [44]:
def bow_similarity_score(s1, s2):
    """
    Returns the bag-of-words similarity score between an article s1 and article s2.
    Specifically, measures percentage of words in s1 that are also in s2. 
    s1, s2 must be sets representing tokenized and stemmed articles.
    """
    return len(s1.intersection(s2)) / len(s1)

In [45]:
# company_article_map = full_sample
def old_and_closest_neighbor_score(curr_article, article_set, num_closest=5):
    """
    Calculates Old(s) and ClosestNeighbor(s), where s is curr_article. 
    
    Arguments:
        curr_article: An Article object for which to calculate the scores
        article_set: A set of Article objects with the same company as curr_article.company
        num_closest: The number of closest (by similarity score) stories to look at 
            to calculate Old(s)
    
    Returns:
        old_score: Old(s)
        closest_neighbor_score: ClosestNeighbor(s)
    """
    
    curr_article_stemmed = stem_and_filter(curr_article)
    stemmed_articles = list(map(stem_and_filter, article_set))
    sim_scores = [bow_similarity_score(curr_article_stemmed, s) for s in stemmed_articles]
    
    closest_articles_indices = np.argsort(sim_scores)[::-1][:num_closest]
    closest_articles = [stemmed_articles[i] for i in closest_articles_indices]
    closest_articles_union = set().union(*closest_articles)
    intersect_with_closest_n = curr_article_stemmed.intersection(closest_articles_union)
    
    old_score = len(intersect_with_closest_n) / len(curr_article_stemmed)
    closest_neighbor_score = bow_similarity_score(curr_article_stemmed, closest_articles[0])
    
    return old_score, closest_neighbor_score

In [46]:
def is_old_news(old):
    return old > 0.6

In [47]:
def is_reprint(old, closest_neighbor):
    reprint = (closest_neighbor / old) >= 0.8
    return (old > 0.6) * reprint

In [48]:
def is_recombination(old, closest_neighbor):
    reprint = (closest_neighbor / old) < 0.8
    return (old > 0.6) * reprint

In [32]:
# pd.DataFrame(columns=['temp1', 'temp2']).to_csv("tmptest.csv")

In [24]:
company_article_map = full_sample
iterations = 0
rows_list = []
for company in first_20_pairs:
    article_set = company_article_map[company]
    for article in article_set:
        row_dict = {}
        if len(articles_older_than(article, article_set)) > 0:
            row_dict['company'] = company
            row_dict['time'] = article.timestamp
            row_dict['headline'] = article.headline
            old, closest_neighbor = old_and_closest_neighbor_score(article, article_set)
            row_dict['old_score'] = old
            row_dict['is_reprint'] = precomputed_is_reprint(old, closest_neighbor)
            row_dict['is_recombination'] = precomputed_is_recombination(old, closest_neighbor)
        else:
            continue
        rows_list.append(row_dict)
    iterations += 1

rows_list

[{'company': 'F.BNP',
  'headline': '\nBusiness News From The Greek Press Thursday',
  'is_recombination': 0,
  'is_reprint': 0,
  'old_score': 0.23529411764705882,
  'time': datetime.datetime(2001, 1, 25, 9, 49, tzinfo=tzutc())},
 {'company': 'F.BNP',
  'headline': "\nFirstMark's French Unit Sees Breakeven, EUR200M Rev In 04",
  'is_recombination': 0,
  'is_reprint': 0,
  'old_score': 0.24489795918367346,
  'time': datetime.datetime(2001, 1, 25, 12, 58, tzinfo=tzutc())},
 {'company': 'F.BNP',
  'headline': '\nCarrefour/Klepierre -2: Debt Financing For Purchase>F.CAR',
  'is_recombination': 0,
  'is_reprint': 0,
  'old_score': 0.3611111111111111,
  'time': datetime.datetime(2001, 1, 25, 16, 43, tzinfo=tzutc())},
 {'company': 'MTA',
  'headline': '\nBusiness News From The Hungarian Press Thursday',
  'is_recombination': 0,
  'is_reprint': 0,
  'old_score': 0.29464285714285715,
  'time': datetime.datetime(2001, 1, 25, 9, 15, tzinfo=tzutc())},
 {'company': 'C.BZQ',
  'headline': '\nIsrael

In [59]:
articles_df = pd.read_csv("bow_similarity.csv")
articles_df

Unnamed: 0,company,headline,time,old_score,is_reprint,is_recombination
0,IBP,\nTyson Deal Is $30 A Share In Cash And Stock ...,2001-01-01 18:40:18+00:00,0.875000,1,0
1,IBP,\nTyson-IBP Deal To Be Announced Later Today -...,2001-01-01 18:40:42+00:00,1.000000,1,0
2,IBP,\nIBP OKs Tyson Bid Over Smithfield All-stk Of...,2001-01-01 18:41:55+00:00,0.875000,1,0
3,IBP,\nTyson/IBP -2: Tyson To Assume $1.4 Bln In IB...,2001-01-01 18:54:16+00:00,0.034161,0,0
4,IBP,\nREPEAT:Tyson/IBP -2: Tyson To Assume $1.4 Bl...,2001-01-01 19:13:04+00:00,0.949555,1,0
5,IBP,\nTyson Foods Announces Deal To Acquire IBP,2001-01-01 20:40:45+00:00,0.875000,1,0
6,IBP,\nTyson To Pay $4.7 Billion To Acquire IBP,2001-01-01 20:41:31+00:00,0.875000,1,0
7,IBP,\nTyson To Pay $30 For Each IBP Share>IBP TSN,2001-01-01 20:42:03+00:00,0.875000,1,0
8,IBP,"\nTyson/IBP Deal Is 50.1% Cash, Remainder In S...",2001-01-01 20:42:46+00:00,1.000000,1,0
9,IBP,\nTyson To Assume $1.5 Bln In IBP Debt And Obl...,2001-01-01 20:44:28+00:00,0.875000,1,0


In [26]:
articles_df.to_csv("test_articles_similarity.csv")

In [22]:
first_20_pairs = {k: full_sample[k] for k in list(full_sample)[:20]}
first_20_pairs

{'A.WPL': {A.WPL; 2001-01-22 05:09:31+00:00; 
  Alcoa Australia Plans Statement Mon On Wagerup Expansion,
  A.WPL; 2001-01-22 06:27:12+00:00; 
  Shell Extends Bid For Australia's Woodside Until Feb 28,
  A.WPL; 2001-01-22 06:40:52+00:00; 
  Shell/Woodside Extends Bid -2: Follows Government Delay,
  A.WPL; 2001-01-22 06:47:47+00:00; 
  Shell/Woodside Extends Bid -3: Was Due To Close On Feb 2,
  A.WPL; 2001-01-23 03:56:23+00:00; 
  Alcoa Australia: No Comment On Wagerup Refinery Forecast,
  A.WPL; 2001-01-24 01:42:48+00:00; 
  Costello:Will Approve Shell Bid If In National Interest,
  A.WPL; 2001-01-24 01:51:36+00:00; 
  Costello/Woodside Offer -2: Conditions Sometimes Imposed,
  A.WPL; 2001-01-24 01:57:48+00:00; 
  MARKET TALK/AU-EQ: Costello Mulls Shell National Interest},
 'AOL': {AOL; 2001-01-26 13:16:00+00:00; 
  New FCC Chief Of Staff Served Briefly As Disney Lobbyist,
  AOL; 2001-01-26 15:16:00+00:00; 
  AOL Time Warner (AOL) Block: 300,000 Shrs At 54.00,
  AOL; 2001-01-26 15:36:0

In [249]:
aapl_stories = sorted(full_sample['GS'], reverse=True)
[is_reprint(aapl_stories[i], aapl_stories) for i in range(len(aapl_stories)) 
    if len(articles_older_than(aapl_stories[i], aapl_stories)) > 0]

[1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0]

In [237]:
aapl_stories[0] > aapl_stories[1]

True

In [199]:
print(amazon_stories[0].article_text, "\n")
print(amazon_stories[1].article_text)
stf_az1 = stem_and_filter(amazon_stories[0])
stf_az2 = stem_and_filter(amazon_stories[1])
# print(stf_az1)
# print(stf_az2)
print(bow_similarity_score(stf_az1, stf_az2))
print(len(set(stf_az1).intersection(stf_az2)) / len(set(stf_az1)))
print(len(set(stf_az1)))



 
 
   By Christiane Bird 
   Of DOW JONES NEWSWIRES 
 

  NEW YORK (Dow Jones)--Most mutual fund categories posted modest gains in the last week as blue chip stocks edged up, consumer cyclicals advanced and investors digested fourth-quarter earnings reports. 

  The major exceptions were telecommunications funds and science and technology funds, down 4.33% and 1.55% respectively, in the week ended Thursday, according to Lipper Inc. 

  Technology and telecommunications stocks had been market standouts so far this year, but they retreated on Thursday. JDS Uniphase Corp. (JDSU) dipped more than 12%, to $55.19, at the market's close Thursday, while Amazon.com Inc. (AMZN), Broadvision Inc. (BVSN), and Yahoo! Inc. (YHOO) all also lost ground. 

  Among U.S. diversified funds, value funds gained the most in the last week. Large-cap value funds were up 1.01%, while mid-cap value funds returned 1.97% and small-cap value funds rose 1.19%. 

  In comparison, large-cap growth funds gained 0.08

In [114]:
A = ['a', 'c', 'd']
B = ['c', 'd', 2]
C = [1, 2, 3]
set().union(*[A, B, C])

{1, 2, 3, 'd', 'a', 'c'}

In [104]:
newest_amzn_text = sorted(full_sample['AMZN'], reverse=True)[0].article_text
stop_words = set(stopwords.words('english'))
tokenized_article = word_tokenize(newest_amzn_text)
tokenized_filtered = [w for w in tokenized_article if w not in stop_words]
ps = PorterStemmer()
for w in tokenized_filtered:
    print(ps.stem(w))

By
christian
bird
Of
dow
jone
newswir
new
york
(
dow
jone
)
--
most
mutual
fund
categori
post
modest
gain
last
week
blue
chip
stock
edg
,
consum
cyclic
advanc
investor
digest
fourth-quart
earn
report
.
the
major
except
telecommun
fund
scienc
technolog
fund
,
4.33
%
1.55
%
respect
,
week
end
thursday
,
accord
lipper
inc.
technolog
telecommun
stock
market
standout
far
year
,
retreat
thursday
.
jd
uniphas
corp.
(
jdsu
)
dip
12
%
,
$
55.19
,
market
's
close
thursday
,
amazon.com
inc.
(
amzn
)
,
broadvis
inc.
(
bvsn
)
,
yahoo
!
inc.
(
yhoo
)
also
lost
ground
.
among
u.s.
diversifi
fund
,
valu
fund
gain
last
week
.
large-cap
valu
fund
1.01
%
,
mid-cap
valu
fund
return
1.97
%
small-cap
valu
fund
rose
1.19
%
.
In
comparison
,
large-cap
growth
fund
gain
0.08
%
,
mid-cap
growth
fund
rose
1.34
%
,
small-cap
growth
fund
1.06
%
.
large-cap
,
mid-cap
small-cap
core
fund
increas
0.67
%
,
1.7
%
1.36
%
respect
.
As
whole
,
5,105
u.s.
diversifi
fund
follow
lipper
rose
averag
0.93
%
week
.
dure
period
,
