In [19]:
import pandas as pd
import nltk
from collections import Counter
from nltk.text import Text
import nltk.corpus
import sys
import spacy
from spacy import displacy

In [151]:
!pip install -U spacy

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/53/66/facc29889e0be6cceb64cbb9d4dff45a3defee79b333d41c8a2597eb6b5e/spacy-2.3.2-cp37-cp37m-macosx_10_9_x86_64.whl (10.0MB)
[K     |████████████████████████████████| 10.0MB 2.2MB/s eta 0:00:01
[?25hCollecting thinc==7.4.1 (from spacy)
[?25l  Downloading https://files.pythonhosted.org/packages/f8/3f/9cee434ca42cd7902c1369038daf6c78e06bc101750aa75c6eed1a7bdf03/thinc-7.4.1-cp37-cp37m-macosx_10_9_x86_64.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 6.8MB/s eta 0:00:01
[?25hCollecting preshed<3.1.0,>=3.0.2 (from spacy)
[?25l  Downloading https://files.pythonhosted.org/packages/c7/3e/9aaba1f8c0cb69e57ebeb411cc1b65b3f6bfc3572dd68969a6d3e59288f6/preshed-3.0.2-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (211kB)
[K     |████████████████████████████████| 215kB 8.5MB/s eta 0:00:01
[?25hCollecting plac<1.2.0,>=0.9.6 (from spacy)
  Download

In [20]:
# Load the article dataframe from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load 'news_cat.pkl' if it comes from a trusted source.
news_cat_df = pd.read_pickle('news_cat.pkl')

In [21]:
# Preview the first rows to see which columns are available.
news_cat_df.head()

Unnamed: 0,crawled,language,text,title
0,2018-01-30T23:03:51.004+02:00,english,by Abhishek K Global Telehandler Market 2023 D...,Global Telehandler Market 2023 Demand by Segme...
1,2018-01-30T23:06:46.024+02:00,english,favorite this post 2014 Caterpillar 314E LCR h...,2014 Caterpillar 314E LCR
2,2018-01-30T23:18:35.023+02:00,english,By: MAX NISEN The Amazon health care threat ha...,"Amazon, Berkshire, JPMorgan health announcemen..."
3,2018-01-30T23:20:54.012+02:00,english,QR Code Link to This Post MONTHLY PUBLIC AUCTI...,2005 Caterpillar CB534D Tandem Vibratory Rolle...
4,2018-01-30T23:28:30.000+02:00,english,QR Code Link to This Post 2007 CATERPILLAR D4G...,2007 CATERPILLAR D4G LGP CAB SCREEN/SWEEPS - O...


In [22]:
# (rows, columns) of the loaded dataframe.
news_cat_df.shape

(100, 4)

In [23]:
# Count the rows per value of the 'language' column.
news_cat_df['language'].value_counts()

english    100
Name: language, dtype: int64

In [24]:
# Keep only the rows whose 'language' value is exactly 'english'.
is_english = news_cat_df['language'].eq('english')
news_cat_df = news_cat_df[is_english]

In [115]:
# (rows, columns) of news_cat_df at this point in the notebook.
news_cat_df.shape

(100, 4)

In [25]:
# Identify which companies are mentioned most frequently along with Caterpillar:
# first keep only the articles that mention Caterpillar in the text or the title.
#
# The previous single expression mixed `|` and `&` without grouping. Because `&`
# binds tighter than `|`, the `&` only joined "text contains 'Cat'" with
# "title contains 'Caterpillar'" instead of the two groups of tests. The
# grouping is explicit here.
# NOTE(review): the two groups are combined with `|`, which keeps every row the
# previous expression kept. If an article must mention Caterpillar in BOTH the
# text and the title, change `|` to `&` on the last line.
cat_pattern = 'Caterpillar|CATERPILLAR|CAT|Cat'  # same four case-sensitive spellings as before

# na=False: a missing text/title counts as "no mention" instead of yielding NaN
# in the boolean mask.
text_mentions_cat = news_cat_df['text'].str.contains(cat_pattern, na=False)
title_mentions_cat = news_cat_df['title'].str.contains(cat_pattern, na=False)

news_cat_df = news_cat_df[text_mentions_cat | title_mentions_cat]

In [26]:
# Preview the first rows of news_cat_df after the Caterpillar filter.
news_cat_df.head()

Unnamed: 0,crawled,language,text,title
0,2018-01-30T23:03:51.004+02:00,english,by Abhishek K Global Telehandler Market 2023 D...,Global Telehandler Market 2023 Demand by Segme...
1,2018-01-30T23:06:46.024+02:00,english,favorite this post 2014 Caterpillar 314E LCR h...,2014 Caterpillar 314E LCR
3,2018-01-30T23:20:54.012+02:00,english,QR Code Link to This Post MONTHLY PUBLIC AUCTI...,2005 Caterpillar CB534D Tandem Vibratory Rolle...
4,2018-01-30T23:28:30.000+02:00,english,QR Code Link to This Post 2007 CATERPILLAR D4G...,2007 CATERPILLAR D4G LGP CAB SCREEN/SWEEPS - O...
5,2018-01-30T23:33:58.023+02:00,english,Elite Wealth Management Inc. Acquires Shares o...,Elite Wealth Management Inc. Acquires Shares o...


# Entity and label creation

In [27]:
def create_entities_labels(x):
    """Extract ORGANIZATION entities from a piece of text and count them.

    The text is split into sentences with NLTK; each sentence is word-tokenized,
    POS-tagged and passed through ``nltk.ne_chunk`` (``binary=False``). Only
    chunks labelled ``'ORGANIZATION'`` are kept; the tokens of a multi-token
    chunk are joined with single spaces.

    Parameters
    ----------
    x : str
        Text to analyse (an article body or a title). Missing values
        (``None``/``NaN``), other non-string values and blank strings are
        treated as containing no entities.

    Returns
    -------
    pandas.DataFrame
        Columns ``entity``, ``label`` and ``Count``: one row per distinct
        entity, with ``Count`` the number of times it was found in ``x``.
        The frame is empty (same three columns) when nothing was found.
    """
    # Guard against missing or blank input: the NLTK tokenizers expect a string,
    # so a NaN title/text would otherwise raise instead of yielding "no entities".
    if not isinstance(x, str) or not x.strip():
        return pd.DataFrame({'entity': [], 'label': [], 'Count': []})

    entities = []
    for sentence in nltk.sent_tokenize(x):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        # ne_chunk yields plain (token, tag) tuples for ordinary words and
        # subtrees (objects with a .label()) for recognised entities.
        for chunk in nltk.ne_chunk(tagged_tokens, binary=False):
            if hasattr(chunk, 'label') and chunk.label() == 'ORGANIZATION':
                # Join the tokens of a multi-token entity with spaces.
                entities.append(' '.join(c[0] for c in chunk))

    if not entities:
        return pd.DataFrame({'entity': [], 'label': [], 'Count': []})

    # Every kept chunk carries the ORGANIZATION label, so the label column is constant.
    entities_df = pd.DataFrame({'entity': entities, 'label': 'ORGANIZATION'})
    # Collapse repeated mentions into one row per entity with its count.
    return entities_df.groupby(['entity']).label.value_counts().reset_index(name='Count')


# Aggregating the labels

In [28]:
def aggregate_labels(item):
    """Collapse an entity/label/Count frame to one row per entity.

    For each entity the label of its highest-``Count`` row is kept and the
    ``Count`` values of all its rows are summed.

    Parameters
    ----------
    item : pandas.DataFrame
        Frame with ``entity``, ``label`` and ``Count`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``entity``, ``label`` and ``Count``, one row per entity.
    """
    def _top_label(labels):
        # Rows arrive sorted by Count (descending), so the first label is the
        # one attached to the largest count.
        return labels.iloc[0]

    ranked = item.sort_values(by='Count', ascending=False)
    per_entity = ranked.groupby('entity').agg({'label': _top_label, 'Count': 'sum'})
    return per_entity.reset_index()

Each row of the dataframe loaded from the pickle file represents one article (the "crawled" column holds a timestamp for it). We now split each article's text and title into sentences, tokenize and POS-tag every sentence, and run NLTK's named-entity chunker over the result. The recognized entities and their labels can then be collected, and we count how many times each entity is mentioned in each article.

In [29]:
# One aggregated entity table per article, collected separately for the
# article bodies and for the titles.
total_labels_text = []
total_labels_title = []

for article_text, article_title in zip(news_cat_df['text'], news_cat_df['title']):
    for raw_value, collected in ((article_text, total_labels_text),
                                 (article_title, total_labels_title)):
        per_article = aggregate_labels(create_entities_labels(raw_value))
        # Each entity found here appeared in exactly one article/title; summing
        # this column later gives the number of articles mentioning the entity.
        per_article['no_of_articles'] = 1
        collected.append(per_article)

# Stack the per-article tables into one long table per source.
text_concat = pd.concat(total_labels_text)
title_concat = pd.concat(total_labels_title)

In [30]:
# Sum mentions and article counts per (entity, label), separately for article
# bodies and titles, then join the two summaries side by side.
group_keys = ['entity', 'label']
totals = {'Count': 'sum', 'no_of_articles': 'sum'}

text_summary = (
    text_concat.groupby(group_keys)
    .agg(totals)
    .reset_index()
    .rename(columns={'Count': 'mentions_articles'})
)

title_summary = (
    title_concat.groupby(group_keys)
    .agg(totals)
    .reset_index()
    .rename(columns={'Count': 'mentions_titles', 'no_of_articles': 'no_of_titles'})
)

# Outer join keeps entities seen in only one of the two sources; the counts
# missing on the other side are filled with 0.
results_df = pd.merge(text_summary, title_summary, how='outer', on=group_keys).fillna(0)

In [31]:
# Overall mention count = mentions in article bodies + mentions in titles.
results_df['total_no_mentions'] = results_df['mentions_articles'] + results_df['mentions_titles']

# Put the most-mentioned entities first.
results_df = results_df.sort_values(by='total_no_mentions', ascending=False)

In [32]:
# Show the 60 entities with the highest total number of mentions.
results_df.head(60)

Unnamed: 0,entity,label,mentions_articles,no_of_articles,mentions_titles,no_of_titles,total_no_mentions
149,Caterpillar Inc.,ORGANIZATION,83.0,23.0,13.0,13.0,96.0
144,Caterpillar,ORGANIZATION,87.0,33.0,0.0,0.0,87.0
465,NYSE,ORGANIZATION,59.0,22.0,4.0,4.0,63.0
97,CAT,ORGANIZATION,43.0,25.0,12.0,12.0,55.0
142,Cat,ORGANIZATION,36.0,17.0,0.0,0.0,36.0
166,Company,ORGANIZATION,27.0,17.0,0.0,0.0,27.0
595,SEC,ORGANIZATION,23.0,14.0,0.0,0.0,23.0
359,JPMorgan,ORGANIZATION,21.0,12.0,0.0,0.0,21.0
666,Transportation,ORGANIZATION,20.0,18.0,0.0,0.0,20.0
246,Exchange Commission,ORGANIZATION,20.0,14.0,0.0,0.0,20.0


In [33]:
# Remove Caterpillar's own name variants so that only the other entities remain.
caterpillar_aliases = ['Caterpillar', 'Caterpillar Inc.', 'CAT', 'Cat']
is_caterpillar = results_df['entity'].isin(caterpillar_aliases)
results_df = results_df[~is_caterpillar]
results_df.head(10)

Unnamed: 0,entity,label,mentions_articles,no_of_articles,mentions_titles,no_of_titles,total_no_mentions
465,NYSE,ORGANIZATION,59.0,22.0,4.0,4.0,63.0
166,Company,ORGANIZATION,27.0,17.0,0.0,0.0,27.0
595,SEC,ORGANIZATION,23.0,14.0,0.0,0.0,23.0
359,JPMorgan,ORGANIZATION,21.0,12.0,0.0,0.0,21.0
666,Transportation,ORGANIZATION,20.0,18.0,0.0,0.0,20.0
246,Exchange Commission,ORGANIZATION,20.0,14.0,0.0,0.0,20.0
579,Resource Industries,ORGANIZATION,20.0,19.0,0.0,0.0,20.0
615,Securities,ORGANIZATION,20.0,14.0,0.0,0.0,20.0
174,Construction Industries,ORGANIZATION,19.0,18.0,0.0,0.0,19.0
273,Financial Products,ORGANIZATION,19.0,18.0,0.0,0.0,19.0


In [34]:
# Drop any rows labelled as geo-political entities.
results_df = results_df[results_df['label'] != 'GPE']

# Entities excluded by hand (presumably judged not to be company names).
excluded_entities = [
    'Transportation', 'SEC', 'Securities', 'Stock', 'Exchange Commission',
    'Energy', 'Resource Industries', 'Company', 'Financial Products', 'Construction Industries',
    'News', 'NOT', 'LLC', 'Partners', 'Thomas', 'Investment', 'Construction', 'Bank',
]
results_df = results_df[~results_df['entity'].isin(excluded_entities)]
results_df.head(20)

Unnamed: 0,entity,label,mentions_articles,no_of_articles,mentions_titles,no_of_titles,total_no_mentions
465,NYSE,ORGANIZATION,59.0,22.0,4.0,4.0,63.0
359,JPMorgan,ORGANIZATION,21.0,12.0,0.0,0.0,21.0
393,Lincolnian Online,ORGANIZATION,12.0,6.0,0.0,0.0,12.0
706,Vista Partners,ORGANIZATION,11.0,3.0,0.0,0.0,11.0
566,Ratings,ORGANIZATION,11.0,11.0,0.0,0.0,11.0
443,Motley Fool,ORGANIZATION,10.0,5.0,0.0,0.0,10.0
208,Dow,ORGANIZATION,9.0,3.0,0.0,0.0,9.0
106,CFO Bradley,ORGANIZATION,8.0,8.0,0.0,0.0,8.0
694,VIOLATION,ORGANIZATION,8.0,8.0,0.0,0.0,8.0
747,GENERATORS,ORGANIZATION,0.0,0.0,8.0,8.0,8.0


In [35]:
# Keep only the rows whose label is ORGANIZATION.
is_organization = results_df['label'].eq('ORGANIZATION')
results_df = results_df[is_organization]

In [36]:
# Show the top 20 remaining entities.
results_df.head(20)

Unnamed: 0,entity,label,mentions_articles,no_of_articles,mentions_titles,no_of_titles,total_no_mentions
465,NYSE,ORGANIZATION,59.0,22.0,4.0,4.0,63.0
359,JPMorgan,ORGANIZATION,21.0,12.0,0.0,0.0,21.0
393,Lincolnian Online,ORGANIZATION,12.0,6.0,0.0,0.0,12.0
706,Vista Partners,ORGANIZATION,11.0,3.0,0.0,0.0,11.0
566,Ratings,ORGANIZATION,11.0,11.0,0.0,0.0,11.0
443,Motley Fool,ORGANIZATION,10.0,5.0,0.0,0.0,10.0
208,Dow,ORGANIZATION,9.0,3.0,0.0,0.0,9.0
106,CFO Bradley,ORGANIZATION,8.0,8.0,0.0,0.0,8.0
694,VIOLATION,ORGANIZATION,8.0,8.0,0.0,0.0,8.0
747,GENERATORS,ORGANIZATION,0.0,0.0,8.0,8.0,8.0


In [37]:
# Second round of hand-picked exclusions.
more_excluded_entities = ['Ratings', 'GENERATORS', 'NASDAQ', 'Wonderland', 'Dow', 'NYSE']
keep_row = ~results_df['entity'].isin(more_excluded_entities)
results_df = results_df[keep_row]
results_df.head(20)

Unnamed: 0,entity,label,mentions_articles,no_of_articles,mentions_titles,no_of_titles,total_no_mentions
359,JPMorgan,ORGANIZATION,21.0,12.0,0.0,0.0,21.0
393,Lincolnian Online,ORGANIZATION,12.0,6.0,0.0,0.0,12.0
706,Vista Partners,ORGANIZATION,11.0,3.0,0.0,0.0,11.0
443,Motley Fool,ORGANIZATION,10.0,5.0,0.0,0.0,10.0
106,CFO Bradley,ORGANIZATION,8.0,8.0,0.0,0.0,8.0
694,VIOLATION,ORGANIZATION,8.0,8.0,0.0,0.0,8.0
228,EPS,ORGANIZATION,8.0,8.0,0.0,0.0,8.0
253,FMR,ORGANIZATION,8.0,4.0,0.0,0.0,8.0
738,DIESEL,ORGANIZATION,0.0,0.0,8.0,8.0,8.0
137,Capital Group,ORGANIZATION,8.0,4.0,0.0,0.0,8.0


Entity label creation using NLTK didn't yield the results we expected or would consider ideal, but it is still much better than where we started.

I have also tried spaCy, but even that did not deliver good results.