In [3]:
import pandas as pd

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

In [4]:
news_path = 'https://storage.googleapis.com/msca-bdp-data-open/news/nlp_a_5_news.json'
news_df = pd.read_json(news_path, orient='records', lines=True)

print(f'Sample contains {news_df.shape[0]:,.0f} news articles')
news_df.head(2)

Sample contains 10,012 news articles


Unnamed: 0,url,date,language,title,text
0,http://kokomoperspective.com/obituaries/jon-w-horton/article_b6ba8e1e-cb9c-11eb-9868-fb11b88b9778.html,2021-06-13,en,Jon W. Horton | Obituaries | kokomoperspective.com,Jon W. Horton | Obituaries | kokomoperspective.comYou have permission to edit this article. EditCloseSign Up Log In Dashboard LogoutMy Account Dashboard Profile Saved items LogoutCOVID-19Click here for the latest local news on COVID-19HomeAbout UsContact UsNewsLocalOpinionPoliticsNationalStateAgricultureLifestylesEngagements/Anniversaries/WeddingsAutosEntertainmentHealthHomesOutdoorsSportsNFLNCAAVitalsObituariesAutomotivee-EditionCouponsGalleries74°...
1,https://auto.economictimes.indiatimes.com/news/auto-components/birla-precision-to-ramp-up-capacity-to-tap-emerging-opportunities-in-india/81254902,2021-02-28,en,"Birla Precision to ramp up capacity to tap emerging opportunities in India, Auto News, ET Auto","Birla Precision to ramp up capacity to tap emerging opportunities in India, Auto News, ET Auto We have updated our terms and conditions and privacy policy Click ""Continue"" to accept and continue with ET AutoAccept the updated privacy & cookie policyDear user, ET Auto privacy and cookie policy has been updated to align with the new data regulations in European Union. Please review and accept these changes below to continue using the website.You can see our privacy policy & our cookie ..."


In [5]:
### Read Tweets ##
tweets_path = 'https://storage.googleapis.com/msca-bdp-data-open/tweets/nlp_a_5_tweets.json'
tweets_df = pd.read_json(tweets_path, orient='records', lines=True)
print(f'Sample contains {tweets_df.shape[0]:,.0f} tweets')
tweets_df.head(2)

Sample contains 10,105 tweets


Unnamed: 0,id,lang,date,name,retweeted,text
0,1534565117614084096,en,2022-06-08,Low Orbit Tourist 🌍📷,,"Body &amp; Assembly - Halewood - United Kingdom\n🌍53.3504,-2.8352296,402m\n\nHalewood Body &amp; Assembly is a Jaguar Land Rover factory in Halewood, England, and forms the major part of the Halewood complex which is shared with Ford who manufacture transmissions at the site. [Wikipedia] https://t.co/LPmCnZIaVt"
1,1534565743429394439,en,2022-06-08,CompleteCar.ie,RT,"Land Rover Ireland has announced that the new Range Rover Sport starts at €114,150, now on @completecar:\n\nhttps://t.co/TjGUkL3FYr https://t.co/QdVaEiJkjO"


In [6]:
##### Discarding Non English Results and keeping only relevant columns ####
def discard_en(df,col):
    df = df[df[col]=="en"]


discard_en(news_df,"language")
discard_en(tweets_df,"lang")

In [7]:
news_df = news_df[["title","text"]]
tweets_df = tweets_df[["text"]]

In [8]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
import multiprocessing
num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

Available CPUs: 2


In [10]:
import re
def cleaning(df,col):

    df[col] = df[col].str.strip()
    df[col] = df[col].apply(lambda x: re.sub(r'\||\n|(@\w+.*?)|(http\w\S+.*?)|(#\w+)',' ',x))

In [11]:
cleaning(news_df,"title")
cleaning(news_df,"text")
cleaning(tweets_df,"text")

In [14]:
#### Finding top company name using NER-NTLK #####

##### Without Sentence Segmentation #####

def ner_nltk(df,col):
    ORG=[]
    for text in df[col]:
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
            if hasattr(chunk,"label") and chunk.label()== 'ORGANIZATION':
                ORG.extend([c for c in chunk])

    org_counts = {}
    for org in ORG:
        if org[0] in org_counts:
            org_counts[org[0]] += 1
        else:
            org_counts[org[0]] = 1

    sorted_org = sorted(org_counts.items(), key=lambda x: x[1], reverse=True)
    top_20_orgs = sorted_org[:20]

    return(top_20_orgs)

In [12]:
##### News Articles (Title) #####
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [15]:
ner_nltk(news_df,"title")

[('News', 675),
 ('Star', 292),
 ('Auto', 236),
 ('Online', 214),
 ('Mail', 213),
 ('Daily', 192),
 ('AutoSpies', 144),
 ('CoventryLive', 137),
 ('Business', 122),
 ('Automotive', 106),
 ('BMW', 95),
 ('Express', 94),
 ('NewsBreak', 92),
 ('Car', 92),
 ('Shropshire', 90),
 ('GMC', 87),
 ('ET', 77),
 ('UK', 77),
 ('Volkswagen', 74),
 ('Land', 66)]

In [16]:
#### News Articles (Text) ####
news_df_sample = news_df.sample(n=1000,random_state=420)

In [17]:
ner_nltk(news_df_sample,"text")

[('LA', 1357),
 ('NYC', 1286),
 ('News', 1078),
 ('Princess', 1012),
 ('MailOnline', 925),
 ('Prince', 912),
 ('Kate', 893),
 ('VERY', 732),
 ('Queen', 626),
 ('UK', 623),
 ('Duke', 613),
 ('Royal', 599),
 ('Awards', 598),
 ('Of', 531),
 ('House', 517),
 ('US', 494),
 ('COVID', 482),
 ('Land', 458),
 ('THE', 431),
 ('Duchess', 429)]

In [18]:
ner_nltk(tweets_df,"text")

[('Land', 3441),
 ('Rover', 2796),
 ('BMW', 394),
 ('Discovery', 393),
 ('Motors', 314),
 ('General', 300),
 ('Jaguar', 294),
 ('LAND', 261),
 ('eBay', 223),
 ('UK', 204),
 ('Duke', 192),
 ('Duchess', 171),
 ('SHAMELESS', 157),
 ('Defender', 146),
 ('SUV', 136),
 ('Range', 119),
 ('Services', 114),
 ('Health', 106),
 ('Invictus', 94),
 ('ROVER', 89)]

In [19]:
#######  With Sentence Segmentation ##########3
def sent_tokenizer(df,col):
    new_col_name = col + "_sent_tokens"
    df[new_col_name] = df[col].apply(lambda x : nltk.tokenize.sent_tokenize(x))


def ner_nltk_sent(df,col):
    ORG=[]
    for row in df[col]:
        for token in row:
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(token))):
                if hasattr(chunk,"label") and chunk.label()== 'ORGANIZATION':
                    ORG.extend([c for c in chunk])

    org_counts = {}
    for org in ORG:
        if org[0] in org_counts:
            org_counts[org[0]] += 1
        else:
            org_counts[org[0]] = 1

    sorted_org = sorted(org_counts.items(), key=lambda x: x[1], reverse=True)
    top_20_orgs = sorted_org[:20]

    return(top_20_orgs)

In [20]:
sent_tokenizer(news_df,"title")
sent_tokenizer(news_df,"text")
sent_tokenizer(news_df_sample,"text")
sent_tokenizer(tweets_df,"text")

In [21]:
ner_nltk_sent(news_df,"title_sent_tokens")

[('News', 675),
 ('Star', 289),
 ('Online', 238),
 ('Mail', 237),
 ('Auto', 236),
 ('Daily', 192),
 ('AutoSpies', 144),
 ('CoventryLive', 137),
 ('Business', 122),
 ('Automotive', 106),
 ('BMW', 96),
 ('Express', 94),
 ('NewsBreak', 92),
 ('Car', 92),
 ('Shropshire', 90),
 ('GMC', 87),
 ('ET', 77),
 ('UK', 77),
 ('Volkswagen', 74),
 ('Land', 66)]

In [22]:
ner_nltk_sent(news_df,"title_sent_tokens")

[('LA', 1356),
 ('NYC', 1286),
 ('News', 1084),
 ('Princess', 1019),
 ('MailOnline', 925),
 ('Prince', 913),
 ('Kate', 868),
 ('VERY', 730),
 ('Queen', 627),
 ('UK', 623),
 ('Duke', 620),
 ('Royal', 600),
 ('Awards', 598),
 ('Of', 531),
 ('House', 523),
 ('US', 491),
 ('COVID', 483),
 ('Land', 465),
 ('Duchess', 416),
 ('Philip', 415)]

In [23]:
ner_nltk_sent(tweets_df,"text_sent_tokens")

[('Land', 3448),
 ('Rover', 2846),
 ('Discovery', 394),
 ('BMW', 390),
 ('Motors', 314),
 ('Jaguar', 302),
 ('General', 300),
 ('LAND', 262),
 ('eBay', 223),
 ('UK', 204),
 ('Duke', 192),
 ('Duchess', 171),
 ('SHAMELESS', 157),
 ('Defender', 146),
 ('SUV', 136),
 ('Range', 118),
 ('Services', 114),
 ('Health', 106),
 ('Invictus', 94),
 ('ROVER', 90)]

Finding top company name using NER SpaCy

In [24]:
#####  Without Sentence Segmentation ####
def ner_spacy(df, col):
    entities = []
    labels = []
    for i in df[col]:
        doc = nlp(i)
        for ent in doc.ents:
            if ent.label_ == "ORG":
                entities.append(ent.text)
                labels.append(ent.label_)
    ent_df = pd.DataFrame({'Entities': entities, 'Labels': labels})
    ent_gpd = ent_df.groupby("Entities").count().sort_values(by="Labels", ascending=False).head(20)

    return ent_gpd

In [25]:
def ner_spacy_pipe(df, col):
    entities = []
    labels = []
    for doc in nlp.pipe(texts=df.loc[:, col], n_process=num_processors-5, batch_size=300):
        for ent in doc:
            if ent.ent_type_ == "ORG":
                entities.append(ent.text)
                labels.append(ent.ent_type_)
    ent_df = pd.DataFrame({'Entities': entities, 'Labels': labels})
    ent_gpd = ent_df.groupby("Entities").count().sort_values(by="Labels", ascending=False).head(20)

    return ent_gpd

In [28]:
ner_spacy(news_df, "title")

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Ford,265
Hyundai,207
Star News,191
Chevrolet,165
Toyota,162
Honda,147
Shropshire Star,108
Automotive News,108
BMW,108
Express & Star,103


In [27]:
ner_spacy(tweets_df, "text")

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Land Rover,1041
Jaguar Land Rover,940
eBay,477
BMW,383
General Motors,292
"Mercedes-Benz, Citroen",285
Jaguar,175
Ford,116
Audi,99
Volvo,93


In [30]:
%time ner_spacy(news_df_sample,"text")

CPU times: user 8min 19s, sys: 18.4 s, total: 8min 38s
Wall time: 8min 44s


Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
MailOnline,925
COVID-19,642
Ford,641
Toyota,527
Instagram,405
Hyundai,399
Honda,393
Trump,376
BMW,369
Amazon,366


In [31]:
### With Sentance Segmentation #####
def ner_spacy_sent(df,col):
    entities=[]
    labels=[]
    for row in df[col]:
        for token in row:
            doc=nlp(token)
            for ent in doc.ents:
                if ent.label_ == "ORG":
                    entities.append(ent.text)
                    labels.append(ent.label_)
    ent_df = pd.DataFrame({'Entities':entities,'Labels':labels})
    ent_gpd = ent_df.groupby("Entities").count().sort_values(by="Labels",ascending=False).head(20)

    return ent_gpd

In [32]:
ner_spacy_sent(news_df,"title_sent_tokens")

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Ford,266
Hyundai,207
Star News,191
Chevrolet,165
Toyota,162
Honda,147
Shropshire Star,108
Automotive News,108
BMW,108
Express & Star,103


In [33]:
ner_spacy_sent(news_df_sample,"text_sent_tokens")

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
MailOnline,925
COVID-19,644
Ford,643
Toyota,527
Instagram,406
Hyundai,400
Honda,390
Trump,378
BMW,375
Amazon,366


In [34]:
from IPython.display import display_html
from itertools import chain,cycle
def display_side_by_side(*args,titles=cycle([''])):
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:center"><td style="vertical-align:top">'
        html_str+=f'<h2 style="text-align: center;">{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)

In [35]:
display_side_by_side(ner_spacy_sent(news_df,"title_sent_tokens"),ner_spacy_sent(news_df_sample,"text_sent_tokens")
                     ,ner_spacy_sent(tweets_df,"text_sent_tokens")
                     , titles=['News Articles(Title)','News Articles(Text)','Tweets'])

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Ford,266
Hyundai,207
Star News,191
Chevrolet,165
Toyota,162
Honda,147
Shropshire Star,108
Automotive News,108
BMW,108
Express & Star,103

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
MailOnline,925
COVID-19,644
Ford,643
Toyota,527
Instagram,406
Hyundai,400
Honda,390
Trump,378
BMW,375
Amazon,366

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Land Rover,1041
Jaguar Land Rover,951
eBay,477
BMW,383
General Motors,292
"Mercedes-Benz, Citroen",285
Jaguar,168
Ford,116
Audi,100
Volvo,94
