In [286]:
import pandas as pd

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

#### Read news data

In [287]:
news_path = 'https://storage.googleapis.com/msca-bdp-data-open/news/nlp_a_5_news.json'
news_df = pd.read_json(news_path, orient='records', lines=True)

print(f'Sample contains {news_df.shape[0]:,.0f} news articles')
news_df.head(2)

Sample contains 10,012 news articles


Unnamed: 0,url,date,language,title,text
0,http://kokomoperspective.com/obituaries/jon-w-horton/article_b6ba8e1e-cb9c-11eb-9868-fb11b88b9778.html,2021-06-13,en,Jon W. Horton | Obituaries | kokomoperspective.com,Jon W. Horton | Obituaries | kokomoperspective.comYou have permission to edit this article. EditCloseSign Up Log In Dashboard LogoutMy Account Dashboard Profile Saved items LogoutCOVID-19Click here for the latest local news on COVID-19HomeAbout UsContact UsNewsLocalOpinionPoliticsNationalStateAgricultureLifestylesEngagements/Anniversaries/WeddingsAutosEntertainmentHealthHomesOutdoorsSportsNFLNCAAVitalsObituariesAutomotivee-EditionCouponsGalleries74°...
1,https://auto.economictimes.indiatimes.com/news/auto-components/birla-precision-to-ramp-up-capacity-to-tap-emerging-opportunities-in-india/81254902,2021-02-28,en,"Birla Precision to ramp up capacity to tap emerging opportunities in India, Auto News, ET Auto","Birla Precision to ramp up capacity to tap emerging opportunities in India, Auto News, ET Auto We have updated our terms and conditions and privacy policy Click ""Continue"" to accept and continue with ET AutoAccept the updated privacy & cookie policyDear user, ET Auto privacy and cookie policy has been updated to align with the new data regulations in European Union. Please review and accept these changes below to continue using the website.You can see our privacy policy & our cookie ..."


#### Read Tweets data

In [288]:
tweets_path = 'https://storage.googleapis.com/msca-bdp-data-open/tweets/nlp_a_5_tweets.json'
tweets_df = pd.read_json(tweets_path, orient='records', lines=True)
print(f'Sample contains {tweets_df.shape[0]:,.0f} tweets')
tweets_df.head(2)

Sample contains 10,105 tweets


Unnamed: 0,id,lang,date,name,retweeted,text
0,1534565117614084096,en,2022-06-08,Low Orbit Tourist 🌍📷,,"Body &amp; Assembly - Halewood - United Kingdom\n🌍53.3504,-2.8352296,402m\n\nHalewood Body &amp; Assembly is a Jaguar Land Rover factory in Halewood, England, and forms the major part of the Halewood complex which is shared with Ford who manufacture transmissions at the site. [Wikipedia] https://t.co/LPmCnZIaVt"
1,1534565743429394439,en,2022-06-08,CompleteCar.ie,RT,"Land Rover Ireland has announced that the new Range Rover Sport starts at €114,150, now on @completecar:\n\nhttps://t.co/TjGUkL3FYr https://t.co/QdVaEiJkjO"


#### **Discarding Non English Results and keeping only relevant columns**

In [289]:
def discard_en(df,col):
    df = df[df[col]=="en"]
    

discard_en(news_df,"language")
discard_en(tweets_df,"lang")

In [290]:
news_df = news_df[["title","text"]]
tweets_df = tweets_df[["text"]]

#### **Applying appropriate cleaning methods**

In [291]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
nlp = spacy.load("en_core_web_sm")
stopwords = set(nltk.corpus.stopwords.words('english'))

In [292]:
import multiprocessing
num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

Available CPUs: 8


In [293]:
import re 
def cleaning(df,col):
    #df[col] = df[col].apply(lambda x: [word for word in nltk.tokenize.word_tokenize(x) if len(word)>1])
    #df[col] = df[col].apply(lambda x: [word for word in x if not word.isnumeric()])
    #df[col] = df[col].apply(lambda x: [word for word in x if word not in stopwords])
    #df[col] = df[col].apply(lambda x: " ".join([word for word in x]))
    df[col] = df[col].str.strip()
    df[col] = df[col].apply(lambda x: re.sub(r'\||\n|(@\w+.*?)|(http\w\S+.*?)|(#\w+)',' ',x))   

In [294]:
cleaning(news_df,"title")
cleaning(news_df,"text")
cleaning(tweets_df,"text")

I only cleaned the text by removing the whitespaces, URLs and Email addresses. I did not remove the stopwords, did not convert text to lower case and did not remove punctuations because it would cause problems in identifying appropriate entities and also during the sentence segmentation process. 

### **Finding top company name using NER-NTLK**

#### 1. Without Sentence Segmentation

In [218]:
def ner_nltk(df,col):
    ORG=[]
    for text in df[col]:
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
            if hasattr(chunk,"label") and chunk.label()== 'ORGANIZATION':
                ORG.extend([c for c in chunk])
    
    org_counts = {}
    for org in ORG:
        if org[0] in org_counts:
            org_counts[org[0]] += 1
        else:
            org_counts[org[0]] = 1
    
    sorted_org = sorted(org_counts.items(), key=lambda x: x[1], reverse=True)
    top_20_orgs = sorted_org[:20]
    
    return(top_20_orgs)

#### **News Articles (Title)**

In [219]:
ner_nltk(news_df,"title")

[('News', 638),
 ('Star', 322),
 ('Online', 232),
 ('Mail', 228),
 ('Daily', 205),
 ('BMW', 111),
 ('Automotive', 106),
 ('Business', 101),
 ('Car', 101),
 ('Live', 97),
 ('Shropshire', 95),
 ('GMC', 86),
 ('Ontario', 84),
 ('UK', 77),
 ('Auto', 76),
 ('Volkswagen', 76),
 ('Express', 66),
 ('Land', 60),
 ('SUVs', 49),
 ('Rover', 48)]

#### **News Articles (Text)**

In [224]:
news_df_sample = news_df.sample(n=1000,random_state=420)

In [254]:
ner_nltk(news_df_sample,"text")

[('LA', 1357),
 ('NYC', 1286),
 ('News', 1078),
 ('Princess', 1012),
 ('MailOnline', 925),
 ('Prince', 912),
 ('Kate', 893),
 ('VERY', 732),
 ('Queen', 626),
 ('UK', 623),
 ('Duke', 613),
 ('Royal', 599),
 ('Awards', 598),
 ('Of', 531),
 ('House', 517),
 ('US', 494),
 ('COVID', 482),
 ('Land', 458),
 ('THE', 431),
 ('Duchess', 429)]

#### **Tweets**

In [221]:
ner_nltk(tweets_df,"text")

[('Land', 3441),
 ('Rover', 2796),
 ('BMW', 394),
 ('Discovery', 393),
 ('Motors', 314),
 ('General', 300),
 ('Jaguar', 294),
 ('LAND', 261),
 ('eBay', 223),
 ('UK', 204),
 ('Duke', 192),
 ('Duchess', 171),
 ('SHAMELESS', 157),
 ('Defender', 146),
 ('SUV', 136),
 ('Range', 119),
 ('Services', 114),
 ('Health', 106),
 ('Invictus', 94),
 ('ROVER', 89)]

#### **2. With Sentence Segmentation**

In [228]:
def sent_tokenizer(df,col):
    new_col_name = col + "_sent_tokens"
    df[new_col_name] = df[col].apply(lambda x : nltk.tokenize.sent_tokenize(x))


def ner_nltk_sent(df,col):
    ORG=[]
    for row in df[col]:
        for token in row:
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(token))):
                if hasattr(chunk,"label") and chunk.label()== 'ORGANIZATION':
                    ORG.extend([c for c in chunk])
                    
    org_counts = {}
    for org in ORG:
        if org[0] in org_counts:
            org_counts[org[0]] += 1
        else:
            org_counts[org[0]] = 1
    
    sorted_org = sorted(org_counts.items(), key=lambda x: x[1], reverse=True)
    top_20_orgs = sorted_org[:20]
    
    return(top_20_orgs)

In [225]:
sent_tokenizer(news_df,"title")
sent_tokenizer(news_df,"text")
sent_tokenizer(news_df_sample,"text")
sent_tokenizer(tweets_df,"text")

#### **News Articles (Title)**

In [229]:
ner_nltk_sent(news_df,"title_sent_tokens")

[('News', 638),
 ('Star', 322),
 ('Online', 232),
 ('Mail', 228),
 ('Daily', 205),
 ('BMW', 111),
 ('Automotive', 106),
 ('Business', 101),
 ('Car', 101),
 ('Live', 97),
 ('Shropshire', 95),
 ('GMC', 86),
 ('Ontario', 84),
 ('UK', 77),
 ('Auto', 76),
 ('Volkswagen', 76),
 ('Express', 66),
 ('Land', 60),
 ('SUVs', 49),
 ('Rover', 48)]

#### **News Articles (Text)**

In [256]:
ner_nltk_sent(news_df_sample,"text_sent_tokens")

[('LA', 1356),
 ('NYC', 1286),
 ('News', 1084),
 ('Princess', 1019),
 ('MailOnline', 925),
 ('Prince', 913),
 ('Kate', 868),
 ('VERY', 730),
 ('Queen', 627),
 ('UK', 623),
 ('Duke', 620),
 ('Royal', 600),
 ('Awards', 598),
 ('Of', 531),
 ('House', 523),
 ('US', 491),
 ('COVID', 483),
 ('Land', 465),
 ('Duchess', 416),
 ('Philip', 415)]

#### **Tweets**

In [232]:
ner_nltk_sent(tweets_df,"text_sent_tokens")

[('Land', 3448),
 ('Rover', 2846),
 ('Discovery', 394),
 ('BMW', 390),
 ('Motors', 314),
 ('Jaguar', 302),
 ('General', 300),
 ('LAND', 262),
 ('eBay', 223),
 ('UK', 204),
 ('Duke', 192),
 ('Duchess', 171),
 ('SHAMELESS', 157),
 ('Defender', 146),
 ('SUV', 136),
 ('Range', 119),
 ('Services', 114),
 ('Health', 106),
 ('Invictus', 94),
 ('ROVER', 90)]

From the above tables, we can infer that the NER-NLTK model does not perform well as it identifies a lot of other things such as News, NYC, LA, Online and other words as Organizations which is not the case. 

However, for the tweets, it performs slightly better as it identifies companies such as Land Rover (but seperately), BMW and Jaguar. 

The results are also slightly better in the case of sentence segmentation as the model identifies more counts of company names. 

### **Finding top company name using NER SpaCy**

#### 1. Without Sentence Segmentation

In [241]:
def ner_spacy(df,col):
    entities=[]
    labels=[]
    for i in df[col]:
        doc=nlp(i)
        for ent in doc.ents:
            if ent.label_ == "ORG":
                entities.append(ent.text)
                labels.append(ent.label_)
    ent_df = pd.DataFrame({'Entities':entities,'Labels':labels})
    ent_gpd = ent_df.groupby("Entities").count().sort_values(by="Labels",ascending=False).head(20)
    
    return ent_gpd

In [302]:
def ner_spacy(df,col):
    entities=[]
    labels=[]
    for doc in nlp.pipe(texts = df.loc[:,col], n_process = num_processors-5,batch_size=300):
        for ent in doc:
            if ent.ent_type_ == "ORG":
                entities.append(ent.text)
                labels.append(ent.ent_type_)
    ent_df = pd.DataFrame({'Entities':entities,'Labels':labels})
    ent_gpd = ent_df.groupby("Entities").count().sort_values(by="Labels",ascending=False).head(20)
    
    return ent_gpd

#### **News Articles (Title)**

In [300]:
ner_spacy(news_df,"title")

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
News,749
-,619
Star,439
Ford,362
&,229
Hyundai,212
Auto,206
Daily,195
Chevrolet,169
Toyota,167


#### **News Articles (Text)**

In [304]:
%time ner_spacy(news_df_sample,"text")

CPU times: user 27.4 s, sys: 5.68 s, total: 33.1 s
Wall time: 10min 52s


Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
the,3697
&,3301
-,2123
's,1809
The,1490
of,1193
',1183
Royal,997
News,946
MailOnline,925


#### **Tweets (Text)**

In [243]:
ner_spacy(tweets_df,"text")

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Land Rover,1041
Jaguar Land Rover,940
eBay,477
BMW,383
General Motors,292
"Mercedes-Benz, Citroen",285
Jaguar,175
Ford,116
Audi,99
Volvo,93


#### 2. With Sentence Segmentation

In [245]:
def ner_spacy_sent(df,col):
    entities=[]
    labels=[]
    for row in df[col]:
        for token in row:
            doc=nlp(token)
            for ent in doc.ents:
                if ent.label_ == "ORG":
                    entities.append(ent.text)
                    labels.append(ent.label_)
    ent_df = pd.DataFrame({'Entities':entities,'Labels':labels})
    ent_gpd = ent_df.groupby("Entities").count().sort_values(by="Labels",ascending=False).head(20)
    
    return ent_gpd

#### **News Articles (Title)**

In [246]:
ner_spacy_sent(news_df,"title_sent_tokens")

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Ford,259
Hyundai,203
Chevrolet,164
Toyota,163
Star News,160
Honda,144
Winnipeg Manitoba Carpages.ca,137
Toronto Ontario Carpages.ca,113
Automotive News,98
BMW,98


#### **News Articles (Text)**

In [258]:
ner_spacy_sent(news_df_sample,"text_sent_tokens")

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
MailOnline,925
COVID-19,644
Ford,643
Toyota,527
Instagram,406
Hyundai,400
Honda,390
Trump,378
BMW,375
Amazon,366


#### **Tweets**

In [247]:
ner_spacy_sent(tweets_df,"text_sent_tokens")

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Land Rover,1041
Jaguar Land Rover,951
eBay,477
BMW,383
General Motors,292
"Mercedes-Benz, Citroen",285
Jaguar,168
Ford,116
Audi,100
Volvo,94


We can see that the SpaCy model performs very well when compared to the NER-NLTK model as it correctly identifies the company names with a few exceptions. We can also see that the model with sentence segmentation performs better than the one where only word tokenizaton is done. This is because in the sentence segmention, the model correctly identifies the context in more cases. 

**For the titles in news articles, the most frequently mentioned company is Ford and other companies that are mentioned along with Ford are Hyundai, Chevrolet, Toyota and Honda.**

**For the text in news articles, the most frequently mentioned company is Ford and other companies that are mentioned along with Ford are Toyota, Hyundia, Honda and BMW.**

**For the tweets, the most frequently mentioned company is Land Rover and other companies that are mentioned along with Land Rover are Jaquar, BMW, General Motors, Mercedez-Benz and Ford.**

### **Finding top Locations using NER NLTK**

In [248]:
def ner_nltk_loc(df,col):
    LOC=[]
    for text in df[col]:
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
            if hasattr(chunk,"label") and chunk.label()== 'GPE':
                LOC.extend([c for c in chunk])
    
    loc_counts = {}
    for loc in LOC:
        if loc[0] in loc_counts:
            loc_counts[loc[0]] += 1
        else:
            loc_counts[loc[0]] = 1
    
    sorted_loc = sorted(loc_counts.items(), key=lambda x: x[1], reverse=True)
    top_20_locs = sorted_loc[:20]
    
    return(top_20_locs)

#### **News Articles (Title)**

In [249]:
ner_nltk_loc(news_df,"title")

[('Sale', 1966),
 ('British', 195),
 ('New', 180),
 ('Prince', 157),
 ('Winnipeg', 137),
 ('India', 121),
 ('Toronto', 118),
 ('London', 112),
 ('North', 111),
 ('China', 108),
 ('York', 99),
 ('Cambridge', 91),
 ('Calgary', 63),
 ('U.S.', 63),
 ('Taiwan', 56),
 ('Mississauga', 55),
 ('Land', 53),
 ('Kitchener', 45),
 ('Oakville', 42),
 ('Innisfil', 41)]

#### **News Articles (Text)**

In [259]:
ner_nltk_loc(news_df_sample,"text")

[('New', 2103),
 ('York', 1571),
 ('London', 1198),
 ('Los', 1128),
 ('Angeles', 1044),
 ('City', 944),
 ('British', 794),
 ('West', 701),
 ('India', 533),
 ('Prince', 517),
 ('South', 427),
 ('Miami', 425),
 ('Australia', 419),
 ('California', 412),
 ('Mexico', 389),
 ('China', 386),
 ('American', 380),
 ('Malibu', 378),
 ('U.S.', 363),
 ('Hollywood', 360)]

#### **Tweets**

In [250]:
ner_nltk_loc(tweets_df,"text")

[('Land', 1562),
 ('Russia', 180),
 ('British', 156),
 ('New', 140),
 ('Jaguar', 128),
 ('Sussex', 120),
 ('India', 92),
 ('Zimbabwe', 86),
 ('Ad', 83),
 ('Russian', 77),
 ('Audi', 70),
 ('Car', 69),
 ('Cambridge', 68),
 ('Britain', 64),
 ('Meghan', 64),
 ('Paracetamol', 64),
 ('LAND', 63),
 ('UPDATE', 54),
 ('Indian', 53),
 ('Netherlands', 52)]

### **Finding top Locations using NER SpaCy**

In [251]:
def ner_spacy_loc(df,col):
    entities=[]
    labels=[]
    for i in df[col]:
        doc=nlp(i)
        for ent in doc.ents:
            if ent.label_ == "GPE":
                entities.append(ent.text)
                labels.append(ent.label_)
    ent_df = pd.DataFrame({'Entities':entities,'Labels':labels})
    ent_gpd = ent_df.groupby("Entities").count().sort_values(by="Labels",ascending=False).head(20)
    
    return ent_gpd

#### **News Articles (Title)**

In [252]:
ner_spacy_loc(news_df,"title")

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
UK,179
India,94
US,85
Alberta,68
Calgary,64
North York,59
U.S.,58
Taiwan,53
China,52
Russia,42


#### **News Articles (Text)**

In [260]:
ner_spacy_loc(news_df_sample,"text")

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
LA,1872
London,1258
UK,1221
US,1039
Los Angeles,997
New York City,739
Hollywood,595
India,550
Miami,476
Meghan,475


#### **Tweets**

In [253]:
ner_spacy_loc(tweets_df,"text")

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
UK,342
Russia,184
India,95
Meghan,78
Kibaki,76
Britain,69
Jamaica,59
Netherlands,45
💸,42
Zimbabwe,41


We can see that the NER-SpaCy model performs much better than the NER-NLTK model in identifying locations. The NER-NLK model falsly identifies a lot of entities. For eg: it identifies "Land" as a location but in reality it refers to Land Rover which is a company. Similarly, it has a few other entities which are falsly identified. Whereas, the SpaCy model identifies locations correctly which very few exceptions. 

**For Titles in News Articles, the most frequenctly mentioned location is UK followed by India, US and Alberta.**

**For Text in News Articles, the most frequenctly mentioned location is LA followed by London,UK and US.**

**For the tweets, the most frequenctly mentioned location is UK followed by Russia, India and Britain.**

## **Top-20 List of the best performing model** 

In [264]:
from IPython.display import display_html
from itertools import chain,cycle
def display_side_by_side(*args,titles=cycle([''])):
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:center"><td style="vertical-align:top">'
        html_str+=f'<h2 style="text-align: center;">{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)

### **1. For Company Name : NER SpaCy Model with Segmentation provides best results**

In [265]:
display_side_by_side(ner_spacy_sent(news_df,"title_sent_tokens"),ner_spacy_sent(news_df_sample,"text_sent_tokens")
                     ,ner_spacy_sent(tweets_df,"text_sent_tokens")
                     , titles=['News Articles(Title)','News Articles(Text)','Tweets'])

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Ford,259
Hyundai,203
Chevrolet,164
Toyota,163
Star News,160
Honda,144
Winnipeg Manitoba Carpages.ca,137
Toronto Ontario Carpages.ca,113
Automotive News,98
BMW,98

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
MailOnline,925
COVID-19,644
Ford,643
Toyota,527
Instagram,406
Hyundai,400
Honda,390
Trump,378
BMW,375
Amazon,366

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Land Rover,1041
Jaguar Land Rover,951
eBay,477
BMW,383
General Motors,292
"Mercedes-Benz, Citroen",285
Jaguar,168
Ford,116
Audi,100
Volvo,94


**For the titles in news articles, the most frequently mentioned company is Ford and other companies that are mentioned along with Ford are Hyundai, Chevrolet, Toyota and Honda.**

**For the text in news articles, the most frequently mentioned company is Ford and other companies that are mentioned along with Ford are Toyota, Hyundia, Honda and BMW.**

**For the tweets, the most frequently mentioned company is Land Rover and other companies that are mentioned along with Land Rover are Jaquar, BMW, General Motors, Mercedez-Benz and Ford.**

### **1. For Location : NER SpaCy Model provides best results**

In [266]:
display_side_by_side(ner_spacy_loc(news_df,"title"),ner_spacy_loc(news_df_sample,"text"),
                     ner_spacy_loc(tweets_df,"text"), titles=['News Articles(Title)','News Articles(Text)','Tweets'])

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
UK,179
India,94
US,85
Alberta,68
Calgary,64
North York,59
U.S.,58
Taiwan,53
China,52
Russia,42

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
LA,1872
London,1258
UK,1221
US,1039
Los Angeles,997
New York City,739
Hollywood,595
India,550
Miami,476
Meghan,475

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
UK,342
Russia,184
India,95
Meghan,78
Kibaki,76
Britain,69
Jamaica,59
Netherlands,45
💸,42
Zimbabwe,41


**For Titles in News Articles, the most frequenctly mentioned location is UK followed by India, US and Alberta.**

**For Text in News Articles, the most frequenctly mentioned location is LA followed by London,UK and US.**

**For the tweets, the most frequenctly mentioned location is UK followed by Russia, India and Britain.**