In [1]:
import spacy
import pandas as pd

In [2]:
# Load spaCy's 'en_core_web_sm' pipeline once; get_orgs() reuses this
# module-level object for every text it processes.
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# The source file is pipe-delimited ('|'), not comma-delimited.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row
# index saved by an earlier export -- confirm whether index_col=0 is wanted.
df = pd.read_csv('reddit_investing.csv', sep='|' )
df.head()

Unnamed: 0,name,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score
0,t3_157aure,1690107000.0,investing,Honest opinions? Good idea/bad idea,"Hi all,\n\nAbout to lock in £20,000 (yearly ta...",1.0,2.0,0.0,2.0
1,t3_1579rid,1690103000.0,investing,Daily General Discussion and Advice Thread - J...,Have a general question? Want to offer some c...,1.0,1.0,0.0,1.0
2,t3_15791ul,1690100000.0,investing,Thoughts about TSLA as a long term investment.,Would love to hear your thoughts about TSLA as...,0.5,0.0,0.0,0.0
3,t3_1573gnn,1690082000.0,investing,Warren buffet quote y’all might like,"""It is a terrible mistake for investors with l...",0.78,41.0,0.0,41.0
4,t3_15714rh,1690075000.0,investing,How can I make investing into the stock market...,Hello I’ve recently been researching about inv...,0.2,0.0,0.0,0.0


### Function for extracting organization (ORG) entities from each post's text

In [5]:
def get_orgs(text):
    """Return the distinct organization names spaCy finds in ``text``.

    Runs the module-level ``nlp`` pipeline over ``text`` and collects the
    text of every entity labelled ``ORG``.

    Args:
        text: Raw text to scan. Values that are not strings (for example the
            ``NaN`` pandas uses for an empty CSV field) and blank strings are
            treated as containing no entities.

    Returns:
        list[str]: Unique ORG entity strings in order of first appearance.
        Matching is exact, so spellings that differ only in case are kept
        as separate entries.
    """
    # pandas yields float NaN for empty cells; nlp() only accepts text, so
    # short-circuit instead of letting one missing post abort the whole apply().
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is stable across runs (list(set(...)) order depends on string hashing).
    return list(dict.fromkeys(
        entity.text for entity in doc.ents if entity.label_ == 'ORG'
    ))




In [6]:
# Run get_orgs on every post body ('selftext') and store each post's list of
# ORG entities in a new 'organizations' column.
df['organizations'] = df['selftext'].apply(get_orgs)

In [8]:
# Preview the frame, which now includes the 'organizations' column.
df.head()

Unnamed: 0,name,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score,organizations
0,t3_157aure,1690107000.0,investing,Honest opinions? Good idea/bad idea,"Hi all,\n\nAbout to lock in £20,000 (yearly ta...",1.0,2.0,0.0,2.0,[ISA]
1,t3_1579rid,1690103000.0,investing,Daily General Discussion and Advice Thread - J...,Have a general question? Want to offer some c...,1.0,1.0,0.0,1.0,"[FAQ, wiki]"
2,t3_15791ul,1690100000.0,investing,Thoughts about TSLA as a long term investment.,Would love to hear your thoughts about TSLA as...,0.5,0.0,0.0,0.0,[]
3,t3_1573gnn,1690082000.0,investing,Warren buffet quote y’all might like,"""It is a terrible mistake for investors with l...",0.78,41.0,0.0,41.0,[]
4,t3_15714rh,1690075000.0,investing,How can I make investing into the stock market...,Hello I’ve recently been researching about inv...,0.2,0.0,0.0,0.0,[]


#### Finding the most frequently mentioned organizations

In [9]:
from collections import Counter

In [10]:
# Counter() expects one flat sequence, so unpack each post's list of
# organizations into a single list of mentions.
orgs = [
    mention
    for post_orgs in df['organizations'].to_list()
    for mention in post_orgs
]


In [12]:
# Tally how often each organization string occurs in the flat `orgs` list.
# get_orgs de-duplicates within a post, so each count is the number of posts
# that mention the organization, not the total number of mentions.
org_freq = Counter(orgs)

In [13]:
# Show the ten most frequent organization strings with their counts.
org_freq.most_common(10)

[('FAQ', 47),
 ('wiki', 47),
 ('VOO', 45),
 ('Fidelity', 44),
 ('Vanguard', 18),
 ('Fed', 18),
 ('SPY', 17),
 ('AI', 13),
 ('HSA', 12),
 ('QQQ', 12)]

In [20]:
# Names to drop from the ORG results so that other names surface in the ranking.
# Entries are written in lowercase because get_orgs compares them against
# entity.text.lower().
blacklist = ['voo', 'qqq', 'fed', 'vanguard']

In [21]:
def get_orgs(text, excluded=None):
    """Return the distinct, non-excluded organization names found in ``text``.

    Runs the module-level ``nlp`` pipeline over ``text`` and collects the
    text of every entity labelled ``ORG`` whose lowercase form is not in the
    exclusion list. This definition replaces the unfiltered ``get_orgs``
    defined earlier in the notebook; pass ``excluded=()`` to get unfiltered
    results.

    Args:
        text: Raw text to scan. Values that are not strings (for example the
            ``NaN`` pandas uses for an empty CSV field) and blank strings are
            treated as containing no entities.
        excluded: Optional iterable of names to drop, compared
            case-insensitively. Defaults to the module-level ``blacklist``.

    Returns:
        list[str]: Unique ORG entity strings in order of first appearance.
        De-duplication is exact, so spellings that differ only in case are
        kept as separate entries.
    """
    # pandas yields float NaN for empty cells; nlp() only accepts text, so
    # short-circuit instead of letting one missing post abort the whole apply().
    if not isinstance(text, str) or not text.strip():
        return []

    if excluded is None:
        excluded = blacklist
    # Normalize once into a set: O(1) lookups, and a mixed-case entry in the
    # exclusion list still matches the lowercased entity text.
    excluded_lower = {term.lower() for term in excluded}

    doc = nlp(text)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is stable across runs (list(set(...)) order depends on string hashing).
    return list(dict.fromkeys(
        entity.text
        for entity in doc.ents
        if entity.label_ == 'ORG' and entity.text.lower() not in excluded_lower
    ))




In [22]:
# Re-run the extraction with the redefined get_orgs, which skips blacklisted
# names, overwriting the earlier 'organizations' column.
df['organizations'] = df['selftext'].apply(get_orgs)

In [23]:
# Rebuild the flat list of mentions from the refreshed 'organizations' column
# so it can be passed to Counter().
orgs = [
    mention
    for post_orgs in df['organizations'].to_list()
    for mention in post_orgs
]


In [24]:
# Recount from the flat `orgs` list and show the ten most frequent names.
# Counter keys are case-sensitive, so differently cased spellings of the same
# name are tallied separately.
org_freq = Counter(orgs)
org_freq.most_common(10)

[('FAQ', 47),
 ('wiki', 47),
 ('Fidelity', 44),
 ('SPY', 17),
 ('AI', 13),
 ('HSA', 12),
 ('fidelity', 11),
 ('Apple', 10),
 ('Tesla', 9),
 ('Microsoft', 9)]

In [25]:
# Persist the enriched frame as a pipe-delimited file without the row index.
# NOTE(review): the 'organizations' column holds Python lists, which a CSV
# stores as plain text -- presumably they need parsing after re-reading; confirm.
df.to_csv('ner_reddit_investing.csv', sep='|', index=False)