### Step 1: Setup Environment

In [2]:
# Double Checking I have everything installed

import pandas as pd
import spacy
from spacy.pipeline import EntityRuler
!pip install spacy pandas
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


### Step 2: Load the Dataset

In [3]:
# Read the tab-separated stock listing into a DataFrame
# (read_table uses a tab as its default field separator).
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row index
# that was saved with the file -- confirm whether it should be kept.
# NOTE(review): pandas treats the literal text "NA" as missing by default; if the
# file contains a ticker spelled NA it would be lost -- verify against the data.
df = pd.read_table('stocks-1.tsv')

# Preview the first rows to locate the company-name and stock-symbol columns
df.head()

Unnamed: 0,Symbol,CompanyName,Industry,MarketCap
0,A,Agilent Technologies,Life Sciences Tools & Services,53.65B
1,AA,Alcoa,Metals & Mining,9.25B
2,AAC,Ares Acquisition,Shell Companies,1.22B
3,AACG,ATA Creativity Global,Diversified Consumer Services,90.35M
4,AADI,Aadi Bioscience,Pharmaceuticals,104.85M


### Step 3: Extract Data for Patterns

In [4]:
# Distinct, non-missing values from the two columns the patterns are built from
company_names = df["CompanyName"].dropna().unique()
stock_symbols = df["Symbol"].dropna().unique()

# One pattern dictionary per value: every company name first (label "COMPANY"),
# then every stock symbol (label "STOCK"), in the order the values appear above
patterns = [{'label': "COMPANY", 'pattern': name} for name in company_names]
patterns.extend({'label': "STOCK", 'pattern': symbol} for symbol in stock_symbols)

### Step 4: Create EntityRuler

In [5]:
# Start from spaCy's small English pipeline
nlp = spacy.load("en_core_web_sm")

# Insert an entity ruler ahead of the "ner" component and hand it the patterns.
# (spaCy documents that a ruler placed before "ner" sets its entities first and
# the statistical recognizer then works around them.)
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)


### Step 5: Test the EntityRuler

In [None]:
# Sample market-update paragraphs used as test input.
# Each paragraph mentions four companies, each followed by its stock symbol in parentheses.
# The line breaks and leading spaces inside the literals are part of the text itself.

paragraph1 = """Helmerich & Payne (HP) saw its stock rise by 1.5%, fueled by 
optimistic forecasts in the Energy Equipment & Services sector. In contrast, 
Check-Cap (CHEK) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions.

Meanwhile, Vallon Pharmaceuticals (VLON) gained 0.8% after strong quarterly earnings, 
outperforming its peers in the Biotechnology space. Sequans Communications (SQNS) 
also recorded a modest increase of 0.5%, reflecting investors' confidence in its 
ability to navigate challenges in the Semiconductors & Semiconductor Equipment industry."""

paragraph2 = """Aemetis (AMTX) saw its stock rise by 1.5%, fueled by optimistic 
forecasts in the Oil, Gas & Consumable Fuels sector. In contrast, Ferro Corporation
 (FOE) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions.

Meanwhile, RingCentral (RNG) gained 0.8% after strong quarterly earnings, outperforming
 its peers in the Software space. ACI Worldwide (ACIW) also recorded a
   modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Software industry.""" 

paragraph3 ="""On a mixed trading day, Par Pacific Holdings (PARR) saw its stock 
rise by 1.5%, fueled by optimistic forecasts in the Oil, Gas & Consumable Fuels sector. 
In contrast, Nano Dimension (NNDM) faced a decline of 2.3% following its announcement
 of increased costs related to supply chain disruptions.

Meanwhile, Beyond Meat (BYND) gained 0.8% after strong quarterly earnings,
 outperforming its peers in the Food Products space. Apollo Investment (AINV)
   also recorded a modest increase of 0.5%, reflecting investors' confidence in 
   its ability to navigate challenges in the Capital Markets industry.
"""
# Collect the three paragraphs into one list, in order
paragraphs = [paragraph1, paragraph2, paragraph3]

In [19]:
# Run every paragraph through the pipeline and list its custom entities,
# numbering the paragraphs from 1 in the printed headers
for i, para in enumerate(paragraphs, start=1):
    doc = nlp(para)
    print(f"\n=== Entities in Paragraph {i} ===")

    for ent in doc.ents:
        # Skip every entity whose label is not one of our two custom labels
        if ent.label_ not in ("COMPANY", "STOCK"):
            continue

        # Report the matched text together with its label
        print(f"{ent.text} ({ent.label_})")


=== Entities in Paragraph 1 ===
Helmerich & Payne (COMPANY)
HP (STOCK)
Check-Cap (COMPANY)
CHEK (STOCK)
Vallon Pharmaceuticals (COMPANY)
VLON (STOCK)
Sequans Communications (COMPANY)
SQNS (STOCK)

=== Entities in Paragraph 2 ===
Aemetis (COMPANY)
AMTX (STOCK)
Ferro Corporation (COMPANY)
FOE (STOCK)
RingCentral (COMPANY)
RNG (STOCK)
ACI Worldwide (COMPANY)
ACIW (STOCK)

=== Entities in Paragraph 3 ===
Par Pacific Holdings (COMPANY)
PARR (STOCK)
Nano Dimension (COMPANY)
NNDM (STOCK)
Beyond Meat (COMPANY)
BYND (STOCK)
Apollo Investment (COMPANY)
AINV (STOCK)
