In [1]:
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\JAINAM\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JAINAM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\JAINAM\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\JAINAM\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### NER using NLTK

In [10]:
import markdown

# Read the contents of the .md file
with open('../../data/hr_docs_filled/employee_contracts/employee_contract_Ava Thomas.md', 'rb') as f:
    md_text = f.read()

# Convert the markdown text to HTML
text = markdown.markdown(md_text)

In [11]:
sentences = sent_tokenize(text)
words = [word_tokenize(sentence) for sentence in sentences]

In [12]:
pos_tags = [pos_tag(sentence) for sentence in words]

In [13]:
named_entities = [ne_chunk(tagged_sentence) for tagged_sentence in pos_tags]

In [14]:
named_entities

[Tree('S', [('<', 'JJ'), ('p', 'NN'), ('>', 'NNP'), ("b'\\r\\n", 'NN'), ('#', '#'), ('Employment', 'NNP'), ('Contract\\r\\n\\r\\n', 'NNP'), ('#', '#'), ('#', '#'), ('Employee', 'NNP'), ('Information', 'NN'), (':', ':'), ('\\r\\n-', 'JJ'), ('<', 'NN'), ('strong', 'JJ'), ('>', 'NNP'), ('Employee', 'NNP'), ('Name', 'NN'), (':', ':'), ('<', 'NN'), ('/strong', 'IN'), ('>', 'JJ'), ('Ava', 'NNP'), ('Thomas\\r\\n-', 'NNP'), ('<', 'NNP'), ('strong', 'JJ'), ('>', 'JJ'), ('Position', 'NN'), (':', ':'), ('<', 'NN'), ('/strong', 'IN'), ('>', 'JJ'), Tree('ORGANIZATION', [('Administrative', 'JJ')]), ('Assistant\\r\\n-', 'NNP'), ('<', 'NNP'), ('strong', 'JJ'), ('>', 'NNP'), ('Department', 'NNP'), (':', ':'), ('<', 'NN'), ('/strong', 'IN'), ('>', 'NNP'), ('Administration\\r\\n\\r\\n', 'NNP'), ('#', '#'), ('#', '#'), ('Terms', 'NNS'), ('and', 'CC'), ('Conditions', 'NNS'), (':', ':'), ('\\r\\n-', 'JJ'), ('<', 'NN'), ('strong', 'JJ'), ('>', 'JJ'), ('Responsibilities', 'NNS'), (':', ':'), ('<', 'NN'), ('/s

In [15]:
for ne_tree in named_entities:
    for subtree in ne_tree:
        if type(subtree) == nltk.Tree:
            entity = " ".join([token for token, pos in subtree.leaves()])
            entity_type = subtree.label()
            print(f"Entity: {entity}, Type: {entity_type}")

Entity: Administrative, Type: ORGANIZATION
Entity: Confidentiality, Type: ORGANIZATION
Entity: Company, Type: ORGANIZATION
Entity: Compliance, Type: ORGANIZATION
Entity: Instagram, Type: GPE
Entity: Ava Thomas, Type: PERSON
Entity: Employee, Type: ORGANIZATION
Entity: Employer, Type: ORGANIZATION


### NER Using Spacy

In [16]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [17]:
from pprint import pprint
doc = nlp(text)
pprint([(X.text, X.label_) for X in doc.ents])

[('#', 'CARDINAL'),
 ('##', 'MONEY'),
 ('## Terms', 'MONEY'),
 ('65,000', 'MONEY'),
 ('Hours:</strong', 'GPE'),
 ('40 hours', 'TIME'),
 ('30', 'CARDINAL'),
 ('Intellectual Property', 'PERSON'),
 ('Ava Thomas', 'PERSON'),
 ('October 1', 'DATE'),
 ('Employee', 'PRODUCT'),
 ('the start date of', 'DATE'),
 ('October 1, 2022', 'DATE'),
 ('Employer', 'LOC')]


In [18]:
def entity_recognition(location):
    with open(location, 'rb') as f:
        md_text = f.read()
    text = markdown.markdown(md_text)
    doc = nlp(text)
    return [(X.text, X.label_) for X in doc.ents]

In [20]:

entity_recognition('../../data/business_docs/Marketing Plan.md')

[('### Company Name', 'MONEY'),
 ('### Category', 'MONEY'),
 ('### Document', 'MONEY'),
 ('####', 'MONEY'),
 ('Target Audience:</strong>\\r\\n- Small', 'ORG'),
 ('campaigns.\\r\\n- Drive', 'FAC'),
 ('20,000', 'MONEY'),
 ('Budget', 'ORG'),
 ('3-month', 'DATE'),
 ('Monthly', 'DATE'),
 ('Monthly', 'DATE'),
 ('####', 'MONEY'),
 ('Identification', 'ORG'),
 ('Target Audience:</strong>\\r\\n- Fashion', 'ORG'),
 ('partnerships.\\r\\n- Brands', 'ORG'),
 ('metrics.\\r\\n- Co-creation', 'PERSON'),
 ('15,000', 'MONEY'),
 ('Influencer', 'PERSON'),
 ('2-month', 'DATE'),
 ('engagement.\\r\\n\\r\\n', 'PERSON'),
 ('####', 'MONEY'),
 ('Strategies:</strong>\\r\\n- Tailored', 'ORG'),
 ('insights.\\r\\n- Implementing', 'PERSON')]

In [21]:
entity_recognition('../../data/business_docs/Business Proposal.md')

[('### Company Name', 'MONEY'),
 ('### Category', 'MONEY'),
 ('####', 'MONEY'),
 ('Social Media Marketing Services</strong>\\r\\n\\r\\n', 'ORG'),
 ('Problem Statement:</strong>\\r\\nMany', 'PERSON'),
 ('####', 'MONEY'),
 ('Instagram', 'ORG'),
 ('engagement.\\r\\n\\r\\n', 'PERSON'),
 ('####', 'MONEY'),
 ('3', 'CARDINAL'),
 ('Instagram', 'ORG'),
 ('engagement.\\r\\n\\r\\n', 'PERSON')]

In [22]:
entity_recognition('../../data/company_bylaws/Board of Directors.md')

[('Board of Directors\\r\\n\\r\\nThe Board of Directors', 'ORG'),
 ('Instagram.\\r\\n\\r\\n## Board Composition\\r\\n\\r\\nThe Board of Directors of Instagram',
  'ORG'),
 ('members:\\r\\n\\r\\n1', 'LOC'),
 ('John Smith</strong>', 'PERSON'),
 ('Board', 'ORG'),
 ('John Smith', 'PERSON'),
 ('20 years', 'DATE'),
 ('Board', 'ORG'),
 ('2015', 'DATE'),
 ('Instagram', 'ORG'),
 ('Sarah Johnson</strong', 'PERSON'),
 ('Sarah Johnson', 'PERSON'),
 ('Sarah', 'PERSON'),
 ('Instagram', 'GPE'),
 ('David Lee</strong', 'PERSON'),
 ('David Lee', 'PERSON'),
 ('David', 'PERSON'),
 ('Emily Davis', 'PERSON'),
 ('Instagram', 'ORG'),
 ('## Contributions', 'MONEY'),
 ('Instagram', 'ORG'),
 ('the years', 'DATE'),
 ('Guiding Instagram', 'ORG'),
 ('Stories', 'ORG'),
 ('Reels', 'PRODUCT'),
 ('Instagram', 'ORG'),
 ('over 1 billion', 'MONEY'),
 ('monthly', 'DATE'),
 ('globally.\\r\\n\\r\\nThe Board of Directors', 'ORG'),
 ('Instagram', 'ORG'),
 ('## Contact', 'MONEY'),
 ('the Board of Directors', 'ORG')]

In [23]:
with open('../../data/company_bylaws/Board of Directors.md', 'rb') as f:
        md_text = f.read()
text = markdown.markdown(md_text)
article = nlp(text)
len(article.ents)

32

In [24]:
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'ORG': 13,
         'PERSON': 9,
         'DATE': 4,
         'MONEY': 3,
         'LOC': 1,
         'GPE': 1,
         'PRODUCT': 1})

In [25]:
items = [x.text for x in article.ents]
Counter(items).most_common(3)

[('Instagram', 6),
 ('Board', 2),
 ('Board of Directors\\r\\n\\r\\nThe Board of Directors', 1)]

In [28]:
sentences = [x for x in article.sents]
print(sentences[6])

Sarah Johnson</strong> - CEO, Instagram<br>\r\n   Sarah Johnson is the Chief Executive Officer of Instagram, responsible for driving the company's overall growth and success.


In [29]:
displacy.render(nlp(str(sentences[6])), jupyter=True, style='ent')

### Custom NER model using Spacy

In [1]:
l = [("This Employee Payslip is issued to Emily Brown for the pay period from April 1, 2022 to April 15, 2022. It provides a detailed breakdown of earnings, deductions, and the net pay for the specified period.",{"entities":[(35,46,"Person")]}),
("Net Pay: $3,930.00 ",{"entities":[(9,18,"Money")]}),
("Net Pay: ",{"entities":[]}),
("Total Deductions: $1,170.00 ",{"entities":[(18,27,"Money")]}),
("Other Deductions: $90.00 ",{"entities":[(18,24,"Money")]}),
("Insurance Contributions: $180.00 ",{"entities":[(25,32,"Money")]}),
("Taxes: $900.00 ",{"entities":[(7,14,"Money")]}),
("Deductions: ",{"entities":[]}),
("Total Earnings: $5,100.00 ",{"entities":[(16,25,"Money")]}),
("Bonuses: $900.00 ",{"entities":[(9,16,"Money")]}),
("Allowances: $450.00 ",{"entities":[(12,19,"Money")]}),
("Basic Salary: $3,750.00 ",{"entities":[(14,23,"Money")]}),
("Earnings: ",{"entities":[]}),
("End Date: April 15, 2022 ",{"entities":[]}),
("Start Date: April 1, 2022 ",{"entities":[]}),
("Pay Period: ",{"entities":[]}),
("Department: Design ",{"entities":[]}),
("Position: UX Designer ",{"entities":[]}),
("Employee ID: 12348 ",{"entities":[]}),
("Employee Name: Emily Brown ",{"entities":[(15,26,"Person")]}),
("Employee Information: ",{"entities":[]}),
("Employee Payslip ",{"entities":[]}),
("This contract outlines the terms and conditions of employment for the Employee and is effective as of the start date of February 1, 2022. Both the Employer and Employee agree to abide by the terms set forth in this contract.",{"entities":[]}),
(" ",{"entities":[]}),
("This employment contract is entered into between Instagram (hereinafter referred to as Employer) and Jane Smith (hereinafter referred to as Employee) on February 1, 2022. ",{"entities":[(49,58,"Organization"),(101,111,"Person")]}),
("Compliance with Company Policies: Compliance required ",{"entities":[]}),
("Intellectual Property Rights: Company-owned ",{"entities":[]}),
("Confidentiality Agreement: Yes ",{"entities":[]}),
("Termination Conditions: 30 days' notice ",{"entities":[]}),
("Work Hours: 40 hours per week ",{"entities":[]}),
("Benefits: Health insurance, retirement plan ",{"entities":[]}),
("Compensation: $80,000 per year ",{"entities":[(14,30,"Money")]}),
("Responsibilities: Developing and executing marketing campaigns. ",{"entities":[]}),
("Terms and Conditions: ",{"entities":[]}),
("Department: Marketing ",{"entities":[]}),
("Position: Marketing Specialist ",{"entities":[]}),
("Employee Name: Jane Smith ",{"entities":[(15,25,"Person")]}),
("Employee Information: ",{"entities":[]}),
("Employment Contract ",{"entities":[]}),
("Total Liabilities and Shareholders' Equity $450,000",{"entities":[(43,51,"Money")]}),
("Retained Earnings $200,000 ",{"entities":[(18,26,"Money")]}),
("Common Stock $100,000 ",{"entities":[(13,21,"Money")]}),
("Shareholders' Equity ",{"entities":[]}),
("Bonds Payable $50,000 ",{"entities":[(14,21,"Money")]}),
("Long-term Loans $100,000 ",{"entities":[(16,24,"Money")]}),
("Non-Current Liabilities $150,000 ",{"entities":[(24,32,"Money")]}),
("Short-term Loans $50,000 ",{"entities":[(17,24,"Money")]}),
("Accounts Payable $50,000 ",{"entities":[(17,24,"Money")]}),
("Current Liabilities $100,000 ",{"entities":[(20,28,"Money")]}),
("Liabilities ",{"entities":[]}),
("Total Assets $450,000 ",{"entities":[(13,21,"Money")]}),
("Intangible Assets $50,000 ",{"entities":[(18,25,"Money")]}),
("Property, Plant, and Equipment $150,000 ",{"entities":[(31,39,"Money")]}),
("Non-Current Assets $200,000 ",{"entities":[(19,27,"Money")]}),
("Prepaid Expenses $50,000 ",{"entities":[(17,24,"Money")]}),
("Inventory $50,000 ",{"entities":[(10,17,"Money")]}),
("Accounts Receivable $100,000 ",{"entities":[(20,28,"Money")]}),
("Cash $50,000 ",{"entities":[(5,12,"Money")]}),
("Current Assets $250,000 ",{"entities":[(15,23,"Money")]}),
("Assets ",{"entities":[]}),
("Category Amount ",{"entities":[]}),
(" ",{"entities":[]}),
("As of Date: December 31, 20XX ",{"entities":[]}),
(" ",{"entities":[]}),
("Company Name: XYZ Corporation ",{"entities":[(14,29,"Organization")]}),
("Balance Sheet ",{"entities":[]}),
("Email: shareholders@instagram.com Phone: +1-123-456-7890",{"entities":[(7,33,"Email Id"),(41,56,"Contact_Number")]}),
(" ",{"entities":[]}),
("For any inquiries or further information about our shareholders, please feel free to contact us. ",{"entities":[]}),
(" ",{"entities":[]}),
("Please note that the information provided above is for informational purposes only and is subject to change. ",{"entities":[]}),
(" ",{"entities":[]}),
("The exact shareholdings of individual shareholders are confidential and not publicly disclosed. Instagram's ownership structure is governed by the agreements and regulations established during its acquisition by Facebook Inc. ",{"entities":[(212,225,"Organization")]}),
("Shareholdings ",{"entities":[]}),
(" ",{"entities":[]}),
("David Lee - David Lee, the Chief Financial Officer of Instagram, holds a significant share in the company. His financial expertise has played a vital role in ensuring the financial stability and growth of Instagram. ",{"entities":[(0,9,"Person"),(12,21,"Person"),(54,63,"Organization"),(205,214,"Organization")]}),
(" ",{"entities":[]}),
("John Smith - John Smith, the Chairman of the Board of Instagram, is also a major shareholder. With his extensive experience in the technology industry, he has contributed to the strategic guidance and success of Instagram. ",{"entities":[(13,23,"Person"),(0,10,"Person"),(54,63,"Organization")]}),
(" ",{"entities":[]}),
("Sarah Johnson - Sarah Johnson, the CEO of Instagram, holds a significant stake in the company. Her leadership and vision have been invaluable in shaping Instagram's direction and growth. ",{"entities":[(0,13,"Person"),(16,29,"Person"),(42,51,"Organization")]}),
(" ",{"entities":[]}),
("Facebook Inc. - As the parent company, Facebook Inc. holds 100% of the shares of Instagram. Facebook Inc. is a global technology company with a diverse portfolio of platforms and services. ",{"entities":[(0,13,"Organization"),(39,52,"Organization"),(81,90,"Organization"),(92,104,"Organization")]}),
(" ",{"entities":[]}),
("The major shareholders of Instagram are individuals and entities who have a significant ownership stake in the company. Their contributions and support have been instrumental in the growth and success of the platform. The following are some of our major shareholders: ",{"entities":[]}),
("Major Shareholders ",{"entities":[]}),
(" ",{"entities":[]}),
("Instagram is a private company and is currently wholly owned by Facebook Inc. Since its acquisition by Facebook in 2012, Instagram has operated as a separate entity but benefits from the resources and support of its parent company. ",{"entities":[(0,9,"Organization"),(64,77,"Organization"),(103,111,"Organization"),(121,130,"Organization")]}),
("Ownership Structure ",{"entities":[]}),
(" ",{"entities":[]}),
("This page provides information about the shareholders of Instagram. We believe in transparency and want to establish trust with our community by sharing details about our ownership structure and the individuals or entities that hold shares in our company. ",{"entities":[(57,66,"Organization")]}),
("Shareholders ",{"entities":[]}),
("Email: boardofdirectors@instagram.com Phone: +1-123-456-7890",{"entities":[(7,37,"Email Id"),(45,60,"Contact_Number")]}),
(" ",{"entities":[]}),
("For any inquiries or feedback regarding the Board of Directors, please contact us: ",{"entities":[]}),
("Contact Information ",{"entities":[]}),
(" ",{"entities":[]}),
("The Board of Directors remains committed to driving Instagram's growth, fostering innovation, and maintaining a vibrant and inclusive community for users worldwide. ",{"entities":[]}),
("Overseeing the expansion of Instagram's user base to over 1 billion monthly active users worldwide, making it one of the most popular social media platforms globally. ",{"entities":[(28,37,"Organization")]}),
("Supporting the launch of various initiatives to foster a safe and inclusive community, such as proactive content moderation practices and the promotion of positive online behavior. ",{"entities":[]}),
("Providing strategic guidance in the development and introduction of new features such as Stories, IGTV, and Reels, which have revolutionized content sharing and engagement on the platform. ",{"entities":[]}),
("Guiding Instagram through its acquisition by Facebook and ensuring the preservation of its unique identity and features. ",{"entities":[(8,17,"Organization"),(45,53,"Organization")]}),
(" ",{"entities":[]}),
("The Board of Directors has played a crucial role in Instagram's success and achievements over the years. Some of their notable contributions include: ",{"entities":[]}),
("Contributions and Achievements ",{"entities":[]}),
(" ",{"entities":[]}),
("Emily Davis is the Chief Marketing Officer at Instagram, overseeing all marketing and branding initiatives. With her innovative marketing strategies and deep understanding of consumer behavior, Emily has played a key role in expanding Instagram's user base and increasing brand awareness. ",{"entities":[(0,11,"Person"),(46,55,"Organization"),(194,199,"Person")]}),
("Emily Davis - Chief Marketing Officer ",{"entities":[(0,11,"Person")]}),
(" ",{"entities":[]}),
("David Lee brings extensive financial expertise to the board. As the CFO of Instagram, he is responsible for financial planning, analysis, and strategic investments. David's financial acumen has been instrumental in ensuring the company's financial stability and sustainable growth. ",{"entities":[(0,9,"Person"),(75,84,"Organization")]}),
("David Lee - Chief Financial Officer ",{"entities":[(0,9,"Person")]}),
(" ",{"entities":[]}),
("Sarah Johnson is the Chief Executive Officer of Instagram, responsible for driving the company's overall growth and success. With her strong leadership skills and deep understanding of social media platforms, Sarah has played a pivotal role in Instagram's evolution. ",{"entities":[(0,13,"Person"),(45,57,"Organization"),(209,214,"Person"),(244,253,"Organization")]}),
("Sarah Johnson - CEO, Instagram ",{"entities":[(0,13,"Person"),(21,30,"Organization")]}),
(" ",{"entities":[]}),
("John Smith is an accomplished business leader with over 20 years of experience in the technology industry. He has served as the Chairman of the Board since 2015 and has been instrumental in shaping Instagram's strategic vision. ",{"entities":[(0,10,"Person")]}),
("John Smith - Chairman of the Board ",{"entities":[(0,10,"Person")]}),
(" ",{"entities":[]}),
("The Board of Directors of Instagram is composed of the following members: ",{"entities":[]}),
("Board Composition ",{"entities":[]}),
(" ",{"entities":[]}),
("The Board of Directors of Instagram consists of highly experienced professionals who provide strategic guidance and oversight to ensure the success and growth of the company. The board members bring diverse expertise and perspectives to the table, contributing to the overall vision and direction of Instagram. ",{"entities":[(26,35,"Organization")]}),
("Board of Directors ",{"entities":[]}),
("Contact Information: For inquiries and to explore influencer collaboration opportunities, please reach out to our team at partnerships@instagram.com. ",{"entities":[(122,148,"Email Id")]}),
(" ",{"entities":[]}),
("Timeline: Each influencer collaboration campaign will be meticulously planned and executed to align with your brand objectives, with regular performance reviews and optimizations throughout the partnership. ",{"entities":[]}),
(" ",{"entities":[]}),
("Terms and Conditions: A comprehensive agreement will outline the terms of collaboration, including influencer selection criteria, content approvals, compensation details, and exclusivity clauses for mutual benefit and clarity. ",{"entities":[]}),
(" ",{"entities":[]}),
("Pricing: Our pricing model for influencer collaborations is transparent and based on factors such as influencer reach, engagement rates, content requirements, and campaign duration. ",{"entities":[]}),
(" ",{"entities":[]}),
("Proposed Solution: Our influencer collaboration services involve influencer discovery, outreach, negotiation, campaign management, and performance analysis to ensure successful partnerships that resonate with your brand values. ",{"entities":[]}),
(" ",{"entities":[]}),
("Problem Statement: Identifying and engaging the right influencers for brand partnerships can be a daunting task for many businesses, leading to missed opportunities for authentic promotion and audience engagement. ",{"entities":[]}),
(" ",{"entities":[]}),
("Company Overview: As a pioneer in visual content sharing, Instagram has established itself as a hub for influencer marketing collaborations, offering brands a platform to amplify their voice and connect with their target audience. ",{"entities":[(58,67,"Organization")]}),
(" ",{"entities":[]}),
("Executive Summary: Instagram presents a unique opportunity for brands to collaborate with influencers and leverage their reach and credibility to drive brand awareness and engagement. Our influencer collaboration services aim to connect brands with relevant influencers for impactful partnerships. ",{"entities":[]}),
("Business Proposal 2: Influencer Collaboration Services ",{"entities":[]}),
(" ",{"entities":[]}),
("Contact Information: For inquiries and further discussion on our social media marketing services, please contact our dedicated team at business@instagram.com. ",{"entities":[(135,157,"Email Id")]}),
(" ",{"entities":[]}),
("Timeline: The implementation of social media marketing services will be orchestrated in phases, with measurable goals and milestones outlined for each stage of the campaign. ",{"entities":[]}),
(" ",{"entities":[]}),
("Terms and Conditions: Terms of engagement include a detailed service agreement outlining deliverables, timelines, reporting mechanisms, and confidentiality clauses for a transparent partnership. ",{"entities":[]}),
(" ",{"entities":[]}),
("Pricing: Our pricing structure is flexible and customized based on the scope of services required. We offer competitive rates to cater to businesses of all sizes. ",{"entities":[]}),
(" ",{"entities":[]}),
("Proposed Solution: Our social media marketing services encompass strategic content creation, optimized ad campaigns, targeted audience analysis, and performance tracking to ensure impactful results for your brand. ",{"entities":[]}),
(" ",{"entities":[]}),
("Problem Statement: Many businesses struggle to effectively utilize social media for brand promotion and customer interaction, leading to missed opportunities for growth and visibility. ",{"entities":[]}),
(" ",{"entities":[]}),
("Company Overview: Instagram, a leader in social media platforms, leverages its expertise and innovative tools to deliver impactful marketing solutions. Our focus on visual content and community engagement sets us apart in the industry. ",{"entities":[(18,27,"Organization")]}),
(" ",{"entities":[]}),
("Executive Summary: Instagram offers comprehensive social media marketing services tailored to enhance brand presence and engagement. With a team of experienced digital marketers, we aim to elevate your social media strategy and maximize audience reach. ",{"entities":[(19,28,"Organization")]}),
("Business Proposal 1: Social Media Marketing Services ",{"entities":[]}),
("Category of Document: Business Proposal ",{"entities":[]}),
("Company Name: Instagram ",{"entities":[(14,23,"Organization")]}),]

In [2]:
with open('TRAIN_DATA.txt', 'w') as fp:
    for item in l:
        fp.write(f"{item}\n")

In [3]:
import csv
with open('TRAIN_DATA_NER.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(l)

In [4]:
for text, annot in l:
    print(text)
    print("Annot: ")
    print(annot)
    break

This Employee Payslip is issued to Emily Brown for the pay period from April 1, 2022 to April 15, 2022. It provides a detailed breakdown of earnings, deductions, and the net pay for the specified period.
Annot: 
{'entities': [(35, 46, 'Person')]}


In [21]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
nlp = spacy.blank("en")
db = DocBin() 
for text, annot in tqdm(l):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    try:
        doc.ents = ents 
        db.add(doc)
    except:
        print(text, annot)
db.to_disk("train.spacy")

100%|██████████| 128/128 [00:00<00:00, 1661.97it/s]


In [1]:
import spacy

In [2]:
nlp1 = spacy.load(R"../output/model-best") #load the best model
doc = nlp1("""
Board of Directors
The Board of Directors of Instagram consists of highly experienced professionals who provide strategic guidance and oversight to ensure the success and growth of the company. The board members bring diverse expertise and perspectives to the table, contributing to the overall vision and direction of Instagram.

Board Composition
The Board of Directors of Instagram is composed of the following members:

John Smith - Chairman of the Board
John Smith is an accomplished business leader with over 20 years of experience in the technology industry. He has served as the Chairman of the Board since 2015 and has been instrumental in shaping Instagram's strategic vision.

Sarah Johnson - CEO, Instagram
Sarah Johnson is the Chief Executive Officer of Instagram, responsible for driving the company's overall growth and success. With her strong leadership skills and deep understanding of social media platforms, Sarah has played a pivotal role in Instagram's evolution.

David Lee - Chief Financial Officer
David Lee brings extensive financial expertise to the board. As the CFO of Instagram, he is responsible for financial planning, analysis, and strategic investments. David's financial acumen has been instrumental in ensuring the company's financial stability and sustainable growth.

Emily Davis - Chief Marketing Officer
Emily Davis is the Chief Marketing Officer at Instagram, overseeing all marketing and branding initiatives. With her innovative marketing strategies and deep understanding of consumer behavior, Emily has played a key role in expanding Instagram's user base and increasing brand awareness.

Contributions and Achievements
The Board of Directors has played a crucial role in Instagram's success and achievements over the years. Some of their notable contributions include:

Guiding Instagram through its acquisition by Facebook and ensuring the preservation of its unique identity and features.
Providing strategic guidance in the development and introduction of new features such as Stories, IGTV, and Reels, which have revolutionized content sharing and engagement on the platform.
Supporting the launch of various initiatives to foster a safe and inclusive community, such as proactive content moderation practices and the promotion of positive online behavior.
Overseeing the expansion of Instagram's user base to over 1 billion monthly active users worldwide, making it one of the most popular social media platforms globally.
The Board of Directors remains committed to driving Instagram's growth, fostering innovation, and maintaining a vibrant and inclusive community for users worldwide.

Contact Information
For any inquiries or feedback regarding the Board of Directors, please contact us:

Email: boardofdirectors@instagram.com Phone: +1-123-456-7890
""") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True)