In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

In [2]:
data1 = pd.read_csv('Data/SEntFiN-v1.1.csv') # https://www.kaggle.com/datasets/ankurzing/aspect-based-sentiment-analysis-for-financial-news ",
data2 = pd.read_csv('Data/sent_train.csv') # https://www.kaggle.com/datasets/borhanitrash/twitter-financial-news-sentiment-dataset",
data3 = pd.read_csv('Data/data.csv') # https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis",
data4 = pd.read_csv('Data/all-data.csv', names=["sentiment", "text"], encoding="utf-8", encoding_errors="replace") # https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news1",
display(data1.head(4))
display(data2.head(4)) # In first rows it seemed like in each sentence contains the symbol, on further inspection it seems like the symbol is not always present.1",
display(data3.head(4))
display(data4.head(4))

Unnamed: 0,S No.,Title,Decisions,Words
0,1,SpiceJet to issue 6.4 crore warrants to promoters,"{""SpiceJet"": ""neutral""}",8
1,2,MMTC Q2 net loss at Rs 10.4 crore,"{""MMTC"": ""neutral""}",8
2,3,"Mid-cap funds can deliver more, stay put: Experts","{""Mid-cap funds"": ""positive""}",8
3,4,Mid caps now turn into market darlings,"{""Mid caps"": ""positive""}",7


Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0


Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral


Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...


In [3]:
import pycountry

COMMODITY = {
    "gold", "silver", "platinum", "palladium", "copper", "aluminum", "nickel",
    "zinc", "lead", "tin", "iron ore", "steel", "cobalt", "uranium", "crude oil",
    "oil", "brent", "wti", "gasoline", "diesel", "jet fuel", "natural gas",
    "heating oil", "coal", "propane", "fuel oil", "wheat", "corn", "maize",
    "soybeans", "rapeseed", "canola", "rice", "barley", "oats", "sorghum",
    "coffee", "cocoa", "cotton", "sugar", "orange juice", "palm oil", "rubber",
    "tea", "live cattle", "lean hogs", "milk", "lumber"
}

GPE = {c.name.lower() for c in pycountry.countries}
GPE.update({"usa", "u.s.", "us", "u.k.", "uk", "europe"})

CURRENCY = {
    "dollar", "dollars", "euro", "euros", "yen", "yens", "yuan", "pound",
    "pounds", "rupee", "rupees", "franc", "francs"
}

CONCEPT = {
    "market", "markets", "stocks", "world stocks", "mid caps", "mid-cap funds",
    "equity", "equities", "bonds", "futures", "options", "ipo", "ncds", "qip",
    "gift", "experts", "investors", "promoters", "analysts", "traders",
    "farmer bodies", "indian millers", "fii", "fiis", "nifty", "sensex", "dow jones"
}

KEYWORD_RULES = {
    "COMMODITY": COMMODITY,
    "GPE": GPE,
    "CURRENCY": CURRENCY,
    "CONCEPT": CONCEPT
}

# Heuristic function to identify people's names
def is_likely_person(name: str) -> bool:
    parts = name.strip().split()
    if len(parts) != 2: return False
    if parts[0].isupper() and parts[1].isupper(): return False
    return parts[0].istitle() and parts[1].istitle()

# Main function to determine the label for any given name
def get_entity_label(name: str) -> str:
    name_lower = name.lower().strip()
    for label, keywords in KEYWORD_RULES.items():
        if name_lower in keywords:
            return label
    if is_likely_person(name):
        return "PERSON"
    return "COMPANY" # Fallback label

In [4]:
import spacy
from spacy.tokens import DocBin
from spacy.matcher import Matcher
from spacy.util import filter_spans
import pandas as pd
from sklearn.model_selection import train_test_split

nlp = spacy.load("en_core_web_sm")
TRAIN_DATA = []

for _, row in data1.iterrows():
    title = row['Title']
    decision_dict = eval(row['Decisions'])
    
    entities_to_find = {name: get_entity_label(name) for name in decision_dict.keys()}
    
    entities_for_this_title = []
    with nlp.select_pipes(disable="ner"):
        doc = nlp(title)

    for name, label in entities_to_find.items():
        matcher = Matcher(nlp.vocab)
        # Using .strip() and LOWER for robust matching
        pattern = [{"LOWER": token.lower_} for token in nlp(name.strip())]
        matcher.add("ENTITY_PATTERN", [pattern])
        matches = matcher(doc)
        
        for match_id, start_token, end_token in matches:
            span = doc[start_token:end_token]
            entity = (span.start_char, span.end_char, label)
            entities_for_this_title.append(entity)

    if entities_for_this_title:
        spans = [doc.char_span(s, e, label=l) for s, e, l in set(entities_for_this_title) if doc.char_span(s, e, label=l) is not None]
        filtered_spans = filter_spans(spans)
        final_entities = [(span.start_char, span.end_char, span.label_) for span in filtered_spans]
        TRAIN_DATA.append((title, {"entities": final_entities}))
train_data, dev_data = train_test_split(TRAIN_DATA, test_size=0.2, random_state=42)

def create_spacy_files(data, output_path):
    db = DocBin()
    for text, annot in data:
        doc = nlp.make_doc(text)
        doc.ents = [doc.char_span(s, e, label=l, alignment_mode="contract") for s, e, l in annot["entities"]]
        db.add(doc)
    db.to_disk(output_path)


create_spacy_files(train_data, "./Data/train.spacy")
create_spacy_files(dev_data, "./Data/dev.spacy")
print(f"Total examples: {len(TRAIN_DATA)}, Training: {len(train_data)}, Dev: {len(dev_data)}")

  from .autonotebook import tqdm as notebook_tqdm


Total examples: 10687, Training: 8549, Dev: 2138


'python -m spacy train config_trf.cfg --output ./output_trf --gpu-id 0'

In [5]:
import random

model_path = "./output_trf/model-best"
nlp_trained = spacy.load(model_path)

dev_file = "./Data/dev.spacy"
doc_bin = DocBin().from_disk(dev_file)
validation_data = []
for doc in doc_bin.get_docs(nlp.vocab):
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    validation_data.append((doc.text, {"entities": entities}))

mistakes = []
for text, annotations in validation_data:
    doc_predicted = nlp_trained(text)
    true_entities = set(annotations['entities'])
    predicted_entities = set((ent.start_char, ent.end_char, ent.label_) for ent in doc_predicted.ents)
    
    if true_entities != predicted_entities:
        mistakes.append({
            "text": text,
            "true_entities": true_entities,
            "predicted_entities": predicted_entities
        })

print(f"Found {len(mistakes)} sentences with prediction errors out of {len(validation_data)}.")

print("\n--- Reviewing 5 Random Mistakes ---")
if mistakes:
    for item in random.sample(mistakes, min(5, len(mistakes))):
        print("\nTEXT:", item['text'])
        print(f"TRUE: {item['true_entities']}")
        print(f"PREDICTED: {item['predicted_entities']}")
else:
    print("No mistakes found on the dev set!")

Found 98 sentences with prediction errors out of 2138.

--- Reviewing 5 Random Mistakes ---

TEXT: Federal Bank and Karnataka Bank safe bets in PSU banks: Mehraboon Irani
TRUE: {(17, 31, 'PERSON'), (0, 12, 'PERSON')}
PREDICTED: {(45, 54, 'COMPANY'), (17, 31, 'PERSON'), (0, 12, 'PERSON')}

TEXT: Won leads Asia FX slide as volatile China stocks, Greece hit sentiment
TRUE: {(0, 3, 'COMPANY'), (10, 17, 'COMPANY')}
PREDICTED: {(0, 3, 'CURRENCY'), (10, 17, 'COMPANY')}

TEXT: With auto sales gaining pace, brokers say auto ancillaries can give over 20% returns
TRUE: {(42, 58, 'COMPANY')}
PREDICTED: {(5, 15, 'COMPANY'), (42, 58, 'COMPANY')}

TEXT: Chinese corporate bond market calm despite default by Chaori Solar
TRUE: {(54, 66, 'PERSON')}
PREDICTED: {(0, 29, 'COMPANY'), (54, 66, 'PERSON')}

TEXT: Euro/USD looking at support between $1.3815-30
TRUE: {(0, 8, 'COMPANY')}
PREDICTED: {(0, 4, 'CURRENCY')}
