In [1]:
# import libraries
from datasets import load_dataset
from textblob import TextBlob
import os

In [2]:
# download dataset
ds = load_dataset("tner/ontonotes5")

In [3]:
entity2tag = {
    "O": 0,
    "B-CARDINAL": 1,
    "B-DATE": 2,
    "I-DATE": 3,
    "B-PERSON": 4,
    "I-PERSON": 5,
    "B-NORP": 6,
    "B-GPE": 7,
    "I-GPE": 8,
    "B-LAW": 9,
    "I-LAW": 10,
    "B-ORG": 11,
    "I-ORG": 12, 
    "B-PERCENT": 13,
    "I-PERCENT": 14, 
    "B-ORDINAL": 15, 
    "B-MONEY": 16, 
    "I-MONEY": 17, 
    "B-WORK_OF_ART": 18, 
    "I-WORK_OF_ART": 19, 
    "B-FAC": 20, 
    "B-TIME": 21, 
    "I-CARDINAL": 22, 
    "B-LOC": 23, 
    "B-QUANTITY": 24, 
    "I-QUANTITY": 25, 
    "I-NORP": 26, 
    "I-LOC": 27, 
    "B-PRODUCT": 28, 
    "I-TIME": 29, 
    "B-EVENT": 30,
    "I-EVENT": 31,
    "I-FAC": 32,
    "B-LANGUAGE": 33,
    "I-PRODUCT": 34,
    "I-ORDINAL": 35,
    "I-LANGUAGE": 36
}


In [4]:
df_train = ds["train"].to_pandas()
df_val = ds["validation"].to_pandas()
df_test = ds["test"].to_pandas()

print(df_train.head())

                                              tokens  \
0  [People, start, their, own, businesses, for, m...   
1  [But, a, chance, to, fill, out, sales, -, tax,...   
2  [Red, tape, is, the, bugaboo, of, small, busin...   
3  [Ironically, ,, the, person, who, wants, to, r...   
4  [Yet, every, business, owner, has, to, face, t...   

                                                tags  
0                        [0, 0, 0, 0, 0, 0, 0, 0, 0]  
1   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]  
2                        [0, 0, 0, 0, 0, 0, 0, 0, 0]  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  


In [5]:
def analyze_sentiment(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity < 0:
        return 0  # Negativo
    elif polarity == 0:
        return 1  # Neutro
    else:
        return 2  # Positivo
        
def process_sentences(df):
    # Crear la columna 'sentence' uniendo los tokens
    df["sentence"] = df["tokens"].apply(lambda x: " ".join(x))

    # Aplicar análisis de sentimiento
    df["SA"] = df["sentence"].apply(analyze_sentiment)
    
    return df

# Aplicar a cada dataset
df_train = process_sentences(df_train)
df_val = process_sentences(df_val)
df_test = process_sentences(df_test)

In [6]:
df_train.head()

Unnamed: 0,tokens,tags,sentence,SA
0,"[People, start, their, own, businesses, for, m...","[0, 0, 0, 0, 0, 0, 0, 0, 0]",People start their own businesses for many rea...,2
1,"[But, a, chance, to, fill, out, sales, -, tax,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",But a chance to fill out sales - tax records i...,2
2,"[Red, tape, is, the, bugaboo, of, small, busin...","[0, 0, 0, 0, 0, 0, 0, 0, 0]",Red tape is the bugaboo of small business .,0
3,"[Ironically, ,, the, person, who, wants, to, r...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Ironically , the person who wants to run his o...",2
4,"[Yet, every, business, owner, has, to, face, t...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Yet every business owner has to face the mound...,2


In [7]:
df_train.tail(5)

Unnamed: 0,tokens,tags,sentence,SA
59919,"[Probably, .]","[0, 0]",Probably .,1
59920,"[Why, ?]","[0, 0]",Why ?,1
59921,"[I, do, n't, think, they, can]","[0, 0, 0, 0, 0, 0]",I do n't think they can,1
59922,"[but, I, think, they, 'll, try, .]","[0, 0, 0, 0, 0, 0, 0]",but I think they 'll try .,1
59923,"[You, know, how, people, are, .]","[0, 0, 0, 0, 0, 0]",You know how people are .,1


In [8]:
import spacy
import re

In [9]:
nlp = spacy.load("en_core_web_sm")

CONTRACTIONS = {
    "n't": "not", 
    "'ll": "will", 
    "'re": "are", 
    "'ve": "have", 
    "'m": "am", 
    "'d": "would", 
    "'s": "is", 
    "won't": "will not", 
    "can't": "cannot"
}
IRRELEVANT_WORDS = {"wow", "oops", "ah", "ugh", "yay", "mhm", "`"}

def replace_contractions(text):
    for contraction, replacement in CONTRACTIONS.items():
        text = re.sub(r"\b" + re.escape(contraction) + r"\b", replacement, text)
    return text

def process_tokens(text):
    text = replace_contractions(text)
    text = text.replace("-", "")
    # Crear un objeto Doc de spaCy para cada palabra
    doc = nlp(text)  # Unir las palabras en una cadena y procesarlas con spaCy
    
    # Filtrar tokens: eliminar puntuación, stopwords y lematizar
    processed_words = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and token.text.lower() not in IRRELEVANT_WORDS and not token.is_digit]
    
    return processed_words

df_train["tokens"] = df_train["sentence"].apply(process_tokens)

In [10]:
df_train.head(15)

Unnamed: 0,tokens,tags,sentence,SA
0,"[People, start, business, reason]","[0, 0, 0, 0, 0, 0, 0, 0, 0]",People start their own businesses for many rea...,2
1,"[chance, fill, sale, , tax, record, rarely]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",But a chance to fill out sales - tax records i...,2
2,"[red, tape, bugaboo, small, business]","[0, 0, 0, 0, 0, 0, 0, 0, 0]",Red tape is the bugaboo of small business .,0
3,"[ironically, person, want, run, business, prob...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Ironically , the person who wants to run his o...",2
4,"[business, owner, face, mound, form, regulatio...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Yet every business owner has to face the mound...,2
5,"[hope, change]","[0, 0, 0, 0, 0, 0]",There is hope of change .,1
6,"[week, Sen., Malcolm, Wallop, LRB, R., Wyo, rr...","[2, 3, 0, 0, 4, 5, 0, 6, 0, 7, 8, 0, 0, 0, 0, ...","Last week , Sen. Malcolm Wallop -LRB- R. , Wyo...",0
7,"[great, federal, regulation, mean, large, enti...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",`` A great many federal regulations are meant ...,2
8,"[lawmaker, busy, try, revive, recently, lapse,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 10, 10, 0, 0...",Other lawmakers are busy trying to revive the ...,2
9,"[optimistic, entrepreneur, await, promise, lan...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Thus , optimistic entrepreneurs await a promis...",0


In [11]:
df_train["sentence"].iloc[6]

'Last week , Sen. Malcolm Wallop -LRB- R. , Wyo . -RRB- held hearings on a bill to strengthen an existing law designed to reduce regulatory hassles for small businesses .'

In [12]:
# df_train.tail(15)

In [13]:
# save data in .csv
if not os.path.exists("data"):
    os.makedirs("data")

df_train.to_csv(f"data/df_train.csv", index=False)
df_val.to_csv(f"data/df_val.csv", index=False)
df_test.to_csv(f"data/df_test.csv", index=False)