In [17]:
import pandas as pd

In [18]:
from framefinder import framedimensions

# Moral-foundation pole descriptions, written as "<Pole>: <description>".
# Entries come in consecutive (positive, negative) pairs that line up with
# `pole_names` below.
# NOTE(review): "Degradation" was previously misspelled "Degredation" in both
# lists. Fixing it changes the label and the text handed to the model, so
# results are not byte-comparable with earlier runs.
dimensions = [
    "Care: ...acted with kindness, compassion, or empathy, or nurtured another person.",
    "Harm: ...acted with cruelty, or hurt or harmed another person/animal and caused suffering.",
    "Fairness: ...acted in a fair manner, promoting equality, justice, or rights.",
    "Cheating: ...was unfair or cheated, or caused an injustice or engaged in fraud.",
    "Loyalty: ...acted with fidelity, or as a team player, or was loyal or patriotic.",
    "Betrayal: ...acted disloyal, betrayed someone, was disloyal, or was a traitor.",
    "Authority: ...obeyed, or acted with respect for authority or tradition.",
    "Subversion: ...disobeyed or showed disrespect, or engaged in subversion or caused chaos.",
    "Sanctity: ...acted in a way that was wholesome or sacred, or displayed purity or sanctity.",
    "Degradation: ...was depraved, degrading, impure, or unnatural.",
]
# One (positive, negative) label pair per framing axis, in the same order as
# the description pairs in `dimensions`.
pole_names = [
    ("Care", "Harm"),
    ("Fairness", "Cheating"),
    ("Loyalty", "Betrayal"),
    ("Authority", "Subversion"),
    ("Sanctity", "Degradation"),
]
# Model identifier passed to FramingDimensions
# (presumably a sentence-transformers checkpoint name -- confirm in framefinder).
base_model = "all-mpnet-base-v2"
# Callable used further down to score lists of sentences.
framing_dimensions = framedimensions.FramingDimensions(
    base_model, dimensions, pole_names
)

In [32]:
# Raw test sentences, including the artefacts whose influence on the framing
# scores is being examined (split words, quotes, hyphens, U+FFFD replacement
# characters, bullets, dashes, currency/percent signs).
sentences = [
    "sylla-\nbles",
    "Does capitalization change the framing?",
    "Does \"citation\" \'change\' the framing?",
    "co-sponsored",
    "state-of-the-art",
    "The reforms needed to address market barriers to renewables include:",
    "� Streamlined and uniform planning procedures and permitting systems and integrated least cost network planning.",
    "� Fair access to the grid at fair, transparent prices and removal of discriminatory access and transmission tariffs.",
    "� Fair and transparent pricing for power throughout a network, with recognition and remuneration for the benefits of embedded generation.",
    "� Unbundling of utilities into separate generation and distribution companies.",
    "� The costs of grid infrastructure development and reinforcement must be carried by the grid management authority rather than individual renewable energy projects.",
    "� Disclosure of fuel mix and environmental impact to end users to enable consumers to make an informed choice of power source.",
    # The leading form feed (page-break artefact) is written as an escape so it
    # is visible in the source; the recorded filter output shows it as '\x0c'.
    "\x0cSoaring above its nest on New York's Fifth Avenue, this red-tailed hawk signals a resurgence of many raptors nationwide, including bald eagles and ospreys.",
    "• Funding available to forested developing countries that is sufficient, sustainable and predictable enough to incentivise serious, sustained commitment to REDD action by developing.",
    "- build capacity and develop adequate strategies;",
    "- pay developing countries for verified forest emissions reductions.",
    "The funding requirement is estimated variously at between US$15 and 40 billion per annum, with in the order of 90% used for payments for verified forest emissions reductions.",
    "• Safe guards against perverse carbon outcomes, including protecting peat soils.",
]
# Hand-cleaned counterparts of `sentences`, index-aligned one-to-one.
# The second entry is upper-cased on purpose (capitalization probe).
sentences_without_symbols = [
    # Previously still contained the hyphen + newline split, which made the
    # first pair identical in both lists and the comparison meaningless.
    "syllables",
    "DOES CAPITALIZATION CHANGE THE FRAMING?",
    "Does citation change the framing?",
    "co sponsored",
    "state of the art",
    "The reforms needed to address market barriers to renewables include",
    "Streamlined and uniform planning procedures and permitting systems and integrated least cost network planning.",
    "Fair access to the grid at fair, transparent prices and removal of discriminatory access and transmission tariffs.",
    "Fair and transparent pricing for power throughout a network, with recognition and remuneration for the benefits of embedded generation.",
    "Unbundling of utilities into separate generation and distribution companies.",
    "The costs of grid infrastructure development and reinforcement must be carried by the grid management authority rather than individual renewable energy projects.",
    "Disclosure of fuel mix and environmental impact to end users to enable consumers to make an informed choice of power source.",
    "Soaring above its nest on New York's Fifth Avenue, this red-tailed hawk signals a resurgence of many raptors nationwide, including bald eagles and ospreys.",
    "Funding available to forested developing countries that is sufficient, sustainable and predictable enough to incentivise serious, sustained commitment to REDD action by developing.",
    "build capacity and develop adequate strategies;",
    "pay developing countries for verified forest emissions reductions.",
    "The funding requirement is estimated variously at between US15 and 40 billion per annum, with in the order of 90 percent used for payments for verified forest emissions reductions.",
    "Safe guards against perverse carbon outcomes, including protecting peat soils.",
]

In [28]:
import re


def filter_special_characters(texts, remove_newlines=True):
    """Strip layout artefacts (numbering, quotes, bullets, references) from text.

    Args:
        texts: A single string or an iterable of strings to clean.
        remove_newlines: When True (default), every newline and tab is replaced
            by one space so that titles and wrapped sentences end up on one line.

    Returns:
        A list with one cleaned string per input text; a single string input
        yields a one-element list. Whitespace left behind by removed symbols
        is neither collapsed nor stripped.
    """
    if isinstance(texts, str):
        texts = [texts]
    filtered_texts = []
    for text in texts:
        # TODO: section labels such as "B.3.2" are not handled yet; candidate pattern:
        # (A|C|B|D)\.( )?((1|2|3|4|5|6|7|8|9)(\.(1|2|3|4|5|6|7|8|9))?)?

        # remove lines without letters in them; MULTILINE makes ^/$ anchor at
        # every line instead of only at the start/end of the whole string
        text = re.sub(r"^[^a-zA-Z]*$", "", text, flags=re.MULTILINE)
        # remove numbering that ends with a dot at the beginning of a line,
        # e.g. "1. sentence\n1.1. sentence\n1.1.1. sentence". The trailing dot is
        # required (and must not be followed by a digit) so that quantities such
        # as "40 billion", "90%" or "1.5°C" at a line start are left intact.
        text = re.sub(r"^\d+(?:\.\d+)*\.(?!\d)", "", text, flags=re.MULTILINE)
        # remove double quotes
        text = re.sub(r"\"", "", text)
        # replace the Unicode replacement character (U+FFFD) with a space rather
        # than deleting it, since situations like "word1�word2" occur
        text = re.sub(r"�", " ", text)
        # fuse words split across lines, e.g. "syllab-\nle"
        text = re.sub(r"-\n", "", text)
        # remove references and links e.g. {3.6, 10.3}, {see Box3} or {FAQ 9.2, Figure 1}
        text = re.sub(r"\{.*?\}", "", text)

        # remove enumeration markers
        # the hyphen is only removed together with a following space to prevent
        # splitting of words like "co-sponsored"
        # (no empty alternative here: an empty branch matches first at every
        # position and makes all alternatives after it unreachable)
        text = re.sub(r"\*|•|■|▪|- |❖|►|»»|>>|<<|□□", "", text)
        if remove_newlines:
            # fuses titles, headers, footers, etc. into a single sentence => better normalization
            # also prevents single sentences from being split into multiple ones by newlines
            text = re.sub(r"[\n\t]", " ", text)
        filtered_texts.append(text)

        # ? should %, $, £, € be removed?
    return filtered_texts


In [33]:
# Run the cleaner over the raw test sentences and print the resulting list
# so the effect of each substitution can be inspected by eye.
print(filter_special_characters(sentences))

['syllables', 'Does capitalization change the framing?', "Does citation 'change' the framing?", 'co-sponsored', 'state-of-the-art', 'The reforms needed to address market barriers to renewables include:', '  Streamlined and uniform planning procedures and permitting systems and integrated least cost network planning.', '  Fair access to the grid at fair, transparent prices and removal of discriminatory access and transmission tariffs.', '  Fair and transparent pricing for power throughout a network, with recognition and remuneration for the benefits of embedded generation.', '  Unbundling of utilities into separate generation and distribution companies.', '  The costs of grid infrastructure development and reinforcement must be carried by the grid management authority rather than individual renewable energy projects.', '  Disclosure of fuel mix and environmental impact to end users to enable consumers to make an informed choice of power source.', "\x0cSoaring above its nest on New York'

In [18]:
# Score the raw sentences and write the result straight to disk.
# (The return type of framing_dimensions is not visible here; it is whatever
# pd.DataFrame accepts.)
pd.DataFrame(framing_dimensions(sentences)).to_csv("with_symbols.csv", index=False)

# Same for the hand-cleaned sentences; `df` ends up bound to this second
# frame, exactly as before.
df = pd.DataFrame(framing_dimensions(sentences_without_symbols))
df.to_csv("without_symbols.csv", index=False)