In [1]:
import os
import pandas as pd
import regex as re
import wikipediaapi
import requests
import nltk
# nltk.download('punkt') # needed to download punkt once

In [2]:
# Build the framing-dimension scorer that frame_dimensions_from_article calls
# once per article.
from framefinder import framedimensions
# Model identifier handed to FramingDimensions
# (presumably a sentence-transformers checkpoint name — TODO confirm).
base_model = "all-mpnet-base-v2"
# One "Name: description" string per pole. Consecutive entries form an opposing
# pair (Care/Harm, Fairness/Cheating, ...), in the same order as pole_names below.
dimensions = [
    "Care: ...acted with kindness, compassion, or empathy, or nurtured another person.",
    "Harm: ...acted with cruelty, or hurt or harmed another person/animal and caused suffering.",
    "Fairness: ...acted in a fair manner, promoting equality, justice, or rights.",
    "Cheating: ...was unfair or cheated, or caused an injustice or engaged in fraud.",
    "Loyalty: ...acted with fidelity, or as a team player, or was loyal or patriotic.",
    "Betrayal: ...acted disloyal, betrayed someone, was disloyal, or was a traitor.",
    "Authority: ...obeyed, or acted with respect for authority or tradition.",
    "Subversion: ...disobeyed or showed disrespect, or engaged in subversion or caused chaos.",
    "Sanctity: ...acted in a way that was wholesome or sacred, or displayed purity or sanctity.",
    # NOTE(review): "Degredation" looks like a misspelling of "Degradation". It is
    # left as-is here and in pole_names because the string is passed to the model;
    # changing it may alter scores and any output keyed by this name — confirm first.
    "Degredation: ...was depraved, degrading, impure, or unnatural.",
]
# The pole names grouped into pairs; each name equals the text before the colon
# of one entry in `dimensions`.
pole_names = [
    ("Care", "Harm"),
    ("Fairness", "Cheating"),
    ("Loyalty", "Betrayal"),
    ("Authority", "Subversion"),
    ("Sanctity", "Degredation"),
]
# Callable object used later as framing_dimensions(list_of_sentences).
framing_dimensions = framedimensions.FramingDimensions(
    base_model, dimensions, pole_names
)

In [None]:
# Build the frame-label classifier that frame_labels_from_article calls.
# This cell must be run before frame_labels_from_article is used, otherwise
# `framing_labels` is undefined.
from framefinder import framelabels
# NOTE(review): base_model is reassigned here with the same value as in the
# dimensions cell, but the FramingLabels call below does not use it — it is
# given the literal "facebook/bart-large-mnli" instead.
base_model = "all-mpnet-base-v2"
# Candidate frames as "Name: description" strings, passed verbatim to FramingLabels.
candidate_labels = [
    "Economic: costs, benefits, or other financial implications",
    "Capacity and resources: availability of physical, human or financial resources, and capacity of current systems",
    "Morality: religious or ethical implications",
    "Fairness and equality: balance or distribution of rights, responsibilities, and resources",
    "Legality, constitutionality and jurisprudence: rights, freedoms, and authority of individuals, corporations, and government",
    "Policy prescription and evaluation: discussion of specific policies aimed at addressing problems",
    "Crime and punishment: effectiveness and implications of laws and their enforcement",
    "Security and defense: threats to welfare of the individual, community, or nation",
    "Health and safety: health care, sanitation, public safety",
    "Quality of life: threats and opportunities for the individual’s wealth, happiness, and well-being",
    "Cultural identity: traditions, customs, or values of a social group in relation to a policy issue",
    "Public opinion: attitudes and opinions of the general public, including polling and demographics",
    "Political: considerations related to politics and politicians, including lobbying, elections, and attempts to sway voters",
    "External regulation and reputation: international reputation or foreign policy of the U.S.",
    "Other: any coherent group of frames not covered by the above categories",
]

# Callable object used later as framing_labels(list_of_sentences).
framing_labels = framelabels.FramingLabels("facebook/bart-large-mnli", candidate_labels)

In [3]:
def clean_string(
    text, remove_headers=True, remove_enumerations=True
):
    """Strip MediaWiki markup from raw article source and return plain text.

    The patterns expect wikitext in which ``<`` is written as ``&lt;`` (the
    form produced by ``extract_source`` from the edit-page HTML). The cleanup
    is heuristic and has not been validated against every article layout.

    Args:
        text: Raw wikitext of one article.
        remove_headers: If True, delete whole ``== Heading ==`` lines; if
            False, only strip the ``=`` runs and keep the heading text.
        remove_enumerations: If True, delete whole ``*`` bullet lines; if
            False, only strip the leading ``* `` marker.

    Returns:
        The cleaned text with runs of newlines collapsed to a single newline.
    """
    # Drop everything from the "See also" heading onwards. Spaces around the
    # title are optional so both "==See also==" and "== See also ==" match.
    text = re.split(r"==\s*See also\s*==", text)[0]
    # {{Asof|YYYY|MM}} -> "as of MM/YYYY"; must run before templates are removed.
    text = re.sub(r"\{\{Asof\|(\d{4})\|(\d{1,2})\}\}", r"as of \2/\1", text)
    # Remove {{templates}} innermost-first until none are left. A single greedy
    # "{{.*}}" would also delete the prose between two templates on one line.
    innermost_template = re.compile(r"\{\{[^{}]*\}\}")
    while True:
        text, removed = innermost_template.subn("", text)
        if removed == 0:
            break
    # Remove <!-- ... --> comments, including ones that span several lines.
    text = re.sub(r"&lt;!--.*?-->", "", text, flags=re.DOTALL)
    text = re.sub(r"&lt;!--", "", text)  # stray comment opener
    text = re.sub(r"-->", "", text)  # stray comment closer
    if remove_enumerations:
        text = re.sub(r"\n\*.*", "\n", text)  # drop whole bullet lines
    else:
        text = re.sub(r"\n\* ", "\n", text)  # remove enumeration symbol *
    text = re.sub(r"\n#", "\n", text)  # remove enumeration symbol #
    # Self-closing <ref ... /> first: otherwise the paired pattern below would
    # start at a self-closing tag and swallow the text up to a later </ref>.
    text = re.sub(r"&lt;ref[^>]*/>", "", text)
    text = re.sub(r"&lt;ref.*?&lt;/ref>", "", text)  # remove <ref>...</ref>
    if remove_headers:
        text = re.sub(
            r"==+.*==+\n", "", text
        )  # remove lines containing ==, ===, ====, ...
    else:
        text = re.sub(r"==+", "", text)  # remove ==, ===, ====, ...
    # Wiki bold/italic markup is two or more apostrophes; a single apostrophe
    # is ordinary punctuation ("It's") and must be kept.
    text = re.sub(r"'{2,}", "", text)
    text = re.sub(r"\xa0", " ", text)  # replace non-breaking space with space
    text = re.sub(
        r"\[\[File:(?:\[\[[^\]]*?\]\]|.)*?\]\]", "", text
    )  # remove [[File:...]]
    text = re.sub(
        r"\[\[[^\|\]]*\|([^\]]+)\]\]", r"[[\1]]", text
    )  # replace [[target|label]] with [[label]]
    text = text.replace(r"[[", "").replace("]]", "")  # remove [[ and ]]
    text = re.sub(
        r"\{\|(?:(?:\{\|(?:(?:\{\|(?:[^{}])*\|\})|(?:[^{}]))*\|\})|(?:[^{}]))*\|\}",
        "",
        text,
    )  # remove {| ... |} tables, nested up to three levels
    # Fallback for templates the innermost-first pass could not match
    # (e.g. ones containing a single "{" or "}").
    text = re.sub(r"\{\{(?:\n|.)*?\}\}", "", text)
    text = re.sub(r"mini\|.*\|", "", text)
    text = re.sub(r"mini\|", "", text)
    text = re.sub(r":\* .*ISBN.*", "", text)  # remove book references
    text = re.sub(r"Kategorie:.*", "", text)  # remove "Kategorie:" (category) lines
    text = re.sub(r"\n+", "\n", text)  # replace multiple newlines with one
    return text


def extract_source(text):
    """Return the contents of the first ``<textarea>`` element found in ``text``.

    The result is everything between the first opening ``<textarea ...>`` tag
    and the next ``</textarea>``; if there is no closing tag, the remainder of
    that piece is returned. Raises IndexError when ``text`` contains no
    opening ``<textarea>`` tag.
    """
    pieces = re.split(r"<textarea[^>]*>", text)
    # pieces[0] is what precedes the first opening tag; pieces[1] follows it.
    after_opening_tag = pieces[1]
    contents, _, _ = after_opening_tag.partition("</textarea>")
    return contents



def tokenize_articles(articles):
    """Split each article into sentences with ``nltk.sent_tokenize``.

    Returns a new list holding one list of sentences per input article, in
    the same order as ``articles``.
    """
    return [nltk.sent_tokenize(article_text) for article_text in articles]



In [15]:
def fetch_article_api(article_title, language="en"):
    """Fetch the text of a Wikipedia page through the ``wikipediaapi`` client.

    For English pages the text is cut at the first occurrence of each of a
    few trailing-section markers ("See also", "References", ...), so those
    sections and everything after them are dropped. Other languages are
    returned unmodified.
    """
    client = wikipediaapi.Wikipedia(
        "FramingAnalysis (riedl.manuel.privat@gmail.com)",
        language,
        extract_format=wikipediaapi.ExtractFormat.WIKI,
    )
    article = client.page(article_title).text
    if language != "en":
        return article

    # Applied in this order; each split keeps only what precedes the marker.
    trailing_section_markers = (
        r"\nSee also\n",
        r"\nReferences\n",
        r"\nSignificant publications\n",
        r"\nPublications\n",
        r"\n== References ==",
    )
    for marker in trailing_section_markers:
        article = re.split(marker, article)[0]
    return article


def fetch_wiki_articles_http(
    article_title, language="en", timeout=30
):
    """Download an article's wikitext from its edit page and clean it.

    Requests ``index.php?title=<title>&action=edit`` on the given language
    edition, extracts the ``<textarea>`` contents with ``extract_source`` and
    passes them through ``clean_string``.

    Args:
        article_title: Page title as used in Wikipedia URLs.
        language: Subdomain of the Wikipedia edition, e.g. "en".
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout the call can block indefinitely.

    Returns:
        The cleaned article text.
    """
    response = requests.get(
        f"https://{language}.wikipedia.org/w/index.php",
        # Passing the query as params lets requests percent-encode the title,
        # so titles containing "&", "?", "#" or "+" no longer corrupt the URL.
        params={"title": article_title, "action": "edit"},
        timeout=timeout,
    )
    text = extract_source(response.text)
    return clean_string(text)

def fetch_article_locally(article_title, path="articles/"):
    """Read a previously saved article from ``<path>/<article_title>.txt``.

    Args:
        article_title: File name without the ``.txt`` extension.
        path: Directory holding the saved articles. Defaults to "articles/",
            the default target of ``save_articles_locally``; a trailing
            separator is optional.

    Returns:
        The file contents decoded as UTF-8.

    Raises:
        FileNotFoundError: If no file exists for ``article_title``.
    """
    with open(os.path.join(path, article_title + ".txt"), "r", encoding="utf8") as file:
        return file.read()

def fetch_articles(article_titles, language="en", fetch_method="api"):
    """Fetch every title in ``article_titles`` and return the texts in order.

    ``fetch_method`` selects the source: "api" (wikipediaapi), "http" (edit
    page scrape) or "local" (files saved earlier). Each title is printed
    before it is fetched.

    Raises:
        ValueError: If ``fetch_method`` is not one of the three names, or if
            a fetch returns an empty result for a title.
    """
    # Lambdas keep the fetch helpers from being looked up until they are needed.
    fetchers = {
        "api": lambda title: fetch_article_api(title, language),
        "http": lambda title: fetch_wiki_articles_http(title, language),
        "local": lambda title: fetch_article_locally(title),
    }

    fetched = []
    for article_title in article_titles:
        print(article_title)
        fetch = fetchers.get(fetch_method)
        if fetch is None:
            raise ValueError("Invalid fetch method")

        text = fetch(article_title)
        if not text:
            raise ValueError(f"Article '{article_title}' not found")
        fetched.append(text)

    return fetched


def save_articles_locally(articles, file_names, path="articles/"):
    """Write each article to ``<path>/<file_name>.txt`` as UTF-8.

    Args:
        articles: Article texts to write.
        file_names: File names (without extension); ``file_names[i]`` is used
            for ``articles[i]``.
        path: Target directory, created if missing. A trailing separator is
            optional.

    Raises:
        IndexError: If ``file_names`` is shorter than ``articles``.
    """
    os.makedirs(path, exist_ok=True)
    for i, article in enumerate(articles):
        # os.path.join (instead of string concatenation) keeps the file inside
        # `path` even when `path` has no trailing separator.
        target = os.path.join(path, file_names[i] + ".txt")
        with open(target, "w", encoding="utf8") as file:
            file.write(article)


In [9]:
def frame_dimensions_from_article(articles, article_titles, path="dumps/"):
    """Score every article with ``framing_dimensions`` and dump the results.

    For each article the result of ``framing_dimensions(article)`` is wrapped
    in a DataFrame and written to ``<path>/<title>.csv`` (without the index).
    Each title is printed before its article is processed.

    Args:
        articles: One entry per article, in the form ``framing_dimensions``
            accepts (the notebook passes lists of sentences).
        article_titles: Titles used as CSV file names; ``article_titles[i]``
            belongs to ``articles[i]``.
        path: Output directory, created if missing.

    Returns:
        A list with one DataFrame per article, in input order (empty for
        empty input).
    """
    os.makedirs(path, exist_ok=True)
    dfs = []
    for i, article in enumerate(articles):
        print(article_titles[i])
        dimensions_df = pd.DataFrame(framing_dimensions(article))
        dimensions_df.to_csv(os.path.join(path, article_titles[i] + ".csv"), index=False)
        # Collect inside the loop so every article's frame is returned,
        # not only the last one.
        dfs.append(dimensions_df)
    return dfs

def frame_labels_from_article(articles, article_titles, path="dumps/"):
    """Classify every article with ``framing_labels`` and dump the results.

    For each article the result of ``framing_labels(article)`` is wrapped in
    a DataFrame and written to ``<path>/<title>.csv`` (without the index).

    Args:
        articles: One entry per article, in the form ``framing_labels``
            accepts (the notebook passes lists of sentences).
        article_titles: Titles used as CSV file names; ``article_titles[i]``
            belongs to ``articles[i]``.
        path: Output directory, created if missing. The default is the same
            as for ``frame_dimensions_from_article``, so pass distinct
            directories when using both to avoid overwriting files.

    Returns:
        A list with one DataFrame per article, in input order (empty for
        empty input).
    """
    os.makedirs(path, exist_ok=True)
    dfs = []
    for i, article in enumerate(articles):
        labels_df = pd.DataFrame(framing_labels(article))
        labels_df.to_csv(os.path.join(path, article_titles[i] + ".csv"), index=False)
        # Collect inside the loop so every article's frame is returned,
        # not only the last one.
        dfs.append(labels_df)
    return dfs

In [26]:
# Collect article titles: every file in `path` lists one title per line.
path = "./article_titles_by_category/"
article_titles = []
# os.listdir returns entries in arbitrary order; sorting makes the title order
# (and the order of every list derived from it) reproducible across runs.
for file in sorted(os.listdir(path)):
    category_file = os.path.join(path, file)
    # Skip hidden entries and directories (e.g. .ipynb_checkpoints), which
    # would otherwise crash open() or be read as titles.
    if file.startswith(".") or not os.path.isfile(category_file):
        continue
    with open(category_file, "r", encoding="utf8") as f:
        # Blank lines would become empty titles and fail later when fetched.
        article_titles.extend(
            line.strip() for line in f.read().splitlines() if line.strip()
        )

In [27]:
# One-time download steps, kept commented out so a normal run only reads the
# copies already saved on disk. Un-comment one pair to (re)build those copies.
# NOTE(review): the "http" pair saves to "articles_http/", while
# fetch_method="local" below reads from "articles/".
#articles_http = fetch_articles(article_titles, fetch_method="http")
#save_articles_locally(articles_http, article_titles, path="articles_http/")

#articles_api = fetch_articles(article_titles, fetch_method="api")
#save_articles_locally(articles_api, article_titles)

# Load the raw text of every title from local files; fetch_articles prints
# each title as it is processed and raises if a file is empty.
articles = fetch_articles(article_titles, fetch_method="local")

Causes_of_climate_change
Climate_change
Climate_crisis
2_degree_climate_target
2019_in_climate_change
2020_in_climate_change
2020s_in_environmental_history
2021_in_climate_change
2022_in_climate_change
2023_in_climate_change
Abrupt_climate_change
Assisted_migration
Climate_Action_Africa
Climate_change_adaptation
Climate_change_and_cities
Climate_change_litigation
Climate_change,_food_security,_and_migration
Climate_inertia
Climate_Information_Service
Climate_psychology
Climate_security
Climate_spiral
Coastal_flooding
Deforestation_and_climate_change
Dry_Corridor
Early_anthropocene
Economic_analysis_of_climate_change
Climate_emergency_declaration
Climate_engineering
Flight_shame
General_circulation_model
Global_cooling
Global_dimming
Global_surface_temperature
Global_terrestrial_stilling
Global_warming_hiatus
Greenhouse_gas_emissions
Greenland_ice_core_project
History_of_climate_change_science
Hyperthermal_event
Infrared_window
Marsh_organ
Ocean_heat_content
Oslo_Principles
Ozone_deplet

In [28]:
# Split every article into sentences, then pass the sentence lists to
# frame_dimensions_from_article, which writes one CSV per title under
# dumps/dimensions/.
# NOTE(review): this rebinds `articles` from raw strings to lists of sentences,
# so re-running this cell without first re-running the loading cell hands
# lists (not strings) to nltk.sent_tokenize.
articles = tokenize_articles(articles)
dimensions_dfs = frame_dimensions_from_article(articles, article_titles, path="dumps/dimensions/")
# Label classification is disabled; enabling it requires `framing_labels` to
# have been defined first.
#labels_dfs = frame_labels_from_article(articles, article_titles, path="dumps/labels/")


Causes_of_climate_change
Climate_change
Climate_crisis
2_degree_climate_target
2019_in_climate_change
2020_in_climate_change
2020s_in_environmental_history
2021_in_climate_change
2022_in_climate_change
2023_in_climate_change
Abrupt_climate_change
Assisted_migration
Climate_Action_Africa
Climate_change_adaptation
Climate_change_and_cities
Climate_change_litigation
Climate_change,_food_security,_and_migration
Climate_inertia
Climate_Information_Service
Climate_psychology
Climate_security
Climate_spiral
Coastal_flooding
Deforestation_and_climate_change
Dry_Corridor
Early_anthropocene
Economic_analysis_of_climate_change
Climate_emergency_declaration
Climate_engineering
Flight_shame
General_circulation_model
Global_cooling
Global_dimming
Global_surface_temperature
Global_terrestrial_stilling
Global_warming_hiatus
Greenhouse_gas_emissions
Greenland_ice_core_project
History_of_climate_change_science
Hyperthermal_event
Infrared_window
Marsh_organ
Ocean_heat_content
Oslo_Principles
Ozone_deplet