In [None]:
from functools import partial
import pandas as pd
import re
import yaml


# POLITICAL VARIETY

In this notebook, every article is assigned the viewpoints it represents, approximated by the political parties mentioned in its text.

To find out which political parties are mentioned, we use a manually defined regular expression that looks for party names, abbreviations or person names of politicians in the text, and maps those to the political party they represent.

## Input Data

* `item_metadata.csv` - Metadata of all articles that were recommended
    * item (int) - Item identifier
    * text (str) - Article text

   
* `political_parties.json` - JSON containing the party names, abbreviations and person names to look for and the political party they represent.

For example:

```
{
    "Fratelli d'Italia": "Fratelli d'Italia",
    "Giorgia Meloni": "Fratelli d'Italia",
    "Fratelli d&rsquo;Italia": "Fratelli d'Italia",
    "DB": "Fratelli d'Italia"
}
```

## Output

This notebook writes one file to the `base_folder`:

* `item_metadata_w_tags.csv` - Item metadata augmented with a `tag` column containing the political party mentioned in the article. Articles in which no party was recognised are left out.
    * item (int)
    * text (str)
    * tag (str)

In [None]:
# Folder the input files are read from and the output CSV is written to.
base_folder = "data"

In [None]:
# Load the article metadata; missing values are replaced by empty strings
# so that later string operations on the columns do not hit NaN.
recommended_articles = (
    pd.read_csv(f"{base_folder}/item_metadata.csv")
    .fillna("")
)

In [None]:
# Load the term -> party mapping.
# A context manager guarantees the file handle is closed, and the explicit
# UTF-8 encoding keeps accented names intact regardless of the platform's
# default encoding.
with open(f"{base_folder}/political_parties.json", "r", encoding="utf-8") as mapping_file:
    mapping = yaml.safe_load(mapping_file)

In [None]:
def check_regex(x, terms=None):
    """Return the search terms that occur in the text ``x``.

    Matching is case-insensitive and on whole terms only: a term must not be
    directly preceded or followed by a word character, but it may sit at the
    very start or end of the text. Terms are treated as literal strings, so
    characters such as ``+`` or ``.`` in a term carry no regex meaning.

    Parameters
    ----------
    x : str
        Text to scan.
    terms : iterable of str, optional
        Terms to look for. When omitted, the keys of the notebook-level
        ``mapping`` are used.

    Returns
    -------
    tuple of str or None
        The distinct matched terms, spelled exactly as given in ``terms``
        and ordered by first occurrence in ``x``; ``None`` when nothing
        matched.
    """
    if terms is None:
        terms = mapping.keys()

    # Index the terms by their lower-cased spelling: the text is searched in
    # lower case, but a hit is reported with the original spelling so that a
    # later ``mapping.get(term)`` lookup succeeds.
    canonical = {}
    for term in terms:
        canonical.setdefault(term.lower(), term)
    if not canonical:
        # An empty alternation would match at every position.
        return None

    # Longest alternatives first, so a short term cannot pre-empt a longer
    # term that starts at the same position.
    alternatives = sorted(canonical, key=len, reverse=True)
    pattern = (
        r'(?<!\w)(?:'
        + r'|'.join(re.escape(alt) for alt in alternatives)
        + r')(?!\w)'
    )

    # dict.fromkeys removes repeated hits while keeping first-seen order.
    found = dict.fromkeys(
        canonical[hit.group(0)] for hit in re.finditer(pattern, x.lower())
    )
    return tuple(found) if found else None

In [None]:
# Run the term search on every article text and keep the result of
# check_regex (a tuple of matches, or None) in a new "political" column.
recommended_articles = recommended_articles.assign(
    political=recommended_articles["text"].map(check_regex)
)

In [None]:
# Number of items for which at least one term was found
recommended_articles["political"].notna().sum()

In [None]:
# Number of items per distinct value of the "political" column
recommended_articles.groupby("political")["item"].count()

In [None]:
# One row per (item, matched term); each term is then translated to the
# political entity stored as its value in the mapping (None if unknown).
recommended_political_entities = (
    recommended_articles
    .explode("political")
    .assign(mapped_political=lambda exploded: exploded["political"].map(mapping.get))
)
recommended_political_entities["mapped_political"].unique()

In [None]:
# Build the output table: item, text and the mapped party as "tag".
# Only the tag column decides whether a row is kept, so a missing value in
# some unrelated column can never silently remove a tagged article.
# Exact duplicate rows are removed so that an article carries each party
# at most once.
item_metadata_w_tags = (
    recommended_political_entities
    .dropna(subset=["mapped_political"])
    .loc[:, ["item", "text", "mapped_political"]]
    .rename(columns={"mapped_political": "tag"})
    .drop_duplicates()
)

In [None]:
# Write the tagged items next to the input data, without the index column.
item_metadata_w_tags.to_csv(
    f"{base_folder}/item_metadata_w_tags.csv",
    index=False,
)