In [39]:
import re
import json
import pickle
from pathlib import Path
from collections import OrderedDict, Counter

In [30]:
INPUT_FILE = "../../data/flairs/flairs_processed.jsonl"
OUTPUT_FILE = "../../data/flairs/flars_ready.json"
TEMP_OUTPUT_FILE = "../../data/temp/flars_ready.json"
RAW_FLAIR_FILE = "../../data/flairs/flairs_raw.pkl"

REGION_REGEX_PATTERN = r"((?<=\()(\w+)(?=\)))|((?<=(,\s))(\w+))"
ENTRY_TEMPLATE_KEYS = ("region_original", "region_extended", "region_country")

CRIM Crimea, MORO Northern Ireland

In [19]:
def read_temp_or_initialize_dict(temp_file):
    """Attempts to read already annotated temporary data file if exists,
       else return empty dict to be populated
    """
    if temp_file.exists() and temp_file.is_file():
        try:
            flairs = json.load(temp_file.open("r"))
            print("Existing annotations loaded from {}:".format(TEMP_OUTPUT_FILE))
            print(list(flairs.keys()))
        except json.JSONDecodeError as e:
            raise    
    else:
        flairs = OrderedDict()
    
    return flairs
        

def resolve_region_extended(region_original, input_country):
    country_match = re.search(REGION_REGEX_PATTERN, region_original)
    
    if input_country.strip():
        # User prompt not empty:
        if country_match:
            # Country is already mentioned in `region_original` but so in the user prompt, 
            # meaning it's needed to be corrected with the prompt input
            # e.g. "Moravia (Czechia)" -> "Moravia (Czech Republic)"
            if "," in region_original:
                # Region and country are seperated by a comma
                region, _ = re.split(r",\s*", region_original)
            else:
                # Region and country are seperated by a whitespace, country is in parantheses
                region, _ = re.split(r"\s*\(", region_original)
    
        else:
            # Country is not mentioned in `region_original`, 
            # meaning only region is mentioned in `region_original`
            region = region_original
        
        region_extended = "{} ({})".format(region, input_country)
                
    else:
        # User prompt is empty
        if country_match:
            # All relevant information is already in the `region_original`
            if "," in region_original:
                # Region and country are seperated by a comma 
                # and just needs to be reformatted in the correct format
                # e.g. "Veneto, Italy" -> "Veneto (Italy)"
                region, country = re.split(r",\s*", region_original)
                region_extended = "{} ({})".format(region, country)
            else:
                # Region and country are seperated by a whitespace, country is in parantheses,
                # therefore in the correct format already
                region_extended = region_original
        else:
            # `region_original` only contains country information
            # and therefore in acceptable format already
            region_extended = region_original
            
    return region_extended

In [20]:
assert resolve_region_extended("Poltava (Ukraine)", "") == "Poltava (Ukraine)"
assert resolve_region_extended("Moravia (Czechia)", "Czech Republic") == "Moravia (Czech Republic)"
assert resolve_region_extended("Veneto, Italy","") == "Veneto (Italy)"
assert resolve_region_extended("Lithuania","") == "Lithuania"
assert resolve_region_extended("Åland","Finland") == "Åland (Finland)"

In [26]:
# Reorganizing input .jsonl file to JSON format with original flair ids as dictionary entry keys
flairs = read_temp_or_initialize_dict(temp_file=Path(TEMP_OUTPUT_FILE))

with open(INPUT_FILE, "r", encoding="utf8") as fp:
    for line in fp:
        dict_ = json.loads(line)
        flair_id = dict_["key"]
        region_original = dict_["value"]
        
        # Skip unknown entries and save them to a seperate list
        if not region_original or flair_id in flairs.keys():
            continue
        
        try:
            # Prompting user for country based on the original flair text
            input_country = input("Country of {}: {} ".format(flair_id, region_original))
        except KeyboardInterrupt:
            # If kernel is interrupted, save the existing annotated data to a temp file
            temp_file = Path(TEMP_OUTPUT_FILE)
            temp_file.parent.mkdir(exist_ok=True)
            with temp_file.open("w", encoding="utf8") as fp:
                json.dump(flairs, fp, ensure_ascii=False, indent=4)
            print("Annotated data saved to ", TEMP_OUTPUT_FILE)
            raise
        else:
            # Populating output dictionary values with another empty dict with flair ids as key
            flairs[flair_id] = dict.fromkeys(ENTRY_TEMPLATE_KEYS, None)
            
            region_extended = resolve_region_extended(region_original, input_country)
            country_match = re.search(REGION_REGEX_PATTERN, region_extended)
            region_country = country_match.group(0) if country_match else region_extended

            # Populating the resulting empty dict with processed  values
            entry_values = (region_original, region_extended, region_country)
            for key, value in zip(ENTRY_TEMPLATE_KEYS, entry_values):
                flairs[flair_id][key] = value

            print(flairs[flair_id]) # Printing the output to visualize the resulting entry

Existing annotations loaded from ../../data/temp/flars_ready.json:
dict_keys(['AALA', 'ABRZ', 'ADYG', 'ALBA', 'ALGE', 'AMST', 'APUL', 'ARGE', 'ARME', 'ASTR', 'AT-1', 'AT-2', 'AT-3', 'AT-4', 'AT-5', 'AT-6', 'AT-7', 'AT-8', 'AT-9', 'AUST', 'AZER', 'BANG', 'BELA', 'BELG', 'BOLI', 'BORN', 'BOSN', 'BOUV', 'BRAZ', 'BRUX', 'BUCU', 'BULG', 'CALA', 'CANA', 'CH-AG', 'CH-BE', 'CH-BL', 'CH-BS', 'CH-GE', 'CH-GR', 'CH-LU', 'CH-NE', 'CH-SG', 'CH-SH', 'CH-TI', 'CH-ZG', 'CH-ZH', 'CHIL', 'CHIN', 'CMPN', 'CONN', 'CRIM', 'CROA', 'CUBA', 'CYPR', 'CZ-10', 'CZEC', 'DALM', 'DE-BB', 'DE-BE', 'DE-BW', 'DE-BY', 'DE-HB', 'DE-HE', 'DE-HH', 'DE-MV', 'DE-NI', 'DE-NW', 'DE-RP', 'DE-SH', 'DE-SL', 'DE-SN', 'DE-ST', 'DE-TH', 'DENK', 'DMRP', 'EFRE', 'EGYP', 'EMRM', 'ENGL', 'ES-AN'])
Country of ES-AR: Aragon (Spain) 
{'region_original': 'Aragon (Spain)', 'region_extended': 'Aragon (Spain)', 'region_country': 'Spain'}
Country of ES-AS: Asturias (Spain) 
{'region_original': 'Asturias (Spain)', 'region_extended': 'Asturias (

{'region_original': 'Franconia (Germany)', 'region_extended': 'Franconia (Germany)', 'region_country': 'Germany'}
Country of FVEG: Friuli-Venezia Giulia Italy
{'region_original': 'Friuli-Venezia Giulia', 'region_extended': 'Friuli-Venezia Giulia (Italy)', 'region_country': 'Italy'}
Country of GEOR: Georgia 
{'region_original': 'Georgia', 'region_extended': 'Georgia', 'region_country': 'Georgia'}
Country of GERM: Germany 
{'region_original': 'Germany', 'region_extended': 'Germany', 'region_country': 'Germany'}
Country of GIBR: Gibraltar United Kingdom
{'region_original': 'Gibraltar', 'region_extended': 'Gibraltar (United Kingdom)', 'region_country': 'Gibraltar (United Kingdom)'}
Country of GNLD: Greenland Denmark
{'region_original': 'Greenland', 'region_extended': 'Greenland (Denmark)', 'region_country': 'Denmark'}
Country of GREE: Greece 
{'region_original': 'Greece', 'region_extended': 'Greece', 'region_country': 'Greece'}
Country of GUER: Guernsey United Kingdom
{'region_original': '

{'region_original': 'North Brabant (Netherlands)', 'region_extended': 'North Brabant (Netherlands)', 'region_country': 'Netherlands'}
Country of NL-NH: North Holland (Netherlands) 
{'region_original': 'North Holland (Netherlands)', 'region_extended': 'North Holland (Netherlands)', 'region_country': 'Netherlands'}
Country of NL-OV: Overijssel (Netherlands) 
{'region_original': 'Overijssel (Netherlands)', 'region_extended': 'Overijssel (Netherlands)', 'region_country': 'Netherlands'}
Country of NL-UT: Utrecht (Netherlands) 
{'region_original': 'Utrecht (Netherlands)', 'region_extended': 'Utrecht (Netherlands)', 'region_country': 'Netherlands'}
Country of NL-ZE: Zeeland (Netherlands) 
{'region_original': 'Zeeland (Netherlands)', 'region_extended': 'Zeeland (Netherlands)', 'region_country': 'Netherlands'}
Country of NL-ZH: South Holland (Netherlands) 
{'region_original': 'South Holland (Netherlands)', 'region_extended': 'South Holland (Netherlands)', 'region_country': 'Netherlands'}
Countr

{'region_original': 'Syria', 'region_extended': 'Syria', 'region_country': 'Syria'}
Country of TAIW: Taiwan 
{'region_original': 'Taiwan', 'region_extended': 'Taiwan', 'region_country': 'Taiwan'}
Country of TRAN: Transylvania Romania
{'region_original': 'Transylvania', 'region_extended': 'Transylvania (Romania)', 'region_country': 'Romania'}
Country of TRNT: Trentino-South Tyrol Italy
{'region_original': 'Trentino-South Tyrol', 'region_extended': 'Trentino-South Tyrol (Italy)', 'region_country': 'Italy'}
Country of TUNI: Tunisia 
{'region_original': 'Tunisia', 'region_extended': 'Tunisia', 'region_country': 'Tunisia'}
Country of TURK: Turkey 
{'region_original': 'Turkey', 'region_extended': 'Turkey', 'region_country': 'Turkey'}
Country of TUSC: Tuscany Italy
{'region_original': 'Tuscany', 'region_extended': 'Tuscany (Italy)', 'region_country': 'Italy'}
Country of UA-05: Ukraine 
{'region_original': 'Ukraine', 'region_extended': 'Ukraine', 'region_country': 'Ukraine'}
Country of UA-07: 

In [27]:
with open(OUTPUT_FILE, "w", encoding="utf8") as fp:
    json.dump(flairs, fp, ensure_ascii=False, indent=4)

In [44]:
input_flairs = list()
with open(INPUT_FILE, "r") as fp:
    for line in fp:
        input_flairs.append(json.loads(line)["key"])
        
skipped_flairs = [flair for flair in input_flairs if flair not in flairs]
skipped_flairs += ["CRIM", "MORO"]

print(skipped_flairs)

['ABKH', 'ANDO', 'AOST', 'ARUB', 'BERM', 'BKFG', 'CH-AI', 'CH-SO', 'CURA', 'DENK FORT', 'DOG', 'EART', 'ECUA', 'ES-RI', 'EURO', 'EURO STAR', 'FR-GUAD', 'FR-PACA FORT', 'FT', 'GAGA', 'KASH', 'KURD', 'MYAN', 'NAGO', 'NIST', 'OCCI', 'RROM', 'RUSS STAR', 'SAMI', 'SAUD', 'SI-050', 'SOMA', 'SRPS', 'SV', 'SZEK', 'UA-71', 'ULST', 'YEME', 'CRIM', 'MORO']


In [43]:
with open(RAW_FLAIR_FILE, "rb") as fp:
    raw_flairs = pickle.load(fp)

for key in skipped_flairs:
    print(key)
    counter = dict(sorted(raw_flairs[key].items(), key=lambda x: x[1], reverse=True))
    for flair_text, n in counter.items():
        print(f"\t({flair_text}: {n})")

ABKH
	(Half-Abkhazian half-Swede in Gotland: 35)
	(Half Abkhaz Half Crimean Tatar: 5)
ANDO
	(🇸🇰🇨🇿: 15)
	(Bavarian European 🇪🇺: 1)
	(Catalan Republic: 1)
AOST
	(RBiH: 2)
	(Socialist Utopia: 2)
	(Anarchist: 1)
ARUB
	(East Frisia: 28)
	(Aruba: 12)
	(East Frisian: 8)
	(Само Слога Србина Спасава: 6)
	(Lithuania: 1)
BERM
	(Lithium Anus -ia: 39)
BKFG
	(Anarchist: 2)
CH-AI
	(vcxz: 2)
CH-SO
	(Irnbru for ever 🏴󠁧󠁢󠁳󠁣󠁴󠁿: 4)
CURA
	(Silesia (CZ): 20)
DENK FORT
	(Denmark: 2)
DOG
	(He does it for free: 16)
EART
	(Earth: 2280)
	(Republic of Türkiye: 114)
	(НАТОвской мордой: 89)
	(UA/US/EE/AT/FR/ES: 85)
	(Ceterum autem censeo Unionem Europaeam esse delendam: 67)
	(Life, Liberty and Property: 66)
	(Europe: 60)
	(Слава Україні: 59)
	(Slovenia, EU: 51)
	(Zealand: 41)
	(Republic of Turkiye: 41)
	(Geo-anthropoma: 37)
	(Citizen of the World: 35)
	(Baryonic Middle Finger: 34)
	(Autocracy with European characteristics: 31)
	(Somewhere in Asia: 24)
	(Dutchman living in Hong Kong: 23)
	(Bosnian Kingdom: 21)
	(Puck