<a href="https://colab.research.google.com/github/IvanDePivan/2AMM30-groep-2-component-1/blob/main/cleaning_nobel_articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install ftfy
!pip install srsly


Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m51.2/53.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1


In [3]:
from pathlib import Path
import ftfy
import random
import srsly
import re

In [4]:
# Every regular file found (recursively) below the raw dump directory.
# NOTE(review): the path is relative to the working directory and presumably needs
# Google Drive to be mounted first; no mount step is visible here -- confirm.
pathlist = list(
    filter(Path.is_file, Path('drive/MyDrive/enwiki20230820/raw/').rglob('*'))
)

# Regular expressions that an article's text must ALL match to be kept.
# NOTE(review): they are applied with plain re.search, i.e. case-sensitively, so
# 'nobel' does not match "Nobel" -- confirm that this is intended.
search_terms = [
    'nobel',
    'prize',
    'won|laureate',
]

In [5]:
# Read files using srsly (generators)
from itertools import chain

def get_json_readers():
  """Return one lazy iterator over the records of every file in `pathlist`.

  Each path is opened with srsly.read_jsonl and the per-file readers are chained
  back to back, in `pathlist` order.
  """
  per_file_readers = map(srsly.read_jsonl, pathlist)
  return chain.from_iterable(per_file_readers)

def read_random_json():
  """Read one randomly chosen file from `pathlist` and keep its matching articles.

  An article is kept when its "text" is non-empty and every pattern in
  `search_terms` is found in it (same criterion as read_all_json).

  Returns:
    list of the matching article records from that single file.
  """
  data = srsly.read_jsonl(random.choice(pathlist))
  # all(...) is required here: a bare generator expression used as a condition is
  # always truthy, which would let every article through regardless of its text.
  articles = [
      article for article in data
      if len(article["text"]) > 0
      and all(re.search(term, article["text"]) is not None for term in search_terms)
  ]

  return articles

def read_all_json():
  """Scan every record from get_json_readers() and return the matching articles.

  A record is kept when its "text" is non-empty and every pattern in
  `search_terms` is found in that text. The number of kept records is printed.

  Returns:
    list of the matching article records.
  """
  def is_wanted(record):
    body = record["text"]
    if not len(body) > 0:
      return False
    return all(re.search(pattern, body) is not None for pattern in search_terms)

  articles = list(filter(is_wanted, get_json_readers()))
  print(f'found {len(articles)} nonempty nobell prize laureate articles')
  return articles


In [6]:
# Build the filtered-article cache only when it does not exist yet, so that
# "Restart & Run All" does not rescan every raw dump file each time.
# Delete the JSON file to force a rebuild (e.g. after changing `search_terms`).
if not Path('drive/MyDrive/enwiki20230820/nobel_articles.json').exists():
  articles = read_all_json()
  srsly.write_json('drive/MyDrive/enwiki20230820/nobel_articles.json', articles)

found 42 nonempty nobell prize laureate articles


In [7]:
# Load the filtered article list back from the JSON file written by the previous cell.
articles = srsly.read_json('drive/MyDrive/enwiki20230820/nobel_articles.json')

In [8]:
# Pick a random article to test cleaning on.
# No seed is set in the visible cells, so a different article may be picked on every run.
article = random.choice(articles)

In [9]:
# Note: the order of these regex passes matters. Line-anchored patterns (the
# ordered-list markers) must run while the newlines are still present, and repeated
# spaces are collapsed only after the passes that can create them ('\n' -> ' ').
paren_leading_separators = re.compile(r'\((?:[;,] ?)+')
ordered_list = re.compile(r'^[A-Z0-9]{1,2}\.', re.MULTILINE)
abbreviated_year_range = re.compile(r'(?<!\d)([12]\d)(\d{2})[–/](\d{2})(?!\d)')


def _clean_article_text(text):
  """Return `text` after the ftfy repair and the regex clean-up passes.

  Args:
    text: raw article text.

  Returns:
    The cleaned text, with newlines replaced by single spaces.
  """
  cleaned = ftfy.fix_text(text)
  # Drop separator artifacts right after an opening parenthesis,
  # e.g. "(; ; born in etc)" -> "(born in etc)".
  cleaned = paren_leading_separators.sub('(', cleaned)
  # Remove empty or 1 character parentheses, e.g. "( )".
  cleaned = re.sub(r'\(.?\)', '', cleaned)
  # Remove the marker of an ordered-list line (e.g. "A. the cheese" -> " the cheese").
  # This has to happen before the newlines go away, because '^' relies on them.
  cleaned = ordered_list.sub('', cleaned)
  # Replace newlines with spaces.
  cleaned = cleaned.replace('\n', ' ')
  # Collapse repeated spaces.
  cleaned = re.sub(r' {2,}', ' ', cleaned)
  # Replace smart double quotes, might also need one for smart single quotes.
  cleaned = re.sub(r'[“”]', '"', cleaned)
  # Expand abbreviated year ranges, e.g. "1914–18" or "1914/18" -> "1914-1918".
  # The lookarounds keep the match from starting or ending inside a longer number
  # and allow a range at the very end of the text.
  cleaned = abbreviated_year_range.sub(r'\1\2-\1\3', cleaned)
  return cleaned


cleaned_text = _clean_article_text(article['text'])
print(cleaned_text)

#TODO write to file

Johann de Lange (born 22 December 1959 in Pretoria, Union of South Africa) is an Afrikaans poet, short story writer and critic. He is renowned for being one of the foremost gay writers in Afrikaans, his most controversial book being "Nagsweet" ("Night sweat"). Writing career. He debuted in 1982 with a collection of poetry titled "Akwarelle van die dors" ("Aquarelles of thirst") for which he was awarded the Ingrid Jonker prize in 1983. This was followed by "Waterwoestyn" ("Water desert") in 1984, "Snel grys fantoom" ("Quick grey phantom") in 1986, "Wordende naak" ("Changing") in 1988 which was awarded the Rapport Prize for Poetry, "Nagsweet" ("Nightsweat") in 1990, "Vleiswond" ("Flesh wound") in 1993 and "Wat sag is vergaan" ("That which is soft perishes") in 1995. After a silence of 13 years he published a new volume of poetry "Die algebra van nood" ("The algebra of need") in 2009, which was awarded the Hertzog Prize for Poetry in 2011. In 2010 a selection from his poetry was published