<a href="https://colab.research.google.com/github/IvanDePivan/2AMM30-groep-2-component-1/blob/main/cleaning_nobel_articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install ftfy
!pip install srsly




In [51]:
from pathlib import Path
import ftfy
import random
import srsly
import os
import re
from tqdm import tqdm

In [33]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [34]:
# Get email of current Colab user
import requests
gcloud_token = !gcloud auth print-access-token
gcloud_tokeninfo = requests.get('https://www.googleapis.com/oauth2/v3/tokeninfo?access_token=' + gcloud_token[0]).json()
email = gcloud_tokeninfo['email']

In [35]:
# Define notebook directory
if email == 'tamaraexterkate93@gmail.com':
  notebook_dir = "/content/drive/MyDrive/TUe/TM/Data/enwiki20230820-stripped-json"
else:
  notebook_dir = 'drive/MyDrive/enwiki20230820/raw/'

print(notebook_dir)

/content/drive/MyDrive/TUe/TM/Data/enwiki20230820-stripped-json


In [40]:
# Generate a list of file paths
if email == 'tamaraexterkate93@gmail.com':
  pathlist = [os.path.join(root, file) for root, dirs, files in os.walk(notebook_dir) for file in files]
else:
  pathlist = [p for p in Path(filepath).glob('**/*') if p.is_file()]

In [70]:
# Search terms
# search_terms = ['nobel', 'prize', 'won|laureate']
search_terms = ['nobel', 'prize']

In [71]:
# Read files using srsly (generators)
from itertools import chain

def get_json_readers():
  return chain.from_iterable(srsly.read_jsonl(path) for path in pathlist)

def read_random_json():
  data = srsly.read_jsonl(random.choice(pathlist))
  articles = [article for article in data if len(article["text"]) > 0]
  articles = [article for article in articles if (re.search(term, article["text"], re.IGNORECASE) for term in search_terms)]

  return articles

def read_all_json():
  data = get_json_readers()
  articles = [article for article in data if len(article["text"]) > 0 and all(re.search(term, article["text"]) is not None for term in search_terms)]
  print(f'found {len(articles)} nonempty nobel prize laureate articles')
  return articles

In [102]:
def clean_text(text):
  # Note: the order of these regexes matter, due to '\n' -> ' ' for instance resulting in repeated spaces
  url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


  ordered_list = re.compile(r'^[A-Z0-9]{1,2}\.', re.MULTILINE)
  cleaned_text = ftfy.fix_text(text)
  cleaned_text = re.sub('\(([;,] ?)+\)', '\(', cleaned_text) # Removes artifacts from within parentheses e.g. (; ; born in etc) -> (born in etc)
  cleaned_text = re.sub('\(.?\)', '', cleaned_text) # Remove empty or 1 character parentheses e.g. "( )"
  cleaned_text = re.sub('\n', ' ', cleaned_text) # Remove newlines
  cleaned_text = re.sub(ordered_list, '', cleaned_text) # Removes the first part of an ordered list (e.g. A. the cheese ->  the cheese)
  cleaned_text = re.sub(' {2,}', ' ', cleaned_text) # Remove repeated spaces
  cleaned_text = re.sub('[“”]', '"', cleaned_text) # Remove smart double quotes, might also need one for smart single quotes
  cleaned_text = re.sub('([12]\d)(\d{2})[–\/](\d{2}\D)', r'\1\2-\1\3', cleaned_text)
  cleaned_text = re.sub(url_pattern, '', cleaned_text)

  return cleaned_text

In [103]:
# # Not needed if you have the .json
articles = read_all_json()
srsly.write_json(os.path.join(notebook_dir, 'nobel_articles.json'), articles)

In [113]:
articles = srsly.read_json(os.path.join(notebook_dir, 'nobel_articles.json'))

In [114]:
articles_cleaned = [{key: (clean_text(value) if key == "text" else value) for key, value in article.items()} for article in articles]

In [115]:
text = [article['text'] for article in articles_cleaned]
text

['Alfred Bernhard Nobel ( , ; 21 October 1833\xa0– 10 December 1896) was a Swedish chemist, engineer, inventor, businessman, and philanthropist. He is known for creating dynamite as well as having bequeathed his fortune to establish the Nobel Prize, though he also made several important contributions to science, holding 355 patents in his lifetime. Nobel\'s most famous invention was dynamite, an explosive using nitroglycerin; it was patented in 1867. Nobel displayed an early aptitude for science and learning, particularly in chemistry and languages; he became fluent in six languages and filed his first patent at the age of 24. He embarked on many business ventures with his family, most notably owning the company Bofors, which was an iron and steel producer that he had developed into a major manufacturer of cannons and other armaments. Nobel was later inspired to donate his fortune to the Nobel Prize institution, which would annually recognize those who "conferred the greatest benefit t

In [116]:
# write cleaned articles to file
srsly.write_json(os.path.join(notebook_dir, 'nobel_articles_cleaned.json'), articles_cleaned)