<a href="https://colab.research.google.com/github/IvanDePivan/2AMM30-groep-2-component-1/blob/tamara/cleaning_nobel_articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install ftfy
!pip install srsly



In [5]:
from pathlib import Path
import ftfy
import random
import srsly
import os
import re
from tqdm import tqdm

In [6]:
# Sign the Colab session in and attach Google Drive under /content/drive.
from google.colab import auth, drive

auth.authenticate_user()
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Get email of current Colab user
import requests
gcloud_token = !gcloud auth print-access-token
gcloud_tokeninfo = requests.get('https://www.googleapis.com/oauth2/v3/tokeninfo?access_token=' + gcloud_token[0]).json()
email = gcloud_tokeninfo['email']

In [8]:
# Choose the data directory based on which user is running the notebook.
notebook_dir = (
  "/content/drive/MyDrive/TUe/TM/Data/enwiki20230820-stripped-json"
  if email == 'tamaraexterkate93@gmail.com'
  else 'drive/MyDrive/enwiki20230820/raw/'
)

print(notebook_dir)

/content/drive/MyDrive/TUe/TM/Data/enwiki20230820-stripped-json


In [9]:
# Build the list of every file found (recursively) below notebook_dir.
# One branch yields plain string paths (os.walk), the other Path objects (glob).
if email != 'tamaraexterkate93@gmail.com':
  pathlist = list(filter(Path.is_file, Path(notebook_dir).glob('**/*')))
else:
  pathlist = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(notebook_dir)
    for name in names
  ]

In [10]:
# Regex terms that the article filters below look for (case-insensitively).
search_terms = [
  'nobel',
  'prize',
]

In [11]:
# Read files using srsly (generators)
from itertools import chain

def get_json_readers():
  """Return one lazy iterator over the records of every file in `pathlist`.

  Each path is handed to `srsly.read_jsonl`; the per-file readers are chained
  together so callers see a single stream of records.
  """
  per_file_readers = map(srsly.read_jsonl, pathlist)
  return chain.from_iterable(per_file_readers)

def read_random_json():
  """Read one randomly chosen file from `pathlist` and return its matching articles.

  An article is kept when its "text" is non-empty and every regex in
  `search_terms` is found in that text (case-insensitive), i.e. the same
  criterion `read_all_json` applies to the whole corpus.

  Returns:
    list of article dicts as yielded by `srsly.read_jsonl`.
  """
  data = srsly.read_jsonl(random.choice(pathlist))
  # NOTE: the term check must be wrapped in all(); a bare generator expression
  # is always truthy and would let every article through.
  return [
    article for article in data
    if len(article["text"]) > 0
    and all(re.search(term, article["text"], re.IGNORECASE) is not None for term in search_terms)
  ]

def read_all_json():
  """Scan every record from `get_json_readers()` and return the matching articles.

  An article matches when its "text" is non-empty and each regex in
  `search_terms` occurs in it (case-insensitive). The number of matches is
  printed before the list is returned.
  """
  def mentions_every_term(text):
    # True only if no search term is missing from the text.
    return all(re.search(term, text, re.IGNORECASE) is not None for term in search_terms)

  articles = []
  for article in get_json_readers():
    if len(article["text"]) > 0 and mentions_every_term(article["text"]):
      articles.append(article)

  print(f'found {len(articles)} nonempty nobel prize laureate articles')
  return articles

def read_all_json_owen():
  """Return every record whose title contains "Owen Willans Richardson".

  The match is a plain, case-sensitive substring test on the "title" field.
  The number of hits is printed before the list is returned.
  """
  wanted = "Owen Willans Richardson"
  articles = []
  for article in get_json_readers():
    if wanted in article["title"]:
      articles.append(article)

  print(f'found {len(articles)} nonempty nobel prize laureate articles')
  return articles

def read_all_json_title():
  """Return every record whose title matches all regexes in `search_terms`.

  Matching is case-insensitive and is done on the "title" field only.
  The number of hits is printed before the list is returned.
  """
  def title_matches(article):
    heading = article["title"]
    for term in search_terms:
      if re.search(term, heading, re.IGNORECASE) is None:
        return False
    return True

  articles = list(filter(title_matches, get_json_readers()))
  print(f'found {len(articles)} nonempty nobel prize laureate articles')
  return articles

In [12]:
def clean_text(text):
  """Clean up the raw text of one article.

  The substitutions are order-dependent:
    1. `ftfy.fix_text` is applied to the raw text.
    2. Leading ';' / ',' artifacts right after an opening parenthesis are
       dropped, e.g. "(; ; born in etc)" -> "(born in etc)".
    3. Empty or one-character parentheses such as "( )" are removed.
    4. Ordered-list markers at the start of a line ("A.", "12.") are removed.
       This has to run while the text still contains newlines, otherwise
       the MULTILINE anchor only ever matches the very start of the text.
    5. Newlines become spaces.
    6. Smart double quotes become '"'.
    7. Abbreviated year ranges are expanded, e.g. "1914–18" -> "1914-1918".
    8. URLs are removed.
    9. Runs of spaces are collapsed last, so gaps left by earlier removals
       (parentheses, list markers, URLs, newlines) do not survive.

  Args:
    text: the article text as a string.

  Returns:
    The cleaned string.
  """
  url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
  # NOTE: also strips one/two-character capitalised tokens followed by '.' at
  # a line start (e.g. an initial such as "J."), as the original pattern did.
  ordered_list = re.compile(r'^[A-Z0-9]{1,2}\.', re.MULTILINE)

  cleaned_text = ftfy.fix_text(text)
  # Replace "(" + artifacts with a plain "(" (a '\(' replacement would insert a backslash).
  cleaned_text = re.sub(r'\((?:[;,] ?)+', '(', cleaned_text)
  cleaned_text = re.sub(r'\(.?\)', '', cleaned_text)
  cleaned_text = ordered_list.sub('', cleaned_text)
  cleaned_text = re.sub(r'\n', ' ', cleaned_text)
  cleaned_text = re.sub('[“”]', '"', cleaned_text)  # might also need one for smart single quotes
  # The lookahead (instead of consuming a non-digit) also handles a range at the end of the text.
  cleaned_text = re.sub(r'([12]\d)(\d{2})[–/](\d{2})(?!\d)', r'\1\2-\1\3', cleaned_text)
  cleaned_text = re.sub(url_pattern, '', cleaned_text)
  cleaned_text = re.sub(' {2,}', ' ', cleaned_text)

  return cleaned_text

In [40]:
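# One-off extraction step: not needed if you already have nobel_data/nobel_articles.json.
# NOTE: the next cell reads 'nobel_data/nobel_articles.json', so the file is written to that same path.
# articles = read_all_json()
# srsly.write_json(os.path.join(notebook_dir, 'nobel_data/nobel_articles.json'), articles)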

In [41]:
# Load the previously extracted articles from the cached JSON file and report how many there are.
nobel_articles_path = os.path.join(notebook_dir, 'nobel_data/nobel_articles.json')
articles = srsly.read_json(nobel_articles_path)
print(len(articles))

14980


In [42]:
# Keep only articles whose text names at least one prize category (case-insensitive).
term = "Physics|Chemistry|Physiology|Medicine|Literature|Peace"
articles = [
  article
  for article in articles
  if re.compile(term, re.IGNORECASE).search(article["text"]) is not None
]
print(len(articles))

11893


In [43]:
# Keep only articles that mention "nobel" (any casing) at least three times.
articles = [
  article
  for article in articles
  if sum(1 for _ in re.finditer(r'nobel', article["text"], re.IGNORECASE)) >= 3
]
print(len(articles))

2113


In [44]:
# Keep only articles whose text contains "born" (case-insensitive).
term = "born"
articles = [
  article
  for article in articles
  if re.search(term, article["text"], flags=re.IGNORECASE)
]
print(len(articles))

1319


In [45]:
# Keep only articles whose text contains "awarded" or "laureate" (case-insensitive).
term = "awarded|laureate"
articles = list(filter(
  lambda article: re.search(term, article["text"], re.IGNORECASE) is not None,
  articles,
))
print(len(articles))

1141


In [46]:
#test

In [39]:
# Print the titles of the remaining articles in sorted order, one per line.
titles = list(map(lambda article: article['title'], articles))
for title in sorted(titles):
  print(title)

14th Dalai Lama
1944 Nobel Prize in Literature
1948 Nobel Prize in Literature
1963 Nobel Prize in Literature
1965 Nobel Peace Prize
1965 Nobel Prize in Literature
1972 Nobel Prize in Literature
1986 Nobel Prize in Literature
1993 Nobel Prize in Literature
20 złotych note
2000 Nobel Prize in Literature
2001 Nobel Prize in Literature
2002 Nobel Prize in Literature
2003 Nobel Prize in Literature
2004 Nobel Prize in Literature
2006 Nobel Prize in Literature
2008 Nobel Prize in Literature
2009 Nobel Prize in Literature
2010 Nobel Peace Prize
2010 Nobel Prize in Literature
2012 Nobel Prize in Literature
2014 Nobel Prize in Literature
2015 Nobel Prize in Literature
2016 Nobel Prize in Literature
2017 Nobel Prize in Literature
2018 Nobel Peace Prize
2018 Nobel Prize in Literature
2019 Nobel Prize in Literature
2020 Nobel Prize in Literature
2021 Nobel Prize in Literature
2021 Nobel Prize in Physiology or Medicine
2022 Nobel Peace Prize
2022 Nobel Prize in Physiology or Medicine
A Beautiful Min

In [16]:
def find_nobel_winners(text):
    """Tell whether every search term occurs near the start of `text`.

    Texts shorter than 600 characters are accepted without inspection.
    Otherwise only the leading part of the text is searched: the first 600
    characters or the first 20% of the text, whichever is longer. The result
    is True only if each regex in `search_terms` is found there
    (case-insensitive).
    """
    total_chars = len(text)
    if total_chars < 600:
        return True

    cutoff = int(max(600, 0.2 * total_chars))
    head = text[:cutoff]
    for term in search_terms:
        if re.search(term, head, re.IGNORECASE) is None:
            return False
    return True

In [17]:
# Keep only articles accepted by find_nobel_winners, then report the count.
articles = list(filter(lambda article: find_nobel_winners(article['text']), articles))
print(len(articles))

4341


In [18]:
# Keep only articles in which "nobel" (any casing) occurs three or more times.
articles = list(filter(
  lambda article: len(re.findall(r'nobel', article["text"], re.IGNORECASE)) >= 3,
  articles,
))
print(len(articles))

ValueError: ignored

In [None]:
# Copy every article, passing only its "text" value through clean_text; other fields are kept as-is.
articles_cleaned = [
  {key: (value if key != "text" else clean_text(value)) for key, value in article.items()}
  for article in articles
]

In [None]:
# Save the cleaned articles as JSON under the notebook's data directory.
cleaned_articles_path = os.path.join(notebook_dir, 'nobel_data/nobel_articles_cleaned.json')
srsly.write_json(cleaned_articles_path, articles_cleaned)

In [None]:
# List the titles of the articles that survived all filters, sorted, one per line.
titles = []
titles.extend(article['title'] for article in articles)
for title in sorted(titles):
  print(title)
