<a href="https://colab.research.google.com/github/IvanDePivan/2AMM30-groep-2-component-1/blob/main/2AMM30_Text_Mining_1_chess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# pandas is not currently used; the command below would force-reinstall a newer pandas version, which processes data faster
# !pip install pandas --force-reinstall

In [None]:
# !pip install spacy
# !pip install spacy-cleaner
!pip install ftfy
!pip install spacy-transformers
!pip install SPARQLWrapper
!pip install srsly


In [None]:
!python -m spacy download en_core_web_trf

In [None]:
# import pandas as pd
# import pyarrow as pa
import json
from pathlib import Path
import spacy_transformers
import spacy
import ftfy
import random
import srsly
import re
from spacy import displacy
from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL

In [None]:
# Every regular file below the raw dump directory, at any depth.
pathlist = [p for p in Path('drive/MyDrive/enwiki20230820/raw/').glob('**/*') if p.is_file()]
nlp = spacy.load('en_core_web_trf')

# One title per line; ftfy repairs mis-encoded characters in each title.
# The encoding is given explicitly so non-ASCII titles do not depend on the platform default.
titles = []
with open('drive/MyDrive/enwiki20230820/chess_article_titles_no_dupes.txt', encoding='utf-8') as f:
  for line in f:
    titles.append(ftfy.fix_text(line.strip()))

print(f'{len(titles)} titles')

# Alternation of all titles, used with re.fullmatch to test whether an article title is a chess title.
# Titles are literal text, so each one is escaped: characters such as "(", ")", "+", "." or "?"
# would otherwise be interpreted as regex syntax and make the title fail to match (or the pattern fail to compile).
pattern = '|'.join(re.escape(title) for title in titles)

6941 titles


In [None]:
# Read files using srsly (generators)
from itertools import chain

def get_json_readers():
  """Return a single lazy iterator over the records of every file in ``pathlist``.

  Each file is opened with ``srsly.read_jsonl`` only when the iteration
  reaches it, and the per-file readers are chained one after another, so the
  caller sees one flat stream of records rather than one reader per file.
  """
  per_file_readers = (srsly.read_jsonl(jsonl_path) for jsonl_path in pathlist)
  return chain.from_iterable(per_file_readers)

def read_random_json():
  """Read one randomly chosen file from ``pathlist`` and keep its chess articles.

  A record is kept when its "text" is non-empty and its "title" fully matches
  ``pattern``.

  Returns:
    A list of the matching records; it is empty when the chosen file
    contains no matching article.
  """
  chosen_path = random.choice(pathlist)
  chess_articles = []
  for record in srsly.read_jsonl(chosen_path):
    if len(record["text"]) > 0 and re.fullmatch(pattern, record["title"]):
      chess_articles.append(record)

  return chess_articles

def read_all_json():
  """Read every file in ``pathlist`` and collect the non-empty chess articles.

  Files are read one at a time with ``srsly.read_jsonl``. Iterating
  ``get_json_readers()`` here would not work: that helper yields individual
  records, not one reader per file, so per-file filtering and counting must
  walk ``pathlist`` directly.

  A record is kept when its "text" is non-empty and its "title" fully matches
  ``pattern``. Progress is printed after every 10th file.

  Returns:
    A list with one entry per file, in ``pathlist`` order; each entry is the
    list of matching records found in that file (possibly empty).
  """
  articles = []
  count_json_files = 0
  count_articles = 0

  for path in pathlist:
    file_articles = [
        article for article in srsly.read_jsonl(path)
        if len(article["text"]) > 0 and re.fullmatch(pattern, article["title"])
    ]
    articles.append(file_articles)
    count_json_files += 1
    count_articles += len(file_articles)

    if count_json_files % 10 == 0:
      print(f'found {count_articles} nonempty chess articles so far in {count_json_files}/{len(pathlist)} files')

  print(f'found {count_articles} nonempty chess articles in {count_json_files}/{len(pathlist)} files')
  return articles


In [None]:
# Location of the file holding the already-filtered chess articles.
# NOTE(review): presumably JSON Lines despite the .json suffix, since it is read with read_jsonl — confirm.
nonempty_chess = Path('drive/MyDrive/enwiki20230820/chess_nonempty.json')
# Earlier pandas/pyarrow-based loader, kept for reference only (pandas is not imported at the moment).
# df = pd.read_json(
#     nonempty_chess_csv_path,
#     lines=True,
#     engine='pyarrow',
#     dtype={"title": pd.ArrowDtype(pa.string()), "text": pd.ArrowDtype(pa.string())}
# )
# read_jsonl returns a lazy generator (see the cell output). The result is not assigned to a
# name, so this line only displays the generator object and does not read any records.
srsly.read_jsonl(nonempty_chess)


<generator object read_jsonl at 0x7fdc772f6b20>

In [None]:
# nlp = spacy.load('en_core_web_trf')
# Settings for a spaCy "spancat" (span categorizer) component.
# NOTE(review): in the cells visible here this dict is only defined; the add_pipe call below is
# commented out and does not pass it — confirm whether it should be supplied as config=config.
config = {
    # Minimum probability for a prediction to be considered positive.
    "threshold": 0.5,
    # Maximum number of labels to consider positive per span (None is used here).
    "max_positive": None,
    # Model instance that is given a list of documents together with the start/end indices of the labelled spans.
    "model": DEFAULT_SPANCAT_MODEL,
    # Function that suggests candidate spans. This suggester proposes fixed-length n-grams of 1, 2 and 3 tokens.
    "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
}
# NOTE(review): Lookups is not referenced anywhere in the cells visible here.
from spacy.lookups import Lookups

# Add the spancat component to the nlp object (currently disabled).
# nlp.add_pipe("spancat")
# nlp.initialize()

In [None]:
# Patterns whose matches are deleted from the article text; they are OR-ed together into `reg`.
# Raw strings are used so that regex escapes such as \d, \( or \! reach the `re` module as written,
# instead of relying on Python passing unknown string escapes through (a SyntaxWarning on recent versions).
# Note: `^` anchors to the start of every line only when the pattern is applied with re.MULTILINE;
# without that flag it matches only at the very start of the text.
regexeses = [r"(\d{1,3}\.)?([KQBNPR]?[abcdefghx]+[12345678]\+?\!?\??#?)|O-O(-O)?|\d\-\d", # Removes chess notation (moves, castling, digit-digit results)
             r'^(\d+\. *)+', # Removes numbered list indicators (e.g. 1. 2. etc)
             r'^\+', # Removes the + at the start of some sentences
             r'\(.?\)'] # Removes empty/1 character parentheses

reg = '|'.join(regexeses)
empty_parenthesis_regex = r'\(([;,] ?)+' # Matches the artifacts left behind by earlier filtering, so "(; ; Thomas edison)" can become "(Thomas edison)"
def clean_text(text):
  """Clean raw article text and flatten it into a single line.

  Steps, in order:
    1. Repair mis-encoded characters with ftfy.
    2. Delete everything matched by ``reg``. The pattern is applied with
       re.MULTILINE because its ``^``-anchored alternatives target the start
       of each line, and the newlines are only removed further down.
    3. Expand abbreviated year ranges written with an en dash
       (e.g. 1976–78 -> 1976–1978).
    4. Collapse the "(; ; " style artifacts left after step 2 into "(".
    5. Drop lines of at most one character, turn newlines into spaces and
       collapse runs of spaces.

  Args:
    text: Raw article text; may span many lines.

  Returns:
    The cleaned text as one string without newline characters or repeated spaces.
  """
  sentences = ftfy.fix_text(text)
  sentences = re.sub(reg, '', sentences, flags=re.MULTILINE)
  # The replacement must be a raw string: in a normal string '\1' is the control character
  # chr(1), not a backreference. The (?!\d) lookahead (instead of consuming a non-digit)
  # also lets a range at the very end of the text be expanded.
  sentences = re.sub(r'([12]\d)(\d{2}–)(\d{2})(?!\d)', r'\1\2\1\3', sentences)
  sentences = re.sub(empty_parenthesis_regex, '(', sentences)
  sentences = re.sub(r'\n.?\n', '\n', sentences) # Remove lines holding at most one character
  sentences = re.sub(r'\n', ' ', sentences) # Replace the remaining newlines with spaces
  sentences = re.sub(r' +', ' ', sentences) # Collapse runs of spaces into a single space
  return sentences

def clean_json(obj):
  """Clean the "text" field of an article record in place.

  Args:
    obj: Record with a "text" entry; it is modified, not copied.

  Returns:
    The same ``obj``, with "text" replaced by ``clean_text`` of its old value.
  """
  cleaned = clean_text(obj["text"])
  obj["text"] = cleaned
  return obj

In [None]:
# Take the chess articles of one randomly chosen file, pick one of them at random and clean its text.
# No random seed is set, so every run of this cell inspects a different article.
# NOTE(review): random.choice raises IndexError when the chosen file contains no chess
# article (empty list) — re-run the cell in that case.
articles = read_random_json()
text = random.choice(articles)["text"]
new_text = clean_text(text)

In [None]:
# Run the loaded pipeline on the cleaned sample text and visualise the result inline.
# NOTE(review): style='span' draws the spans stored in doc.spans; no spancat component is added
# to nlp in the cells visible here (the add_pipe call is commented out), so presumably there is
# nothing to draw — confirm whether style='ent' was intended.
# NOTE(review): the 'distance' option looks like a dependency-view setting — confirm it has any effect here.
doc = nlp(new_text)
displacy.render(doc, style='span', jupyter=True, options={'distance': 88})


In [None]:
"""
'chess Grandmaster' or just 'Grandmaster' is not recognized as an entity. In chess, titles are important.
Lots of cardinal values are unusable in the current spans.
'International Master' is not recognized either.

"""

In [None]:
# Show the cleaned sample text so it can be compared with the rendering above.
new_text


In [None]:
# This code takes more than an hour
# data = (clean_text(json["text"]) for json in get_json_readers())
# result = nlp.pipe(data, n_process=-1, batch_size=1000)

# srsly.write_gzip_jsonl('/somefile.gz', result)