## Loading the required libraries

In [None]:
import re
import gzip
import spacy
import json

In [None]:
# Mount Google Drive at /content/drive so its files can be read and written
# from this runtime (google.colab is only available inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Decompressing and reading a Wikipedia dump text file with gzip

In [None]:
# Location of the gzip-compressed dump (relative to the working directory)
text_file = 'drive/MyDrive/Colab Notebooks/wiki.txt.gz'
# Decompress in binary mode and keep every line (as bytes) in memory
with gzip.open(text_file, mode='rb') as archive:
    lines = list(archive)

In [None]:
# Peek at the first raw line (still bytes, since the file was opened in 'rb' mode)
lines[:1]

[b'David Stagg\tdavid stagg born october in townsville queensland is an australian former professional rugby league footballer he made one appearance for the queensland state of origin side and played for the brisbane broncos with whom he won the nrl premiership and the canterbury bankstown bulldogs he was known for his high workload and played as and but could also fill in at career stagg played his junior football for norms trl before joining the brisbane broncos he made his nrl debut in round of the nrl season against the canterbury bankstown bulldogs in stagg set new record for tackles in game with tackles made against the cronulla sutherland sharks this record has since been beaten in stagg made his representative debut and played only one game for queensland in state of origin before being dropped later that year he played at centre in the broncos nrl grand final victory after winning the grand final with the broncos stagg signed two year deal with the canterbury bankstown bulldo

## Preprocessing loaded text

The loaded text is pre-processed with regular expressions: numbers, special characters, and single-letter words are removed; leading and trailing whitespace is stripped; runs of whitespace are collapsed into a single space; and the text is converted to lowercase.

In [None]:
# Function for preprocessing loaded text
def clean_text(text):
    """Normalize a raw text into lowercase words separated by single spaces.

    The steps are applied in this order: numbers are dropped, every non-word
    character is replaced by a space, stand-alone single ASCII letters are
    dropped, surrounding whitespace is stripped, the text is lowercased and
    runs of whitespace are collapsed into one space.

    Args:
        text (str): The raw text to clean.

    Returns:
        str: The cleaned text. It is empty when the input contains nothing
        but material that gets removed.
    """
    # remove numbers; two numbers separated by a single whitespace character
    # are removed together with that separator
    text = re.sub(r'(\d+\s\d+)|(\d+)', '', text)
    # replace every special (non-word) character with a space;
    # note that "_" counts as a word character and is kept
    text = re.sub(r'\W', ' ', text)
    # remove stand-alone single letters wherever they occur. The lookarounds
    # do not consume the neighbouring whitespace, so consecutive single
    # letters ("a b c") and letters at the very start or end of the text are
    # all caught in one pass.
    text = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', text)
    # remove spaces at the beginning and end of the string, then lowercase
    text = text.strip().lower()
    # collapse the remaining runs of whitespace into a single space
    return re.sub(r'\s+', ' ', text)

In [None]:
# Decode every raw line from UTF-8 and clean it, keeping the original order
cleaned_text = [clean_text(raw_line.decode('utf-8')) for raw_line in lines]

In [None]:
# Inspect the first cleaned text to check the preprocessing result
cleaned_text[:1]

['david stagg david stagg born october in townsville queensland is an australian former professional rugby league footballer he made one appearance for the queensland state of origin side and played for the brisbane broncos with whom he won the nrl premiership and the canterbury bankstown bulldogs he was known for his high workload and played as and but could also fill in at career stagg played his junior football for norms trl before joining the brisbane broncos he made his nrl debut in round of the nrl season against the canterbury bankstown bulldogs in stagg set new record for tackles in game with tackles made against the cronulla sutherland sharks this record has since been beaten in stagg made his representative debut and played only one game for queensland in state of origin before being dropped later that year he played at centre in the broncos nrl grand final victory after winning the grand final with the broncos stagg signed two year deal with the canterbury bankstown bulldogs

## Tokenizing the texts into words with spaCy

In [None]:
# Load the spaCy pipeline package 'en_core_web_sm' (it must be installed in the runtime)
nlp = spacy.load('en_core_web_sm')

In [None]:
# Only the token texts are needed, so run just the tokenizer. Calling
# nlp(text) would additionally run every other component of the loaded
# pipeline on each document, which is far slower and does not change
# how the text is split into tokens.
cleaned_text_tokenized = [
    [token.text for token in doc]
    for doc in nlp.tokenizer.pipe(cleaned_text)
]

In [None]:
# Show the token list produced for the first text
print(cleaned_text_tokenized[0])

['david', 'stagg', 'david', 'stagg', 'born', 'october', 'in', 'townsville', 'queensland', 'is', 'an', 'australian', 'former', 'professional', 'rugby', 'league', 'footballer', 'he', 'made', 'one', 'appearance', 'for', 'the', 'queensland', 'state', 'of', 'origin', 'side', 'and', 'played', 'for', 'the', 'brisbane', 'broncos', 'with', 'whom', 'he', 'won', 'the', 'nrl', 'premiership', 'and', 'the', 'canterbury', 'bankstown', 'bulldogs', 'he', 'was', 'known', 'for', 'his', 'high', 'workload', 'and', 'played', 'as', 'and', 'but', 'could', 'also', 'fill', 'in', 'at', 'career', 'stagg', 'played', 'his', 'junior', 'football', 'for', 'norms', 'trl', 'before', 'joining', 'the', 'brisbane', 'broncos', 'he', 'made', 'his', 'nrl', 'debut', 'in', 'round', 'of', 'the', 'nrl', 'season', 'against', 'the', 'canterbury', 'bankstown', 'bulldogs', 'in', 'stagg', 'set', 'new', 'record', 'for', 'tackles', 'in', 'game', 'with', 'tackles', 'made', 'against', 'the', 'cronulla', 'sutherland', 'sharks', 'this', 're

## Serializing the result to a JSON file

In [None]:
# Write the tokenized texts to the mounted drive as JSON indented by 4 spaces
output_path = 'drive/MyDrive/Colab Notebooks/cleaned_text_tokenized.json'
with open(output_path, mode='w') as json_file:
    json.dump(cleaned_text_tokenized, json_file, indent=4)