## Import all the necessary packages

In [None]:
import json
import spacy
import gensim
from gensim import corpora

In [None]:
# Attach Google Drive to the Colab filesystem so the notebook can read and
# write files stored there.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


## Loading tokenized articles

In [None]:
# Absolute path under the '/content/drive' mount point, so the load does not
# depend on the kernel's current working directory.
docs = '/content/drive/MyDrive/Colab Notebooks/cleaned_text_tokenized.json'
# Parse the whole JSON document in one call; the encoding is given explicitly
# so decoding does not rely on the platform default.
with open(docs, 'r', encoding='utf-8') as f:
    cleaned_text_tokenized = json.load(f)

In [None]:
# Show the tokens of the first article as a quick sanity check of the load.
print(cleaned_text_tokenized[0])

['david', 'stagg', 'david', 'stagg', 'born', 'october', 'in', 'townsville', 'queensland', 'is', 'an', 'australian', 'former', 'professional', 'rugby', 'league', 'footballer', 'he', 'made', 'one', 'appearance', 'for', 'the', 'queensland', 'state', 'of', 'origin', 'side', 'and', 'played', 'for', 'the', 'brisbane', 'broncos', 'with', 'whom', 'he', 'won', 'the', 'nrl', 'premiership', 'and', 'the', 'canterbury', 'bankstown', 'bulldogs', 'he', 'was', 'known', 'for', 'his', 'high', 'workload', 'and', 'played', 'as', 'and', 'but', 'could', 'also', 'fill', 'in', 'at', 'career', 'stagg', 'played', 'his', 'junior', 'football', 'for', 'norms', 'trl', 'before', 'joining', 'the', 'brisbane', 'broncos', 'he', 'made', 'his', 'nrl', 'debut', 'in', 'round', 'of', 'the', 'nrl', 'season', 'against', 'the', 'canterbury', 'bankstown', 'bulldogs', 'in', 'stagg', 'set', 'new', 'record', 'for', 'tackles', 'in', 'game', 'with', 'tackles', 'made', 'against', 'the', 'cronulla', 'sutherland', 'sharks', 'this', 're

## Creating a list of stop words
Build a Gensim `corpora.Dictionary` from the tokenized articles to collect every unique word; the stop-word list is derived from this vocabulary.

In [None]:
# Build the corpus vocabulary (one entry per unique token).
# prune_at=None turns off Gensim's default cap of 2,000,000 entries; with the
# cap, a large corpus can lose its least frequent tokens from the dictionary,
# and those tokens would then be missed when the stop-word list is derived.
temp_dictionary = corpora.Dictionary(cleaned_text_tokenized, prune_at=None)

Getting a list of values from a dictionary of unique words

In [None]:
# Materialise the dictionary's values (the unique words) as a plain list.
list_val = [*temp_dictionary.values()]

Identification of non-English words in the dictionary (approximated as words containing at least one non-ASCII character)

In [None]:
# Keep every vocabulary entry that contains at least one non-ASCII character.
non_english_words = list(filter(lambda token: not token.isascii(), list_val))

In [None]:
# Peek at the first ten non-ASCII words that were found.
print(non_english_words[:10])

['antigüedad', 'históricas', 'josé', 'monografías', 'nº', 'sánchez', 'tardía', 'alegría', 'जग', 'वण']


Additional list of stopwords

In [None]:
# Manually curated extra stop words (repeated-letter tokens, single letters
# and short letter clusters, URL fragments, id markers, numeral-like tokens).
addit_list = [
    "aa", "aaa", "aaaa", "aaaaa",
    "b", "c", "c_id", "ch", "d", "dl", "dz", "e", "ee", "ei", "f",
    "g", "gh", "gw", "ghw", "h", "http", "j", "k", "kh", "kw", "khw", "l",
    "m", "n", "o", "oo", "p", "r", "s", "sh", "t", "tl", "ts",
    "u", "u_id", "v", "w", "www", "x", "xh", "xhw",
    "xv", "xvi", "xvii", "xviii", "xw", "xx", "xxi", "xxii",
    "xxiii", "xxiv", "xxix", "xxl", "xxv", "xxvi", "xxvii", "xxviii",
    "xxx", "xxxi", "xxxii", "xxxiii", "xxxiv", "xxxix",
    "xxxv", "xxxvi", "xxxvii", "xxxviii",
    "xxxx", "xy", "zz",
]

Complete list of stop words

In [None]:
# Corpus-derived non-ASCII words first, then the hand-written additions.
stopwords = [*non_english_words, *addit_list]

## Removing stop words in SpaCy

In [None]:
# Load spaCy's small English pipeline; its default stop-word set is used below.
nlp = spacy.load(name='en_core_web_sm')

Adding the created stopword list to the standard stopword list in SpaCy

In [None]:
# all_stopwords is the very same set object as nlp.Defaults.stop_words, so the
# update below extends spaCy's default stop-word set in place.
all_stopwords = nlp.Defaults.stop_words
all_stopwords.update(stopwords)

Removing stop words from the articles

In [None]:
# For each article, keep only the tokens that are absent from the stop-word
# set; article order and token order are preserved.
cleaned_text_tokenized_sw = [
    [token for token in article if token not in all_stopwords]
    for article in cleaned_text_tokenized
]

In [None]:
# Show the first article again, now with stop words removed.
print(cleaned_text_tokenized_sw[0])

['david', 'stagg', 'david', 'stagg', 'born', 'october', 'townsville', 'queensland', 'australian', 'professional', 'rugby', 'league', 'footballer', 'appearance', 'queensland', 'state', 'origin', 'played', 'brisbane', 'broncos', 'won', 'nrl', 'premiership', 'canterbury', 'bankstown', 'bulldogs', 'known', 'high', 'workload', 'played', 'fill', 'career', 'stagg', 'played', 'junior', 'football', 'norms', 'trl', 'joining', 'brisbane', 'broncos', 'nrl', 'debut', 'round', 'nrl', 'season', 'canterbury', 'bankstown', 'bulldogs', 'stagg', 'set', 'new', 'record', 'tackles', 'game', 'tackles', 'cronulla', 'sutherland', 'sharks', 'record', 'beaten', 'stagg', 'representative', 'debut', 'played', 'game', 'queensland', 'state', 'origin', 'dropped', 'later', 'year', 'played', 'centre', 'broncos', 'nrl', 'grand', 'final', 'victory', 'winning', 'grand', 'final', 'broncos', 'stagg', 'signed', 'year', 'deal', 'canterbury', 'bankstown', 'bulldogs', 'season', 'canterbury', 'stagg', 'played', 'games', 'club', '

Saving the tokenized text, with stop words removed, to a JSON file

In [None]:
# Write the stop-word-filtered token lists to Drive as JSON.
# The path is absolute under the '/content/drive' mount point so the write
# does not depend on the kernel's current working directory, and the encoding
# is given explicitly instead of relying on the platform default.
with open('/content/drive/MyDrive/Colab Notebooks/cleaned_text_tokenized_sw.json', 'w', encoding='utf-8') as f:
    json.dump(cleaned_text_tokenized_sw, f, indent=4)