In [4]:
import gensim
import os
import Cython
import spacy
import numpy as np
import pandas as pd

<div class="alert-warning">
Yellow marks auxiliary parts of the code that are not central to the method, for example pre-processing operations.
</div>

<div class="alert-info">
Blue is used for the relevant parts of the code.
</div>

-----

In [2]:
# Input directory read by the entity-tagging loop below
# (presumably the raw 10-K filings -- confirm).
source_dir = 'D:/Data/Large_10K_corpus/'

In [3]:
# Output directory for the entity-tagged texts; its listing is also used
# below to decide which input files still need processing.
dest_dir = 'D:/temp3/'

<div class="alert-info">
Load the Spacy model
</div>

In [4]:
# Load the spaCy pipeline package 'en_core_web_lg'.
nlp = spacy.load('en_core_web_lg')

<div class="alert-info">
We add the "merge_entities" component to the pipeline so that entities consisting of several words are merged into a single token.
</div>

In [5]:
# Register the 'merge_entities' component so that an entity spanning several
# words comes out of the pipeline as a single token.
nlp.add_pipe('merge_entities')

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

<div class="alert-warning">
Collect the remaining files that have not yet been processed
</div>

In [6]:
# Names of all entries in the input directory.
files1 = os.listdir(source_dir)

In [7]:
# Names of all entries already present in the output directory.
files2 = os.listdir(dest_dir)

In [9]:
# Names that are in source_dir but not in dest_dir, i.e. files without an
# output yet; this lets an interrupted run resume where it stopped.
remaining_files = np.setdiff1d(files1,files2)

<div class="alert-info">
Algorithm for replacing named entities with a tag ner_(type of named entity)
</div>

In [11]:
# Replace every named entity in each not-yet-processed filing with the tag
# 'ner_<entity type>' and write the result to dest_dir under the same name.
for fname in remaining_files:
    # 'with' closes the handle as soon as the text has been read.
    with open(os.path.join(source_dir, fname)) as src_file:
        sections = src_file.read().split('</Header>')
    if len(sections) < 2:
        # No '</Header>' marker: report and skip this file instead of aborting
        # the whole batch with an IndexError.
        print(f'Skipping {fname}: no </Header> marker found')
        continue
    # Keep the text after the first '</Header>', drop its first 500 characters
    # and cap it at character 1,000,000 before lower-casing.
    # NOTE(review): the cap presumably keeps the text below spaCy's default
    # nlp.max_length -- confirm.
    raw = sections[1][500:1000000].lower()
    raw = " ".join(gensim.utils.simple_preprocess(raw))
    # The listed components are switched off for this call
    # (presumably leaving only NER and merge_entities active -- confirm).
    doc = nlp(raw, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
    # Tokens that carry an entity type become 'ner_<type>'; all others keep their text.
    tagged = ' '.join(t.text if not t.ent_type_ else 'ner_' + t.ent_type_ for t in doc)
    # Join directory and file name as separate path components so the result
    # does not depend on dest_dir ending with a separator.
    with open(os.path.join(dest_dir, fname), mode='w') as out_file:
        out_file.write(tagged)

---

In [2]:
# Optional GPU switch, left disabled.
# NOTE(review): presumably has to run before spacy.load() to take effect -- confirm.
#spacy.require_gpu()

In [3]:
# Input directory for the noun-chunk step
# (the same path the entity-tagging step used as its output directory).
source_dir = 'D:/temp3/'

In [4]:
# Output directory for the texts with merged noun chunks.
dest_dir = 'D:/NOUN_CHUNKS/'

In [5]:
# Load a fresh 'en_core_web_lg' pipeline for the noun-chunk step.
nlp = spacy.load('en_core_web_lg')

<div class="alert-info">
Add module to the pipeline that creates noun chunks.
</div>

In [6]:
# Register the 'merge_noun_chunks' component so that each noun chunk comes
# out of the pipeline as a single token.
nlp.add_pipe('merge_noun_chunks')

<function spacy.pipeline.functions.merge_noun_chunks(doc: spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc>

In [7]:
# Names of all entries in the input directory.
files1 = os.listdir(source_dir)

In [8]:
# Names of all entries already present in the output directory.
files2 = os.listdir(dest_dir)

In [9]:
import numpy as np

In [10]:
# Names that are in source_dir but not in dest_dir, i.e. files that still
# need noun-chunk processing.
remaining_files = np.setdiff1d(files1,files2)

<div class="alert-info">
Combine the words of noun chunks with '_'
</div>

In [13]:
# Write each not-yet-processed text back out with the words of every token
# (noun chunks are single tokens here) joined by '_'.
for fname in remaining_files:
    # 'with' closes the handle as soon as the text has been read.
    with open(os.path.join(source_dir, fname)) as src_file:
        raw = src_file.read()
    # The lemmatizer and NER components are switched off for this call.
    doc = nlp(raw, disable=["lemmatizer", "ner"])
    # Spaces inside a token become '_' so the token survives later
    # whitespace tokenisation as one word.
    chunked = ' '.join(t.text.replace(' ', '_') for t in doc)
    # Join directory and file name as separate path components so the result
    # does not depend on dest_dir ending with a separator.
    with open(os.path.join(dest_dir, fname), mode='w') as out_file:
        out_file.write(chunked)

---

<div class="alert-info">
Use a word2vec model, trained with 10-Ks, to infer most similar words to specified keywords.
</div>

<div class="alert-info">
Main routine to create the word2vec model.
</div>

In [54]:
class MySentences(object):
    """Iterable that yields one token list per file of a directory.

    Every call to ``__iter__`` starts a new pass over the directory, so the
    same object can be iterated more than once.

    NOTE(review): each file is yielded as a single token list. gensim's
    documentation states that texts longer than 10,000 words are truncated by
    the optimised Word2Vec training code -- confirm whether that is acceptable
    for these documents.
    NOTE(review): ``simple_preprocess`` is called with its defaults, which
    (per the gensim docs) discard tokens shorter than 2 or longer than 15
    characters; long '_'-joined tokens would be dropped -- confirm.
    """

    def __init__(self, dirname):
        """Store the directory whose files are read on iteration."""
        self.dirname = dirname

    def __iter__(self):
        """Yield the pre-processed tokens of each file in ``self.dirname``."""
        for fname in os.listdir(self.dirname):
            # 'with' closes each file before the next one is opened; the
            # original left the handle to the garbage collector.
            with open(os.path.join(self.dirname, fname)) as doc_file:
                raw = doc_file.read()
            raw = raw.lower()
            # Collapse spelling variants so each term becomes a single token.
            raw = raw.replace('non-gaap','nongaap')
            raw = raw.replace('non gaap','nongaap')
            raw = raw.replace('pro-forma','proforma')
            raw = raw.replace('pro forma','proforma')
            yield gensim.utils.simple_preprocess(raw)

In [9]:
# Iterable over the tokenised files in source_dir.
# NOTE(review): source_dir is not defined in this section's visible cells -- confirm it points at the intended corpus.
docs_10K = MySentences(source_dir)

In [10]:
# Train a word2vec model on the tokenised documents with gensim's default
# hyper-parameters.
model = gensim.models.Word2Vec(docs_10K)
# The cells below refer to the trained model as `new_model`, a name that no
# visible cell defines; bind it here so the notebook runs top to bottom on a
# fresh kernel.
# NOTE(review): if `new_model` was meant to be a model re-loaded from disk,
# load it here instead.
new_model = model

<div class="alert-info">
The 50 words closest to the keywords 'sustainability' and 'climate_change'
</div>

In [None]:
# Rank the vocabulary by similarity to the two seed keywords and display the
# 50 best matches as (word, similarity) pairs.
seed_keywords = ['sustainability', 'climate_change']
new_model.wv.most_similar(positive=seed_keywords, topn=50)

<div class="alert-info">
Calculate the centroid vector from the word vectors representing words 'restructuring','rationalization','downsizing','resizing','plant_closures'
</div>

In [146]:
# Sum the word vectors of the restructuring seed words; the next cell divides
# the sum by the number of words to obtain their centroid.
word_list = ['restructuring','rationalization','downsizing','resizing','plant_closures']
# Size the accumulator from the model instead of hard-coding 100, so the cell
# also works for a model trained with a different vector size.
restr_centroid = np.zeros(new_model.wv.vector_size)
for word in word_list:
    restr_centroid = np.add(restr_centroid, new_model.wv[word])

In [147]:
# Turn the summed vector into the centroid. Dividing by len(word_list) rather
# than a literal 5 keeps the result correct if the seed list changes.
# NOTE: this cell rescales restr_centroid in place; run it exactly once after
# the summing cell.
restr_centroid = restr_centroid/len(word_list)

<div class="alert-info">
Collect the 100 words whose vectors are closest to the centroid
</div>

In [148]:
# Look up the 100 vocabulary entries nearest to the centroid vector and keep
# only the words, discarding the similarity scores.
nearest_to_centroid = new_model.wv.most_similar(positive=restr_centroid, topn=100)
restr_keywords = [keyword for keyword, _score in nearest_to_centroid]

<div class="alert-info">
Count the occurrences of the keywords in each 10-K
</div>

In [149]:
# Iterable over the tokenised files in source_dir for the counting pass below.
docs_10K = MySentences(source_dir)

In [150]:
from collections import Counter

# For every document, count how many of its tokens are restructuring keywords.
# count_list follows the iteration order of docs_10K, one total per document.
count_list = []
for doc in docs_10K:
    # Tally the tokens in one pass; calling doc.count() once per keyword would
    # rescan the whole token list for each keyword.
    token_counts = Counter(doc)
    # A Counter returns 0 for keywords that do not occur in the document.
    count_list.append(sum(token_counts[word] for word in restr_keywords))