In [1]:
# Imports
import glob
import gzip
import os
import sys
import tarfile
import zlib

In [2]:
# Setup tika: call tika.initVM() before importing the parser module, whose
# from_buffer() is used further down to extract text from each filing.
import tika
tika.initVM()
from tika import parser

In [6]:
# NLP/ML imports
import spacy
from gensim.models.word2vec import Word2Vec

# Load the spaCy English pipeline that extract_sentences() runs text through.
# The 'en' shortcut link only exists on spaCy v2 installs; spaCy v3 removed
# shortcut links and raises OSError for it, so fall back to the packaged
# model name when the shortcut cannot be resolved.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [15]:
def extract_sentences(text):
    """Turn raw text into a list of lemmatized sentences.

    The text is passed through the module-level spaCy pipeline ``nlp``.
    Each sentence reported by ``doc.sents`` becomes one inner list holding
    the ``lemma_`` of every token whose lemma is purely alphanumeric
    (``str.isalnum``); all other tokens are dropped.

    Args:
        text: The string to process.

    Returns:
        A list with one list of lemma strings per sentence, in document order.
    """
    parsed = nlp(text)
    return [
        [lemma for lemma in (token.lemma_ for token in span) if lemma.isalnum()]
        for span in parsed.sents
    ]

# Quick check of the helper on a two-sentence string; the returned nested
# list is shown as the cell output.
extract_sentences("This is an example sentence.  And yet another here.")

[['this', 'be', 'an', 'example', 'sentence'],
 ['and', 'yet', 'another', 'here']]

In [17]:
# For each filing year: read the year's archive, build a list of lemmatized
# sentences from every filing in it, then train and save a CBOW and a
# skip-gram word2vec model on that sample.

# Hyper-parameters shared by both models, kept in one place so the CBOW and
# SG runs cannot drift apart.
w2v_params = dict(vector_size=200, window=20, min_count=10, workers=2)

for year in range(1994, 1997):
    # Get file name and open
    file_name = "data/filings_10k_{0}.tar.gz".format(year)
    print(file_name)

    # Sample data
    sample_sentence_list = []

    print("building sample...")

    # The context manager closes the archive after each year, even if reading
    # it fails part-way through (the handle was previously never closed).
    with tarfile.open(file_name) as tar_file:
        # Iterate the archive directly instead of calling getmembers(), which
        # would scan the whole compressed stream before the first member is read.
        for j, tar_member in enumerate(tar_file):
            # Progress: (year, members seen so far, sentences collected so far)
            if j % 1000 == 0:
                print((year, j, len(sample_sentence_list)))

            # Skip non-files
            if not tar_member.isfile():
                continue

            # Parse real files. This is best-effort: a filing that cannot be
            # read or parsed is reported and skipped so one bad member does
            # not abort the whole year.
            try:
                # Read tar data; each member is itself zlib-compressed
                member_buffer = zlib.decompress(tar_file.extractfile(tar_member).read())

                # Send to tika. The 'content' entry may be missing, None or
                # empty when no text was extracted; skip those filings rather
                # than handing a non-string to the sentence extractor.
                filing_text = parser.from_buffer(member_buffer).get('content')
                if not filing_text:
                    continue

                # Get sentence list
                sample_sentence_list.extend(extract_sentences(filing_text))

            except Exception as e:
                # Include the member name so the failing filing can be identified
                print((tar_member.name, e))

    # Word2Vec cannot build a vocabulary from an empty corpus; skip this year
    # instead of raising and losing the remaining years.
    if not sample_sentence_list:
        print("no sentences extracted for {0}; skipping training.".format(year))
        continue

    print("training w2v models...")
    # Train w2v CBOW model
    w2v_model_cbow = Word2Vec(sample_sentence_list, **w2v_params)
    w2v_model_cbow.save("w2v_model_cbow_{0}".format(year))
    print("cbow trained.")

    # Train w2v SG model (sg=1 is the only difference from the CBOW run)
    w2v_model_sg = Word2Vec(sample_sentence_list, sg=1, **w2v_params)
    w2v_model_sg.save("w2v_model_sg_{0}".format(year))
    print("sg trained.")

data/filings_10k_1994.tar.gz
