**Importing Libraries**

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import string
import nltk
import threading # will potentially use multi-threading
from nltk.stem.porter import *
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from bs4 import BeautifulSoup

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/snehachoudhary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Generating documents from smg files and TOKENIZING and Stemming**

In [2]:
def init_document():
    """ function: init_document
        -----------------------
        initialize new empty document skeleton
        :returns: dictionary @document of document fields
        @dictionary['topics'] is a list representing topic class labels
        @dictionary['places'] is a list representing place class labels
        @dictionary['words'] is a dictionary
        @dictionary['words']['title'] is a list for the title text terms
        @dictionary['words']['body'] is a list for the body text terms
    """
    #document = { 'topics' : [], 'places' : [], 'words' : dict([]) }
    document = { 'topics' : [], 'places' : [], 'title' : [], 'body' : []}
    #document['words']['title'] = []
    #document['words']['body']  = []
    return document

def populate_class_label(document, article):
    """ function: populate_class_label
        ------------------------------
        extract topics/places from @article and fill @document
        :param document: formatted dictionary object representing a document
        :param article:  formatted parse tree built from unformatted data
        @article is a 'reuter' child of the original file parsetree
    """
    for topic in article.topics.children:
        #print("before ",topic.text)
        #print("after ",topic.text.encode('ascii', 'ignore'))
        document['topics'].append(topic.text)
    for place in article.places.children:
        document['places'].append(place.text)

def populate_word_list(document, article):
    """ function: populate_word_list
        ----------------------------
        extract title/body words from @article, preprocess, and fill @document
        :param document: formatted dictionary object representing a document
        :param article:  formatted parse tree built from unformatted data
            @article is a 'reuter' child of the original file parsetree
    """
    text = article.find('text')
    title = text.title
    body = text.body
    if title != None:
        #print("title is ",title)
        #document['words']['title'] = title.text.split(' ')
        document['title'] = tokenize(title.text)
    if body != None:
        #print("body is ",body)
        document['body'] = tokenize(body.text)
        #document['words']['body'] = body.text.split(' ')

def tokenize(text):
    """ function: tokenize
        ------------------
        generate list of tokens given a block of @text;
        :param text: string representing text field (title or body)
        :returns: list of strings of tokenized & sanitized words
    """
    # encode unicode to string
    #ascii = text.encode('ascii', 'ignore')
    
    #remove digits
    
    translation_digits = str.maketrans(string.ascii_letters, string.ascii_letters, string.digits)
    no_digits = text.translate(translation_digits)
    #remove punctuation
    translation_punc = str.maketrans(string.ascii_letters, string.ascii_letters, string.punctuation)
    no_punctuation = no_digits.translate(translation_punc)
    # tokenize
    tokens = nltk.word_tokenize(no_punctuation)
    # remove stopwords - assume 'reuter'/'reuters' are also irrelevant
    no_stop_words = [w for w in tokens if not w in stopwords.words('english')]
    # filter out non-english words
    eng = [y for y in no_stop_words if wordnet.synsets(y)]
    # lemmatization process
    lemmas = []
    lmtzr = WordNetLemmatizer()
    for token in eng:
        lemmas.append(lmtzr.lemmatize(token))
    # stemming process
    stems = []
    stemmer = PorterStemmer()
    for token in lemmas:
        stems.append(stemmer.stem(token))
    # remove short stems
    terms = [x for x in stems if len(x) >= 4]
    return terms

def generate_document(text):
    """ function: generate_document
        ---------------------------
        extract class labels & tokenized (and sanitized) title/body text
        :param text: parsetree of 'reuter' child in original parsetree
        :returns: dictionary representing fields of single document entity
    """
    document = init_document()
    populate_class_label(document, text)
    populate_word_list(document, text)
    # UNCOMMENT WHEN DEBUGGING
    #print(document)
    return document

###############################################################################
############ function(s) for generating parse tree from .sgm files ############
###############################################################################

def generate_tree(text):
    """ function: generate_tree
        -----------------------
        extract well-formatted tree from poorly-formatted sgml @text
        :param text: string representing sgml text for a set of articles
        :returns: parsetree @tree of the structured @text
    """
    return BeautifulSoup(text, "html.parser")

###############################################################################
########## function(s) for generating parse trees & document objects ##########
###############################################################################

def parse_documents():
    """ function: parse_document
        ------------------------
        extract list of Document objects from token list
        :returns: list of document entities generated by generate_document()
    """
    documents = []
    # generate well-formatted document set for each file
    for file in os.listdir('DATA2'):
        # open 'reut2-XXX.sgm' file from /data directory
        data = open(os.path.join(os.getcwd(), "DATA2", file), 'r')
        print("file",file)
        text = data.read()
        data.close()
        tree = generate_tree(text.lower())
        # separate segments & generate documents
        for reuter in tree.find_all("reuters"):
            document = generate_document(reuter)
            documents.append(document)
        print("Finished extracting information from file:", file)
    return documents

###############################################################################
##################### function(s) for generating lexicon ######################
###############################################################################

def generate_lexicon(documents):
    """ function: generate_lexicon
        --------------------------
        generate title/body lexicon for feature selection
        :param documents: objects to generate lexicon information
        :returns: dictionary of sets for title & body lexicons
    """
    lexicon = { 'title' : set(), 'body' : set() }
    for document in documents:
        for term in document['words']['title']:
            lexicon['title'].add(term)
        for term in document['words']['body']:
            lexicon['body'].add(term)
    return lexicon

###############################################################################
################## main function - single point of execution ##################
###############################################################################

def main(argv):
    """ function: main
        --------------
        sanitize input files into well-formatted, processable objects
        generate dataset (feature vectors, class labels) for .sgm file set:
        :param argv: command line arguments - no purpose at the moment
    """
    # generate list of document objects for feature selection
    print('Generating document objects. This may take some time...')
    #documents = parse_documents()

if __name__ == "__main__":
    main(sys.argv[1:])


Generating document objects. This may take some time...


**Calling parse_documents function and storing it in documents object**

In [3]:
documents = parse_documents()

file reut2-004.sgm
Finished extracting information from file: reut2-004.sgm
file reut2-010.sgm
Finished extracting information from file: reut2-010.sgm
file reut2-011.sgm
Finished extracting information from file: reut2-011.sgm
file reut2-005.sgm
Finished extracting information from file: reut2-005.sgm
file reut2-013.sgm
Finished extracting information from file: reut2-013.sgm
file reut2-007.sgm
Finished extracting information from file: reut2-007.sgm
file reut2-006.sgm
Finished extracting information from file: reut2-006.sgm
file reut2-012.sgm
Finished extracting information from file: reut2-012.sgm
file reut2-016.sgm
Finished extracting information from file: reut2-016.sgm
file reut2-002.sgm
Finished extracting information from file: reut2-002.sgm
file reut2-003.sgm
Finished extracting information from file: reut2-003.sgm
file reut2-001.sgm
Finished extracting information from file: reut2-001.sgm
file reut2-015.sgm
Finished extracting information from file: reut2-015.sgm
file reut2-0

In [4]:
documents

[{'topics': [],
  'places': ['canada'],
  'title': ['major', 'impact', 'remov'],
  'body': ['said',
   'expect',
   'earlier',
   'report',
   'remov',
   'jone',
   'industri',
   'index',
   'make',
   'major',
   'impact',
   'compani',
   'stock',
   'think',
   'individu',
   'institut',
   'share',
   'jone',
   'spokesman',
   'said',
   'repli',
   'queri',
   'close',
   'lower',
   'second',
   'activ',
   'trade',
   'toronto',
   'stock',
   'exchang',
   'wall',
   'street',
   'journal',
   'select',
   'index',
   'said',
   'drop',
   'make',
   'index',
   'repres',
   'market',
   'world',
   'largest',
   'nickel',
   'produc',
   'member',
   'index',
   'replac',
   'effect',
   'tomorrow',
   'nickel',
   'analyst',
   'marten',
   'cochran',
   'murray',
   'said',
   'remov',
   'index',
   'like',
   'spark',
   'sell',
   'pressur',
   'stock',
   'investor',
   'suddenli',
   'well',
   'stock',
   'elimin',
   'invest',
   'said',
   'marten',
   'move',
   

In [5]:
df_docs = pd.DataFrame.from_dict(documents)

In [11]:
import re
def split_it(df_docs):
    for col in df_docs.columns:
        df_docs[col] = df_docs[col].apply(str)
        df_docs[col] =  df_docs[col].apply(lambda x: x.replace('[','').replace(']','').replace('\'','')) 
    return df_docs

df_filter = split_it(df_docs)
df_filter

Unnamed: 0,body,places,title,topics
0,"said, expect, earlier, report, remov, jone, in...",canada,"major, impact, remov",
1,"mason, former, presid, chief, oper, offic, emp...",usa,"former, empir, carolina, exec, sentenc",
2,"discoveri, complic, search, vaccin, team, armi...",usa,"doctor, find, link, smallpox, viru",
3,"doctor, center, diseas, control, atlanta, said...",usa,"birth, control, pill, help, prevent, cancer, s...",
4,"econom, data, week, determin, interest, rate, ...",usa,"econom, data, debt, futur, outlook","interest, retail, ipi"
5,"reagan, administr, respond, last, year, unit, ...",usa,"action, program, africa",
6,"novel, type, financ, texa, instrument, market,...",usa,"unusu, texa, instrument, prefer, price",
7,"canadian, foreign, secretari, clark, wind, vis...","usa, canada","clark, expect, action, acid, rain",
8,"ford, motor, said, check, distribut, employe, ...",usa,"ford, motor, distribut, profit, share",
9,"jamaica, billion, foreign, debt, reduc, oblig,...",jamaica,"jamaica, borrow",


**Storing the dataframe in CSV**

In [12]:
df_docs.to_csv('Documents_Complete.csv', index = False)