# 20N Newsgroup

In [1]:
from xml.dom import minidom
import os, nltk, re
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open
import numpy as np

In [2]:
characters_to_remove = '!()#@~,."><*=-'
pattern = "[" + characters_to_remove + "]"
p = PorterStemmer()
tokenizer = nltk.RegexpTokenizer(r'\w+')
freq_to_remove = 1

In [3]:
def process(p, tokenizer, text):
    """ Applies standard pre-processing to given text.
    
    Args:
        p (gensim.parsing.porter.PorterStemmer): stemmer object.
        tokenizer (nltk.tokenize.regexp.RegexpTokenizer): tokenizr object.
        text (str): text to preprocess.
    
    Returns:
        list: preprocessed text.
    
    """
    # Converts to lowercase
    doc_nor = text.lower()
    
    # Removes stopwords
    doc_sw = remove_stopwords(doc_nor)
    
    # Stems text
    doc_stem = p.stem_sentence(doc_sw)
    
    # Lemmatizes text
    # TODO: Lemmatizer
    
    # Returns preprocessed text
    return tokenizer.tokenize(doc_stem)

In [4]:
categories = os.listdir('./data/20news')
category_index = {}
for i, cat in enumerate(categories):
    d = {cat: i}
    category_index.update(d)
print(category_index)

{'comp.sys.ibm.pc.hardware': 0, 'soc.religion.christian': 1, 'sci.med': 2, 'talk.politics.misc': 3, 'talk.religion.misc': 4, 'comp.os.ms-windows.misc': 5, 'sci.crypt': 6, 'alt.atheism': 7, 'sci.space': 8, 'talk.politics.guns': 9, 'talk.politics.mideast': 10, 'comp.graphics': 11, 'rec.motorcycles': 12, 'comp.windows.x': 13, 'comp.sys.mac.hardware': 14, 'rec.autos': 15, 'rec.sport.hockey': 16, 'rec.sport.baseball': 17, 'sci.electronics': 18, 'misc.forsale': 19}


In [5]:
listed_text = []
listed_categories = []
for category in categories:
    files = os.listdir('./data/20news/' + category)
    for file in files:
        doc = open('./data/20news/' + category + '/' + file, encoding = 'ISO-8859-1',mode='r')
        text = re.sub(pattern, "", doc.read().replace('\n', '').replace('  ', ''))
        listed_text.append(process(p, tokenizer, text))
        listed_categories.append(category)
        doc.close()

In [9]:
dictionary = corpora.Dictionary(listed_text)
dictionary.filter_extremes(no_below=freq_to_remove)
dictionary.save('./resources/20news/vocab20news.dict')
doc_corpus = []
for doc in listed_text:
    doc_corpus.append(dictionary.doc2bow(doc))
print('Dictionary length: ' + str(len(dictionary)))

Dictionary length: 100000


In [8]:
bool_bow = np.zeros((len(doc_corpus), len(dictionary) + 1), dtype=np.int8)
bow = np.zeros((len(doc_corpus), len(dictionary) + 1), dtype=np.int8)
for index, doc in enumerate(doc_corpus):
    bool_bow[index, -1] = category_index[listed_categories[index]]
    bow[index, -1] = category_index[listed_categories[index]]
    for item in doc:
        bool_bow[index, item[0]] = 1
        bow[index, item[0]] = item[1]
np.save('./resources/20news/bool_bow_matrix.npy', bool_bow)
np.save('./resources/20news/bow_matrix.npy', bow)