In [1]:
import glob
import os
import re
from collections import Counter, defaultdict

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [8]:
# Root folders of the training and test splits, relative to the working directory.
train_dir, test_dir = '20news-bydate-train', '20news-bydate-test'

In [9]:
# One entry per sub-directory of each split.
# glob returns matches in arbitrary (filesystem) order, and get_data turns the
# position in these lists into the numeric label, so both lists are sorted to
# keep label IDs reproducible across runs and -- provided both splits contain
# the same sub-directory names -- identical between train and test.
list_train_dirs = sorted(glob.glob(train_dir + '/*'))
list_test_dirs = sorted(glob.glob(test_dir + '/*'))

In [46]:
def get_data(list_dirs):
    """Read and normalise every document found under the given directories.

    Each file is lower-cased, split on runs of non-word characters, filtered
    against the module-level ``stop_words`` and stemmed with the module-level
    ``stemmer`` (both must be defined before this function is called).

    Parameters
    ----------
    list_dirs : list of str
        Directories to scan; the position of a directory in this list becomes
        the numeric label of every file inside it.

    Returns
    -------
    list of str
        One record per file, formatted ``label<fff>file_name<fff>content``,
        where ``content`` is the space-joined stemmed tokens on a single line.

    Raises
    ------
    AssertionError
        If the processed content of a file is not exactly one line
        (this includes a completely empty file).
    """
    # Set membership is O(1); testing every token against a list is not.
    stop_set = set(stop_words)
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    token_splitter = re.compile(r'\W+')

    data = []
    for group_id, newsgroup_dir in enumerate(list_dirs):
        # Sorted so the order of the returned records is reproducible.
        for text_dir in sorted(glob.glob(newsgroup_dir + '/*')):
            # Decode explicitly: the default codec is platform dependent and
            # can raise UnicodeDecodeError; latin-1 maps every byte.
            # NOTE(review): assumes the corpus is single-byte text -- confirm.
            with open(text_dir, encoding='latin-1') as f:
                text = f.read().lower()

            words = [stemmer.stem(word)
                     for word in token_splitter.split(text)
                     if word not in stop_set]
            content = ' '.join(words)
            assert len(content.splitlines()) == 1

            # Use the real file name as document ID. Slicing the last five
            # characters truncated longer names and, for shorter names,
            # included the path separator.
            doc_id = os.path.basename(text_dir)
            data.append(str(group_id) + '<fff>' + doc_id + '<fff>' + content)

    return data
                

In [11]:
# from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer

In [12]:
# Shared preprocessing resources; get_data reads both names as globals, so
# they must be defined before it is called (they were commented out, which
# made the notebook fail with NameError on a fresh kernel).
# stopwords.words needs the NLTK 'stopwords' corpus to be available locally.
stemmer = PorterStemmer()
stop_words = stopwords.words('english')

In [48]:
# Run the same preprocessing over each split; records come back as
# 'label<fff>doc_id<fff>content' strings.
train_data = get_data(list_dirs=list_train_dirs)
test_data = get_data(list_dirs=list_test_dirs)

In [50]:
# Persist each split as a text file with one record per line.
# open(..., 'w') does not create missing folders, so make sure the output
# directory exists before writing.
os.makedirs('data_processed', exist_ok=True)

full_data = train_data + test_data

for split_name, split_data in (('train', train_data),
                               ('test', test_data),
                               ('full', full_data)):
    with open('data_processed/20news-' + split_name + '-processed.txt', 'w') as f:
        f.write('\n'.join(split_data))

In [51]:
# Reload the three processed files from disk, replacing the in-memory lists.
# Iterating a text file yields its lines with their line terminators kept.
train_data, test_data, full_data = [], [], []

with open('data_processed/20news-train-processed.txt') as f:
    train_data = list(f)

with open('data_processed/20news-test-processed.txt') as f:
    test_data = list(f)

with open('data_processed/20news-full-processed.txt') as f:
    full_data = list(f)

In [38]:
def generate_vocabulary(data_path, min_doc_freq=10,
                        output_path='data_processed/words_idfs.txt'):
    """Build an IDF-weighted vocabulary from a processed corpus file.

    Every line of ``data_path`` is one document whose text is the last
    ``<fff>``-separated field. A word is kept when it occurs in more than
    ``min_doc_freq`` documents and is not made up of digits only. Its IDF is
    ``log10(number_of_documents / document_frequency)``.

    The result is written to ``output_path`` as ``word<fff>idf`` lines, ordered
    by decreasing IDF; the vocabulary size is printed. Nothing is returned.

    Parameters
    ----------
    data_path : str
        Path of the processed corpus file.
    min_doc_freq : int, optional
        Exclusive lower bound on the document frequency of kept words.
    output_path : str, optional
        Destination of the vocabulary file.
    """
    def compute_idf(df, corpus_size):
        assert df > 0
        return np.log10(corpus_size * 1. / df)

    with open(data_path) as f:
        lines = f.read().splitlines()
    corpus_size = len(lines)

    # Document frequency: each word is counted at most once per document.
    doc_count = defaultdict(int)
    for line in lines:
        text = line.split('<fff>')[-1]
        for word in set(text.split()):
            doc_count[word] += 1

    words_idfs = [(word, compute_idf(document_freq, corpus_size))
                  for word, document_freq in doc_count.items()
                  if document_freq > min_doc_freq and not word.isdigit()]

    # Tie-break on the word itself: words with equal IDF were otherwise left
    # in set-iteration order, which varies between interpreter runs, so the
    # line positions in the output file were not reproducible.
    words_idfs.sort(key=lambda word_idf: (-word_idf[1], word_idf[0]))
    print('Vocabulary size:', len(words_idfs))

    with open(output_path, 'w') as f:
        f.write('\n'.join([word + '<fff>' + str(idf) for word, idf in words_idfs]))

In [54]:
# Build the vocabulary from the combined corpus file written earlier.
data_path = 'data_processed/20news-full-processed.txt'
generate_vocabulary(data_path=data_path)

Vocabulary size: 14212


In [70]:
def get_tf_idf(data_path, vocab_path='data_processed/words_idfs.txt',
               output_path='data_processed/data_tf_idf.txt'):
    """Convert processed documents into L2-normalised sparse TF-IDF vectors.

    ``vocab_path`` holds ``word<fff>idf`` lines; the zero-based line position
    of a word is its feature ID. ``data_path`` holds
    ``label<fff>doc_id<fff>text`` lines. For every document, words missing from
    the vocabulary are ignored and each remaining word gets the weight
    ``term_freq / max_term_freq * idf``; the vector is then divided by its
    Euclidean norm.

    The result is written to ``output_path`` as
    ``label<fff>doc_id<fff>id:value id:value ...`` lines with features ordered
    by ID. A document without any in-vocabulary word gets an empty feature
    list. Nothing is returned.

    Parameters
    ----------
    data_path : str
        Path of the processed corpus file.
    vocab_path : str, optional
        Path of the vocabulary file.
    output_path : str, optional
        Destination of the TF-IDF file.
    """
    with open(vocab_path) as f:
        words_idfs = []
        for line in f.read().splitlines():
            fields = line.split('<fff>')
            words_idfs.append((fields[0], float(fields[1])))

    word_IDs = {word: index for index, (word, _) in enumerate(words_idfs)}
    idfs = dict(words_idfs)

    with open(data_path) as f:
        documents = []
        for line in f.read().splitlines():
            fields = line.split('<fff>')  # split once instead of once per field
            documents.append((int(fields[0]), fields[1], fields[2]))

    data_tf_idf = []
    for label, doc_id, text in documents:
        # One pass over the tokens instead of list.count() per distinct word.
        term_freqs = Counter(word for word in text.split() if word in idfs)

        words_tfidfs = []
        if term_freqs:  # max() of an empty sequence raises ValueError
            max_term_freq = max(term_freqs.values())
            # Sorted by feature ID so the output does not depend on hash order.
            words_tfidfs = sorted(
                (word_IDs[word], term_freq * 1. / max_term_freq * idfs[word])
                for word, term_freq in term_freqs.items())

        norm = np.sqrt(sum(value ** 2 for _, value in words_tfidfs))
        if norm == 0:
            # Every weight is zero (or there are none); dividing by the norm
            # would yield nan, so leave the zero weights unchanged.
            norm = 1.0

        sparse_rep = ' '.join(str(index) + ':' + str(value / norm)
                              for index, value in words_tfidfs)
        data_tf_idf.append((label, doc_id, sparse_rep))

    with open(output_path, 'w') as f:
        f.write('\n'.join([str(label) + '<fff>' + str(doc_id) + '<fff>' + sparse_rep
                           for label, doc_id, sparse_rep in data_tf_idf]))
            
            

In [68]:
# Vectorise the corpus file referenced by data_path.
get_tf_idf(data_path=data_path)

(0, '49960', 'mathew mathew manti co uk subject alt atheism faq atheist resourc summari book address music anyth relat atheism keyword faq atheism book music fiction address contact expir thu 29 apr 1993 11 57 19 gmt distribut world organ manti consult cambridg uk supersed 19930301143317 manti co uk line 290 archiv name atheism resourc alt atheism archiv name resourc last modifi 11 decemb 1992 version 1 0 atheist resourc address atheist organ usa freedom religion foundat darwin fish bumper sticker assort atheist paraphernalia avail freedom religion foundat us write ffrf p box 750 madison wi 53701 telephon 608 256 8900 evolut design evolut design sell darwin fish fish symbol like one christian stick car feet word darwin written insid delux mould 3d plastic fish 4 95 postpaid us write evolut design 7119 laurel canyon 4 north hollywood ca 91605 peopl san francisco bay area get darwin fish lynn gold tri mail figmo netcom com net peopl go lynn directli price 4 95 per fish american atheist p