In [1]:
import numpy as np
import os
import os.path
from collections import defaultdict

In [2]:
# Root directory of the 20 Newsgroups "bydate" dataset. Later cells also read
# stop_words.txt from this directory and write their generated files into it.
path = r'..\datasets\20news-bydate'

def gather_20newgroup_datas(root_dir=None):
    """Locate the train/test directories of the dataset and list its newsgroups.

    Args:
        root_dir: Directory that holds the train and test sub-directories.
            Defaults to the module-level ``path``.

    Returns:
        A ``(train_dir, test_dir, newsgroups_list)`` tuple. Both directory
        strings end with a path separator, so callers may append names by
        plain concatenation. ``newsgroups_list`` holds the sorted names of the
        sub-directories of the train directory.

    Raises:
        FileNotFoundError: If ``root_dir`` has no sub-directory whose name
            contains 'train', or no second sub-directory to use as test split.
    """
    if root_dir is None:
        root_dir = path

    # Keep directories only: files stored next to the dataset folders (e.g. the
    # stop-word list or generated output files) must never be mistaken for a
    # split directory. Sorting makes the choice independent of listing order.
    subdirs = sorted(
        name for name in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, name))
    )

    # Match on the directory name itself, not on the full path, so a 'train'
    # somewhere in root_dir cannot influence the decision.
    train_names = [name for name in subdirs if 'train' in name]
    if not train_names:
        raise FileNotFoundError("no 'train' directory found under " + root_dir)
    train_name = train_names[0]

    others = [name for name in subdirs if name != train_name]
    if not others:
        raise FileNotFoundError("no test directory found under " + root_dir)
    # Prefer a directory named like a test split; otherwise fall back to the
    # first remaining directory.
    test_name = next((name for name in others if 'test' in name), others[0])

    # Joining with '' appends the trailing separator callers rely on.
    train_dir = os.path.join(root_dir, train_name, '')
    test_dir = os.path.join(root_dir, test_name, '')

    newsgroups_list = sorted(
        name for name in os.listdir(train_dir)
        if os.path.isdir(os.path.join(train_dir, name))
    )

    return train_dir, test_dir, newsgroups_list

# Resolve the split directories and the sorted newsgroup names once; the cells
# below reuse these three values.
train_dir, test_dir, newsgroups_list = gather_20newgroup_datas()

In [3]:
# Load the stop-word list: one entry per line of stop_words.txt.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(path + '\\stop_words.txt', 'r') as f:
    stop_words = f.read().split('\n')

In [4]:
from nltk.stem.porter import PorterStemmer
import re

# Single Porter stemmer instance, used by collect_data_from for every token.
stemmer = PorterStemmer()

def collect_data_from(parent_dir, newsgroups_list):
    """Read every document below ``parent_dir`` and turn each into one text line.

    Each file is lower-cased, split on runs of non-word characters, stripped of
    the entries of the module-level ``stop_words`` and stemmed with the
    module-level ``stemmer``.

    Args:
        parent_dir: Directory containing one sub-directory per newsgroup
            (with or without a trailing path separator).
        newsgroups_list: Newsgroup directory names; the position of a name in
            this list becomes the numeric label of its documents.

    Returns:
        A list of strings shaped ``label<fff>filename<fff>content``, ordered by
        newsgroup and then by filename. ``content`` is the space-joined list of
        stemmed tokens and is empty for a document without usable tokens.
    """
    # Set membership is O(1); a list would be rescanned for every token.
    stop_set = set(stop_words)
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    token_separator = re.compile(r'\W+')

    data = []
    for label, newsgroup in enumerate(newsgroups_list):
        dir_path = os.path.join(parent_dir, newsgroup)
        filenames = sorted(
            name for name in os.listdir(dir_path)
            if os.path.isfile(os.path.join(dir_path, name))
        )

        for filename in filenames:
            with open(os.path.join(dir_path, filename), 'r') as f:
                text = f.read().lower()

            # re.split yields '' when the text starts or ends with a separator;
            # those empty tokens are not words and are dropped.
            words = [
                stemmer.stem(token)
                for token in token_separator.split(text)
                if token and token not in stop_set
            ]

            content = ' '.join(words)
            # Every record must fit on one line of the output file. Empty
            # content (no usable tokens) is allowed.
            assert len(content.splitlines()) <= 1
            data.append(str(label) + '<fff>' + filename + '<fff>' + content)

    return data


# Preprocess both splits with the same newsgroup list, so a given label number
# refers to the same newsgroup in the train and test data.
train_data = collect_data_from(train_dir, newsgroups_list)
test_data = collect_data_from(test_dir, newsgroups_list)

In [5]:
# Peek at the first processed training record.
print(train_data[:1])

['0<fff>49960<fff>mathew mathew manti uk subject alt atheism faq atheist resourc summari book address music atheism keyword faq atheism book music fiction address contact expir thu 29 apr 1993 11 57 19 gmt distribut organ manti consult cambridg uk supersed 19930301143317 manti uk line 290 archiv atheism resourc alt atheism archiv resourc modifi 11 decemb 1992 version 1 0 atheist resourc address atheist organ usa freedom religion foundat darwin fish bumper sticker assort atheist paraphernalia freedom religion foundat write ffrf box 750 madison wi 53701 telephon 608 256 8900 evolut design evolut design sell darwin fish fish symbol christian stick car feet word darwin written insid delux mould 3d plastic fish 4 95 postpaid write evolut design 7119 laurel canyon 4 north hollywood 91605 peopl san francisco bay area darwin fish lynn gold mail figmo netcom net peopl lynn directli price 4 95 fish american atheist press aap publish atheist book critiqu bibl list biblic contradict book bibl hand

In [6]:
# Number of processed documents in the train and test splits.
print(len(train_data))
print(len(test_data))

11314
7532


In [7]:
# Persist the processed records, one document per line: the train split, the
# test split, and their concatenation (train first, then test).
full_data = train_data + test_data
for split_name, split_records in (('train', train_data),
                                  ('test', test_data),
                                  ('full', full_data)):
    with open(path + '\\20news-' + split_name + '-processed.txt', 'w') as out_file:
        out_file.write('\n'.join(split_records))

In [8]:
def generate_vocabulary(data_path, output_path=None, doc_freq_cutoff=10):
    """Build the vocabulary with IDF weights from a processed corpus file.

    Every line of ``data_path`` is one document whose text is the last
    ``<fff>``-separated field. A word enters the vocabulary when it occurs in
    more than ``doc_freq_cutoff`` documents and is not made of digits only.
    Its weight is ``log10(corpus_size / document_frequency)``.

    The vocabulary is written as ``word<fff>idf`` lines, sorted by descending
    IDF with ties broken alphabetically, and its size is printed.

    Args:
        data_path: Path of the processed corpus file.
        output_path: Destination file. Defaults to ``words_idfs.txt`` inside
            the module-level ``path`` directory.
        doc_freq_cutoff: A word is kept only if its document frequency is
            strictly greater than this value.
    """
    def compute_idf(df, corpus_size):
        assert df > 0
        return np.log10(corpus_size * 1. / df)

    if output_path is None:
        output_path = path + '\\words_idfs.txt'

    with open(data_path) as f:
        lines = f.read().splitlines()
    corpus_size = len(lines)

    # Document frequency: each word counts at most once per document.
    doc_count = defaultdict(int)
    for line in lines:
        text = line.split('<fff>')[-1]
        for word in set(text.split()):
            doc_count[word] += 1

    words_idfs = [
        (word, compute_idf(doc_freq, corpus_size))
        for word, doc_freq in doc_count.items()
        if doc_freq > doc_freq_cutoff and not word.isdigit()
    ]
    # The position in this list later becomes the word id. Breaking IDF ties
    # by the word itself keeps ids identical across runs; insertion order
    # alone depends on set iteration order, which varies between sessions.
    words_idfs.sort(key=lambda word_idf: (-word_idf[1], word_idf[0]))
    print('Vocabulary size: ', len(words_idfs))

    # Write the dictionary file.
    with open(output_path, 'w') as f:
        f.write('\n'.join(word + '<fff>' + str(idf) for word, idf in words_idfs))


In [9]:
# Build the vocabulary from the combined train + test corpus file.
generate_vocabulary(data_path = path + '\\20news-full-processed.txt')

Vocabulary size:  13973


In [10]:
def get_tf_idf(data_path, dataset, words_idfs_path=None, output_path=None):
    """Convert a processed corpus file into L2-normalised sparse TF-IDF vectors.

    The vocabulary file holds ``word<fff>idf`` lines; the line index of a word
    is its id. Every line of ``data_path`` is ``label<fff>doc_id<fff>text``.
    For each in-vocabulary word of a document the weight is
    ``term_freq / max_term_freq * idf``; the vector is then divided by its
    Euclidean norm. Output lines are ``label<fff>doc_id<fff>id:value ...`` with
    entries sorted by word id.

    A document without any in-vocabulary word gets an empty vector, and a
    vector whose norm is zero is written with its (zero) values unchanged.

    Args:
        data_path: Path of the processed corpus file to vectorise.
        dataset: Tag used to build the default output file name.
        words_idfs_path: Vocabulary file. Defaults to ``words_idfs.txt`` inside
            the module-level ``path`` directory.
        output_path: Destination file. Defaults to
            ``20news_<dataset>_tfidf.txt`` inside the ``path`` directory.
    """
    if words_idfs_path is None:
        words_idfs_path = path + '\\words_idfs.txt'
    if output_path is None:
        output_path = path + '\\20news_' + dataset + '_tfidf.txt'

    words_idfs = []
    with open(words_idfs_path) as f:
        for line in f.read().splitlines():
            fields = line.split('<fff>')
            words_idfs.append((fields[0], float(fields[1])))
    words_ids = {word: index for index, (word, _) in enumerate(words_idfs)}
    idfs = dict(words_idfs)

    with open(data_path) as f:
        raw_lines = f.read().splitlines()

    output_lines = []
    for line in raw_lines:
        fields = line.split('<fff>')
        label, doc_id, text = int(fields[0]), int(fields[1]), fields[2]

        # Count in-vocabulary terms in a single pass over the document.
        term_freqs = defaultdict(int)
        for word in text.split():
            if word in idfs:
                term_freqs[word] += 1

        words_tfidfs = []
        if term_freqs:  # max() of an empty sequence would raise ValueError
            max_term_freq = max(term_freqs.values())
            # Sorting by word id makes the output order deterministic.
            words_tfidfs = sorted(
                (words_ids[word], term_freq * 1. / max_term_freq * idfs[word])
                for word, term_freq in term_freqs.items()
            )

        norm = np.sqrt(sum(value ** 2 for _, value in words_tfidfs))
        if norm > 0:
            words_tfidfs = [(index, value / norm) for index, value in words_tfidfs]
        # else: every value is already 0.0; dividing by the norm would give nan.

        sparse_sep = ' '.join(str(index) + ':' + str(value) for index, value in words_tfidfs)
        output_lines.append(str(label) + '<fff>' + str(doc_id) + '<fff>' + sparse_sep)

    with open(output_path, 'w') as f:
        f.write('\n'.join(output_lines))

In [11]:
# Vectorise the full corpus and each split; all three calls read the same
# vocabulary file, so word ids agree across the generated files.
get_tf_idf(data_path = path + '\\20news-full-processed.txt', dataset = 'full')
get_tf_idf(data_path = path + '\\20news-train-processed.txt', dataset = 'train')
get_tf_idf(data_path = path + '\\20news-test-processed.txt', dataset = 'test')