In [123]:
import numpy as np
import pandas as pd
from urllib.request import urlopen
import gzip
import os

In [124]:
# File-name templates for the UCI bag-of-words files; '{}' is replaced by the
# dataset name (e.g. 'enron'). The gzipped docword name is the unzipped name
# with a '.gz' suffix.
unzipped_docword_file_base_name = "docword.{}.txt"
docword_file_base_name = unzipped_docword_file_base_name + ".gz"
vocab_file_base_name = "vocab.{}.txt"

def check_file_existence(path):
    """Report whether ``path`` points to an existing regular file.

    Prints a "not found" message when the file is missing.

    Parameters
    ----------
    path : str
        File-system path to test.

    Returns
    -------
    bool
        True if ``path`` is an existing regular file, False otherwise
        (including when it is a directory).
    """
    if os.path.isfile(path):
        return True

    print("File {} not found.".format(path))
    return False

def get_vocabulary_data_set(requested_dataset,
               base_url='https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/'):
    """Download a vocabulary file and return it as a DataFrame.

    Parameters
    ----------
    requested_dataset : str or iterable of str
        Dataset name(s) substituted into ``vocab_file_base_name``. A bare
        string is treated as a single dataset name. The returned frame holds
        exactly one vocabulary: when several names are given, the last one
        is used (as before), and the earlier ones are no longer downloaded
        only to be discarded.
    base_url : str
        URL prefix the vocabulary file name is appended to; it must end with
        a path separator.

    Returns
    -------
    pandas.DataFrame
        A single ``word`` column, one row per line of the vocabulary file,
        with an index named ``word_id`` that starts at 1.

    Raises
    ------
    ValueError
        If no dataset name is supplied.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(requested_dataset, str):
        names = [requested_dataset]
    else:
        names = list(requested_dataset)

    if not names:
        raise ValueError("requested_dataset must contain at least one dataset name.")

    data_set = names[-1]

    # Vocabulary file: one word per line.
    with urlopen(base_url + vocab_file_base_name.format(data_set)) as response:
        # Fall back to UTF-8 when the server does not declare a charset.
        encoding = response.headers.get_content_charset('utf-8')
        words = response.read().decode(encoding).splitlines()

    # Load the vocabulary into the dataframe.
    df = pd.DataFrame(data=words, columns=['word'])

    # Shift the index by one so the first word gets word_id 1.
    df.index += 1

    # Rename the index column.
    df.index.name = 'word_id'

    return df

In [125]:
# Confirm that the unzipped Enron docword file is present in the local data folder.

folder_name = 'docword_files/'
file_name = unzipped_docword_file_base_name.format('enron')
path = os.path.join(folder_name, file_name)
check_file_existence(path)

True

In [126]:
# Dataset names to fetch; get_vocabulary_data_set iterates over this list.
requested_dataset = ['enron']

# Download the vocabulary file and load it into a DataFrame indexed by word_id.
df_vocab = get_vocabulary_data_set(requested_dataset)

In [127]:
# Preview the first rows of the vocabulary (index: word_id, column: word).
df_vocab.head()

Unnamed: 0_level_0,word
word_id,Unnamed: 1_level_1
1,aaa
2,aaas
3,aactive
4,aadvantage
5,aaker


In [128]:
# `size` counts cells (rows x columns); df_vocab has the single column 'word',
# so this equals the number of vocabulary entries.
df_vocab.size

28102

In [129]:
def get_docword_data_set(data_set_name, folder_name='docword_files/'):
    """Load an unzipped docword file: header metadata plus the count table.

    The first three lines of the file are read as header values; every
    following line is parsed as a space-separated row.

    Parameters
    ----------
    data_set_name : str
        Dataset name substituted into ``unzipped_docword_file_base_name``.
    folder_name : str
        Folder that contains the unzipped docword file.

    Returns
    -------
    info : dict
        Header values as strings under the keys ``docs_num``, ``words_num``
        and ``non_zero_num`` (first, second and third line of the file).
    df : pandas.DataFrame
        Remaining lines with columns ``document_id``, ``word_id``, ``count``.

    Raises
    ------
    FileNotFoundError
        If the docword file does not exist. Previously this surfaced only
        later, from ``pd.read_csv``, after an empty ``info`` had been built.
    """
    header_keys = ('docs_num', 'words_num', 'non_zero_num')
    path = os.path.join(folder_name, unzipped_docword_file_base_name.format(data_set_name))

    # check_file_existence already prints a "not found" message; fail here
    # rather than continuing with an empty header.
    if not check_file_existence(path):
        raise FileNotFoundError("Docword file {} does not exist.".format(path))
    print('File found')

    # zip stops once the header keys are exhausted, so only the first three
    # lines are consumed here.
    with open(path, 'r', encoding='utf-8') as file:
        info = {key: line.strip() for key, line in zip(header_keys, file)}

    # Now load the rest of the file, skipping the header lines.
    df = pd.read_csv(path, sep=" ", header=None, skiprows=list(range(len(header_keys))))
    df.columns = ["document_id", "word_id", "count"]

    return info, df

In [130]:
# Load the Enron docword file: header metadata plus the
# (document_id, word_id, count) table.
info,df_docword = get_docword_data_set('enron')

File found


In [131]:
# Header values read from the first three lines of the docword file (kept as strings).
info

{'docs_num': '39861', 'words_num': '28102', 'non_zero_num': '3710420'}

In [153]:
# Number of rows loaded; expected to match info['non_zero_num'] (3710420).
df_docword['count'].size

3710420

In [115]:
# Re-display the vocabulary (indexed by word_id) before merging it with the docword table.
df_vocab.head()

Unnamed: 0_level_0,word
word_id,Unnamed: 1_level_1
1,aaa
2,aaas
3,aactive
4,aadvantage
5,aaker


In [116]:
# Attach each word to its docword rows: the vocabulary index (word_id) is
# matched against the docword 'word_id' column.
merged = df_vocab.merge(df_docword, left_index=True, right_on='word_id')

In [117]:
# Preview only the first rows; the merged frame has millions of rows and
# should not be displayed in full.
merged.head(10)

Unnamed: 0,word,document_id,word_id,count
95356,aaa,1684,1,1
142343,aaa,2093,1,1
147317,aaa,2182,1,1
152649,aaa,2281,1,1
173685,aaa,2609,1,1
263217,aaa,4225,1,1
387270,aaa,5257,1,1
387511,aaa,5263,1,1
387851,aaa,5267,1,1
433995,aaa,5900,1,1


In [162]:
# Spot check: non-null entries per column among the merged rows for word_id 1996.
merged[merged['word_id'] == 1996].count()

word           10
document_id    10
word_id        10
count          10
dtype: int64

In [161]:
# Same spot check on the un-merged docword table; the per-column counts should
# agree with the merged frame for this word_id.
df_docword[df_docword['word_id'] == 1996].count()

document_id    10
word_id        10
count          10
dtype: int64

In [154]:
# Total number of rows in the merged frame.
merged['count'].size

3710420

In [155]:
# Total number of rows in the original docword table, for comparison with the merged frame.
df_docword['count'].size

3710420

In [157]:
# Difference in row counts between the merged frame and the docword table;
# 0 means the two row counts match.
merged['count'].size - df_docword['count'].size

0