In [1]:
from glob import glob

In [2]:
glob("./dataset/*")

['./dataset/6',
 './dataset/1',
 './dataset/4',
 './dataset/3',
 './dataset/2',
 './dataset/5']

In [3]:
glob("./dataset/1/Test/*")

['./dataset/1/Test/38215',
 './dataset/1/Test/38224',
 './dataset/1/Test/38223',
 './dataset/1/Test/38222',
 './dataset/1/Test/38225',
 './dataset/1/Test/38214',
 './dataset/1/Test/38216',
 './dataset/1/Test/38220',
 './dataset/1/Test/38227',
 './dataset/1/Test/38218',
 './dataset/1/Test/38226',
 './dataset/1/Test/38219',
 './dataset/1/Test/38221',
 './dataset/1/Test/38217',
 './dataset/1/Test/38228']

In [29]:
from nltk import SnowballStemmer, PorterStemmer

def extract_words(dir, stemmer=None):
    with open(dir, errors="ignore") as f:
        words = []
        for l in f.readlines():
            l = l.replace("\n", "")
            if l == "":
                continue
            splitted = l.split(" ")
            splitted = [s for s in splitted if s != ""]
            if len(splitted) == 0:
                continue

            if stemmer is not None:
                splitted = [stemmer.stem(s) for s in splitted]
                
            words.extend(splitted)

    return words

sb_stemmer = SnowballStemmer("english")
pt_stemmer = PorterStemmer()
words = extract_words("./dataset/2/Train/74770")
len(words), words[:10]

(244,
 ['Path:',
  'cantaloupe.srv.cs.cmu.edu!rochester!news.crd.ge.com!rpi!uwm.edu!caen!malgudi.oar.net!ucbeh.san.uc.edu!ucunix.san.uc.edu!uceng.uc.edu!cccbbs!sam.halperin',
  'Newsgroups:',
  'misc.forsale',
  'Subject:',
  'Price',
  'quote',
  'wanted',
  'Message-ID:',
  '<30431.93.uupcb@cccbbs.UUCP>'])

In [30]:
ws1 = extract_words("./dataset/2/Train/74770", sb_stemmer)
ws2 = extract_words("./dataset/2/Train/74770", pt_stemmer)
for w1, w2 in zip(ws1, ws2):
    if w1 != w2:
        print(w1, w2)

bus bu
70ns 70n
bus bu
dos do
this thi
high highli


In [54]:
from pathlib import Path

def load_train(base_dir, stemmer=None):
    """
    load train files and returns `class_to_words` = {
        # class 
        1: {
            "random": 10, # word: number of the occurences
            ...
        },
        ...
    }
    """
    class_to_words = {}
    for i in range(1, 7):
        dir = base_dir.joinpath(f"{i}/Train/*")
        file_paths = glob(str(dir))
        words = set()
        for p in file_paths:
            words.update(extract_words(p, stemmer))

        class_to_words[i] = words
            
        break

    return class_to_words

base_dir = Path("./dataset")
c2w = load_train(base_dir, sb_stemmer)

In [55]:
c2w

{1: {'liberti',
  'graeme>',
  'crt',
  'yesterday...',
  'enter',
  'complaint',
  'pure',
  'language",',
  'bandwidths,',
  '<1993apr14.102007.20664@uk03.bull.co.uk>',
  'silver)',
  '**\t\t\t\t\t**',
  'grafsi',
  'experi',
  '**************************************************************************',
  '%%creator:',
  'staff',
  'similar',
  '<c4ucos.8rs@cvtstu.cvt.stuba.cs>',
  'years,',
  'accompani',
  'big',
  'kass)',
  'respond',
  'out',
  '22',
  'petro',
  'less',
  '(v1',
  'recommend',
  'sorry!',
  "i'll",
  'turbo',
  'non-commerci',
  'applic',
  'becom',
  '(scrowe@hemel.bull.co.uk)',
  'previous',
  '(prefer',
  'news@julian.uwo.ca',
  '<1993apr14.044946.12144@labtam.labtam.oz.au>',
  'whether',
  'park,ca)',
  '**\ttel:\t(415)',
  'ocr104.zip',
  'magic',
  'to',
  'trb3@ra.msstate.edu',
  '\twell....i',
  'far',
  '<1qi19q$6t2@acsc.com>',
  'pov??',
  'spec',
  'client/serv',
  '(kenneth',
  'coeffici',
  'management/interact',
  '+49-221-20189-17',
  'outlined.

In [56]:
len(c2w[1])

5717