In [1]:
import os
import re

# converts a txt file to the form of [string, int][]
def clean(path):
    res = []
    
    with open(path) as f:
        lines = f.readlines()
        
        for line in lines:
            words = line.split()
            # remove all leading and trailing nonalphabetic characters in each word and transform into lowercase
            # ex: -I'm, -> i'm
            sentence = ' '.join([re.sub(r'^[^a-zA-Z]+|[^a-zA-Z]+$','' , word).lower() for word in words[:-1]])
            label = int(words[-1])
            res.append([sentence, label])

    return res

# gives each word an unique number (starting from 0)
def encode(words):
    res = {}

    for word in words:
        if word not in res:
            res[word] = len(res)
            
    return res


def construct_matrix(sentences, word2num):
    M = len(sentences)
    N = len(word2num)
    res = [[0] * N for i in range(M)]

    for i, sentence in enumerate(sentences):
        for word in sentence.split(' '):
            res[i][word2num[word]] += 1

    return res

In [2]:
# get all txt files under ./data except for readme.txt
FILES = [os.path.join('./data', f) for f in os.listdir('./data') if f.endswith('.txt') and f != 'readme.txt']
data = []

for file in FILES:
    data += clean(file)

print(len(data))
print(data[0])

3000
['a very very very slow-moving aimless movie about a distressed drifting young man', 0]


In [3]:
words = []

for x in data:
    for word in x[0].split(' '):
        words.append(word)

word2num = encode(words)

print(len(word2num))
print([(word, num) for word, num in word2num.items() if num < 5])

5340
[('a', 0), ('very', 1), ('slow-moving', 2), ('aimless', 3), ('movie', 4)]


In [4]:
D = construct_matrix([x[0] for x in data], word2num)

print(len(D), len(D[0]))
print(D[0][:20])

3000 5340
[2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
