In [1]:
import os
import re

# converts a txt file to the form of [string, int][]
def clean(path):
    res = []
    
    with open(path) as f:
        lines = f.readlines()
        
        for line in lines:
            words = line.split()
            # remove all leading and trailing nonalphabetic characters in each word and transform into lowercase
            # ex: -I'm, -> i'm
            sentence = ' '.join([re.sub(r'^[^a-zA-Z]+|[^a-zA-Z]+$','' , word).lower() for word in words[:-1]])
            label = int(words[-1])
            res.append([sentence, label])

    return res

# gives each word an unique number (starting from 0)
def encode(words):
    res = {}

    for word in words:
        if word not in res:
            res[word] = len(res)
            
    return res


def construct_matrix(sentences, word2num):
    M = len(sentences)
    N = len(word2num)
    res = [[0] * N for i in range(M)]

    for i, sentence in enumerate(sentences):
        for word in sentence.split():
            res[i][word2num[word]] += 1

    return res

In [2]:
# get all txt files under ./data except for readme.txt
FILES = [os.path.join('./data', f) for f in os.listdir('./data') if f.endswith('.txt') and f != 'readme.txt']
data = []

for file in FILES:
    data += clean(file)

print(len(data))
print(data[0])

3000
['a very very very slow-moving aimless movie about a distressed drifting young man', 0]


In [3]:
words = []

for x in data:
    for word in x[0].split():
        words.append(word)

word2num = encode(words)

print(len(word2num))
print([(word, num) for word, num in word2num.items() if num < 5])

5339
[('a', 0), ('very', 1), ('slow-moving', 2), ('aimless', 3), ('movie', 4)]


In [4]:
D = construct_matrix([x[0] for x in data], word2num)

print(len(D), len(D[0]))
print(D[0][:20])

3000 5339
[2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [5]:
# HW3 starts here

import numpy as np

col_sums = np.sum(np.array(D), axis=0)
print(col_sums[:10])

[886 243   1   1 177  85   1   1   4  14]


In [6]:
import nltk

nltk.download('stopwords')
STOP_WORDS = set(nltk.corpus.stopwords.words('english'))
print(list(STOP_WORDS)[:10])

['off', 'before', 'd', 'then', "it's", 'all', 'into', "you'll", 'them', 'once']


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kennycartman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
words_sorted_by_freq = np.argsort(col_sums)[::-1] # descending order
print(words_sorted_by_freq[:10])

[ 16  32  77   0  57  50  75 117  23  13]


In [8]:
num2word = {num: word for word, num in word2num.items()}
print(len(num2word))
print([(num, word) for num, word in num2word.items() if num < 10])

5339
[(0, 'a'), (1, 'very'), (2, 'slow-moving'), (3, 'aimless'), (4, 'movie'), (5, 'about'), (6, 'distressed'), (7, 'drifting'), (8, 'young'), (9, 'man')]


In [9]:
filtered_words_sorted_by_freq = [int(w) for w in words_sorted_by_freq if num2word[int(w)] not in STOP_WORDS] # remove stop words
print(len(filtered_words_sorted_by_freq))
print(filtered_words_sorted_by_freq[:10]) # number form
print([num2word[w] for w in filtered_words_sorted_by_freq[:10]]) # word form

5195
[87, 455, 4, 4412, 147, 231, 3181, 314, 1600, 250]
['good', 'great', 'movie', 'phone', 'film', 'one', 'food', 'like', 'place', 'time']


In [39]:
K = 1000
features = filtered_words_sorted_by_freq[:K]
print(features[:10])
features.sort()
print(features[:10])

[87, 455, 4, 4412, 147, 231, 3181, 314, 1600, 250]
[4, 8, 9, 11, 15, 17, 18, 20, 21, 22]


In [40]:
D_reduced = np.array(D)[:, features]
print(len(D_reduced), len(D_reduced[0]))
print(np.sum(np.array(D_reduced), axis=0)[:10])
print(D_reduced[0][:10])

3000 1000
[177   4  14  16   9   4  35   5   4   9]
[1 1 1 0 0 0 0 0 0 0]
