# HW#2

In [1]:
import os
import re

# converts a txt file to the form of [string, int][]
def clean(path):
    res = []
    
    with open(path) as f:
        lines = f.readlines()
        
        for line in lines:
            words = line.split()
            # remove all leading and trailing nonalphabetic characters in each word and transform into lowercase
            # ex: -I'm, -> i'm
            sentence = ' '.join([re.sub(r'^[^a-zA-Z]+|[^a-zA-Z]+$','' , word).lower() for word in words[:-1]])
            label = int(words[-1])
            res.append([sentence, label])

    return res

# gives each word an unique number (starting from 0)
def encode(words):
    res = {}

    for word in words:
        if word not in res:
            res[word] = len(res)
            
    return res


def construct_matrix(sentences, word2num):
    M = len(sentences)
    N = len(word2num)
    res = [[0] * N for i in range(M)]

    for i, sentence in enumerate(sentences):
        for word in sentence.split():
            res[i][word2num[word]] += 1

    return res

In [2]:
# get all txt files under ./data except for readme.txt
FILES = [os.path.join('./data', f) for f in os.listdir('./data') if f.endswith('.txt') and f != 'readme.txt']
data = []

for file in FILES:
    data += clean(file)

print(len(data))
print(data[0])

3000
['a very very very slow-moving aimless movie about a distressed drifting young man', 0]


In [3]:
words = []

for x in data:
    for word in x[0].split():
        words.append(word)

word2num = encode(words)

print(len(word2num))
print([(word, num) for word, num in word2num.items() if num < 5])

5339
[('a', 0), ('very', 1), ('slow-moving', 2), ('aimless', 3), ('movie', 4)]


In [4]:
D = construct_matrix([x[0] for x in data], word2num)

print(len(D), len(D[0]))
print(D[0][:20])

3000 5339
[2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# HW#3

In [34]:
import numpy as np

col_sums = np.sum(np.array(D), axis=0)
print(f'First 10 columns sums: {col_sums[:10]}')

First 10 columns sums: [886 243   1   1 177  85   1   1   4  14]


In [35]:
import nltk

nltk.download('stopwords')
STOP_WORDS = set(nltk.corpus.stopwords.words('english'))
print(f'10 stop words: {list(STOP_WORDS)[:10]}')

10 stop words: ['which', 'having', 'while', "you'll", 'is', 'i', 'are', 'will', 'than', "won't"]


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kennycartman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
words_sorted_by_freq = np.argsort(col_sums)[::-1] # descending order
print(f'Top 10 most frequent words (in number): {words_sorted_by_freq[:10]}') # number form
print(f'Top 10 most frequent words: [num2word[w] for w in words_sorted_by_freq[:10]]"') # word form

Top 10 most frequent words (in number): [ 16  32  77   0  57  50  75 117  23  13]
Top 10 most frequent words: [num2word[w] for w in words_sorted_by_freq[:10]]"


In [40]:
num2word = {num: word for word, num in word2num.items()}
print(f'num2word\'s size = {len(num2word)}')
print(f'First 5 entries in num2word: {[(num, word) for num, word in num2word.items() if num < 5]}')

num2word's size = 5339
First 5 entries in num2word: [(0, 'a'), (1, 'very'), (2, 'slow-moving'), (3, 'aimless'), (4, 'movie')]


In [46]:
filtered_words_sorted_by_freq = [int(w) for w in words_sorted_by_freq if num2word[int(w)] not in STOP_WORDS] # remove stop words
print(f'Number of words (stop words removed) = {len(filtered_words_sorted_by_freq)}')
print(f'Top 10 most frequent words (in number, stop words removed): {filtered_words_sorted_by_freq[:10]}') # number form
print(f'Top 10 most frequent words (stop words removed): {[num2word[w] for w in filtered_words_sorted_by_freq[:10]]}') # word form

Number of words (stop words removed) = 5195
Top 10 most frequent words (in number, stop words removed): [87, 455, 4, 4412, 147, 231, 3181, 314, 1600, 250]
Top 10 most frequent words (stop words removed): ['good', 'great', 'movie', 'phone', 'film', 'one', 'food', 'like', 'place', 'time']


In [47]:
K = 1000
features = filtered_words_sorted_by_freq[:K]
print(f'Show first 10 feature: {features[:10]}')
features.sort()
print(f'Show first 10 feature after sorting: {features[:10]}')

Show first 10 feature: [87, 455, 4, 4412, 147, 231, 3181, 314, 1600, 250]
Show first 10 feature after sorting: [4, 8, 9, 11, 15, 17, 18, 20, 21, 22]


In [48]:
D_reduced = np.array(D)[:, features]
print(f'Number of rows (after feature selection) = {len(D_reduced)}')
print(f'Number of columns (after feature selection) = {len(D_reduced[0])}')
print(f'First 10 columns sums of D_reduced: {np.sum(np.array(D_reduced), axis=0)[:10]}')
print(f'First 10 entries of the first row of D_reduced: {D_reduced[0][:10]}')

Number of rows (after feature selection) = 3000
Number of columns (after feature selection) = 1000
First 10 columns sums of D_reduced: [177   4  14  16   9   4  35   5   4   9]
First 10 entries of the first row of D_reduced: [1 1 1 0 0 0 0 0 0 0]


In [49]:
labels = [pair[1] for pair in data]
print(f'Size of labels = {len(labels)}')
print(f'First 10 labels: {labels[:10]}')

Size of labels = 3000
First 10 labels: [0, 0, 0, 0, 1, 0, 0, 1, 0, 1]


In [50]:
import math

# 70% - 10% - 20% split
p = math.floor(0.7 * len(D))
q = math.floor(0.1 * len(D))
x_train = D[:p] 
y_train = labels[:p]
x_val = D[p: p + q]
y_val = labels[p: p + q]
x_test = D[p + q:]
y_test = labels[p + q:]
print(f'Size of x_train = {len(x_train)}')
print(f'Size of y_train = {len(y_train)}')
print(f'Size of x_val = {len(x_val)}')
print(f'Size of y_val = {len(y_val)}')
print(f'Size of x_test = {len(x_test)}')
print(f'Size of y_test = {len(y_test)}')

Size of x_train = 2100
Size of y_train = 2100
Size of x_val = 300
Size of y_val = 300
Size of x_test = 600
Size of y_test = 600


In [52]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

tree = DecisionTreeClassifier()
tree = tree.fit(x_train, y_train)
y_pred = tree.predict(x_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'[Classifier 1: Decision Tree Classifier] Accuracy on validation dataset = {round(accuracy * 100, 2)}%')

[Classifier 1: Decision Tree Classifier] Accuracy on validation dataset = 73.67%


In [None]:
y_pred = tree.predict(x_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'[Classifier 1: Decision Tree Classifier] Accuracy on validation dataset = {round(accuracy * 100, 2)}%')