In [24]:
from nltk import FreqDist
import numpy as np

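For exp1, we treat each line as a sequence of characters (digits, together with the commas and minus signs between them) rather than as a sequence of numbers. Let's take a look at how long these character sequences are.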

In [25]:
# Read the quoted sequence field from every data row of both CSV files.
# `[1:]` skips the header row; `split('"')[1]` keeps the text between the
# first pair of double quotes on each line.
# The files are opened in `with` blocks so the handles are closed promptly
# instead of being left for the garbage collector.
with open('data/train.csv', 'r') as train_file:
    train_lines = [line.split('"')[1] for line in train_file.read().splitlines()[1:]]
with open('data/test.csv', 'r') as test_file:
    test_lines = [line.split('"')[1] for line in test_file.read().splitlines()[1:]]

# Length statistics below are computed over train and test together.
lines = train_lines + test_lines

In [28]:
# Length of every sequence, measured in characters of the raw string
# (separators such as commas are counted too, not only digits).
seqlens = np.array([len(line) for line in lines])

print("The longest sequence is {} in length.".format(np.max(seqlens)))
print("The shortest sequence is {} in length.".format(np.min(seqlens)))
print("The average length of sequences is {}".format(np.mean(seqlens)))

# Share of sequences strictly longer than each cutoff, from largest to
# smallest cutoff. One loop replaces four copy-pasted print statements.
for threshold in (400, 300, 200, 100):
    proportion = np.count_nonzero(seqlens > threshold) / len(seqlens)
    print("The proportion of sequences over {} in length is {}".format(threshold, proportion))

The longest sequence is 1281 in length.
The shortest sequence is 1 in length.
The average length of sequences is 173.7825727963459
The proportion of sequences over 400 in length is 0.0010408889279283236
The proportion of sequences over 300 in length is 0.005243972067284466
The proportion of sequences over 200 in length is 0.36055162721243794
The proportion of sequences over 100 in length is 0.866739865606746


Almost all sequences (about 99.5%) are at most 300 characters long, so we can exclude sequences longer than 300 characters from training.

For exp2, we see each line as a sequence of numbers.

In [27]:
# Vectorize
# Re-read both CSV files so this cell does not depend on earlier cells.
# `[1:]` skips the header row; `split('"')[1]` keeps the text between the
# first pair of double quotes on each line. `with` closes each file handle.
with open('data/train.csv', 'r') as train_file:
    train_lines = [line.split('"')[1] for line in train_file.read().splitlines()[1:]]
with open('data/test.csv', 'r') as test_file:
    test_lines = [line.split('"')[1] for line in test_file.read().splitlines()[1:]]

maxlen = 0  # largest count of comma-separated numbers found in one sequence
nums_list = []  # every number token (kept as a string) from every sequence
for line in train_lines + test_lines:
    nums = line.split(",")
    nums_list.extend(nums)
    maxlen = max(maxlen, len(nums))

print("The longest sequence is {} in length".format(maxlen))

# Frequency of each distinct number token across train and test combined.
fdist = FreqDist(nums_list)
print(fdist.most_common(1000))

The longest sequence is 348 in length
[('1', 797831), ('0', 586087), ('2', 438844), ('3', 324774), ('4', 278593), ('5', 232865), ('6', 213603), ('7', 190517), ('8', 183508), ('9', 165411), ('10', 73286), ('11', 67390), ('12', 66747), ('13', 56308), ('16', 54613), ('15', 48195), ('14', 45513), ('-1', 45045), ('17', 42263), ('18', 39328), ('19', 38540), ('20', 38378), ('24', 37343), ('21', 36864), ('23', 32330), ('22', 30358), ('25', 30211), ('30', 30021), ('36', 29108), ('32', 28670), ('31', 28588), ('28', 28434), ('29', 27857), ('27', 27158), ('26', 25598), ('37', 24112), ('35', 22620), ('40', 22352), ('41', 22128), ('42', 22019), ('33', 21935), ('34', 21690), ('48', 21226), ('43', 20609), ('64', 20182), ('45', 19488), ('60', 19222), ('47', 18993), ('56', 18663), ('49', 18183), ('44', 18032), ('39', 17986), ('53', 17830), ('38', 17821), ('55', 17714), ('50', 16891), ('61', 16868), ('46', 16575), ('54', 16548), ('59', 16110), ('52', 15890), ('63', 15148), ('51', 15128), ('72', 15092), (

The 1000 most frequent numbers all occur often enough to be useful. Accordingly, we will set the vocabulary size to 1000. Instead of selecting the 1000 most frequent numbers, we simply use the integers in [0, 1000) as the vocabulary.