In [17]:
# Create input reader function
def input_reader(filename) -> tuple:
    X_data = []
    Y_data =[]
    # Open file system
    with open(filename, 'r') as f:
        for line in f:
            label, text = line.strip().split(sep=' ||| ')
            X_data.append(text)
            Y_data.append(int(label))    
    return (X_data, Y_data)    
        

In [18]:
# Reads in training and testing dataset
X_train, Y_train = input_reader('../data/sst-sentiment-text-threeclass/train.txt')
X_test, Y_test = input_reader('../data/sst-sentiment-text-threeclass/dev.txt')

In [19]:
# tokenize each entry
def tokenize(datum):
    # split string into words
    return datum.split(' ')

def build_feature_map(X) -> dict:
    '''
        check for emptiness; return if empty
        setup a set object for distinct words
        loop through the corpus and this takes O(n^2) - quadratic time
        return a dictionary where each word is indexed
    '''
    if len(X) == 0:
        return
    unique_words = set()
    for datum in X:
        for word in tokenize(datum):
            unique_words.add(word)        
    return {word: idx for idx, word in enumerate(unique_words)}



In [21]:
from scipy.sparse import dok_matrix

def extract_features(word_to_idx, X):
    features = dok_matrix((len(X), len(word_to_idx)))
    for i in range(len(X)):
        for word in tokenize(X[i]):
            if word in word_to_idx:
                # increase the word count if it is present in the map
                # unknown words are discarded because we would not have
                # a learned weight for them anyway.
                features[i, word_to_idx[word]] += 1

    return features

In [22]:
sample_data = [
    "When is the homework due ? The homework is hard",
    "When are the TAs' office hours ?",
    "How hard is the homework ?",
]

word_to_idx = build_feature_map(sample_data)
print(word_to_idx)
print()

features = extract_features(word_to_idx, sample_data)
print(features)

{'homework': 0, 'hours': 1, '?': 2, 'How': 3, 'hard': 4, 'are': 5, "TAs'": 6, 'office': 7, 'The': 8, 'due': 9, 'is': 10, 'When': 11, 'the': 12}

  (0, 11)	1.0
  (0, 10)	2.0
  (0, 12)	1.0
  (0, 0)	2.0
  (0, 9)	1.0
  (0, 2)	1.0
  (0, 8)	1.0
  (0, 4)	1.0
  (1, 11)	1.0
  (1, 5)	1.0
  (1, 12)	1.0
  (1, 6)	1.0
  (1, 7)	1.0
  (1, 1)	1.0
  (1, 2)	1.0
  (2, 3)	1.0
  (2, 4)	1.0
  (2, 10)	1.0
  (2, 12)	1.0
  (2, 0)	1.0
  (2, 2)	1.0


Now let's run the feature extractor on the actual data

In [23]:
# Build the map based on the training data
word_to_idx = build_feature_map(X_train)

print(f"Unique word types in X_train: {len(word_to_idx)}")
print("Sample words: ")
print(list(word_to_idx.keys())[:5])

Unique word types in X_train: 18280
Sample words: 
['glitzy', 'loose-jointed', 'refused', 'sci-fi', 'basis']


In [25]:
# convert our strings into count vectors
X_train_vec = extract_features(word_to_idx, X_train)
X_test_vec = extract_features(word_to_idx, X_test)

from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(tol=1e1)
print(len(X_train_vec), len(Y_train))
classifier.fit(X_train_vec, Y_train)

# create a truncated version of the training set so we have a second model to compare to
X_train_vec_truncate = extract_features(word_to_idx, [x[:100] for x in X_train])
classifier_truncate = LogisticRegression(tol=1e1)
classifier_truncate.fit(X_train_vec_truncate, Y_train)

151450 8544


10.0