# Training a perceptron classifier on movie reviews

## Testing `Perceptron()` using generic data

In [68]:
import numpy as np
import math
from sklearn.datasets import load_digits
from sklearn.linear_model import Perceptron

In [69]:
# Sanity check: fit a Perceptron on the bundled digits dataset and report
# its accuracy on the same data it was trained on.
X, y = load_digits(return_X_y=True)

# random_state pins the shuffling so the score below is reproducible.
clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X, y)

clf.score(X, y)

0.9393433500278241

In [70]:
# Dimensions of the feature matrix and of the label vector, one per line.
print(X.shape, y.shape, sep="\n")

# Leading 20 rows of features, then the matching 20 labels.
print(X[:20], y[:20], sep="\n")

(1797, 64)
(1797,)
[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  0.  0.  0.]
 [ 0.  0. 10. ...  0.  0.  0.]
 [ 0.  0.  6. ... 13. 11.  1.]]
[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]


In [71]:
# Check which container type holds the features returned by load_digits.
type(X)

numpy.ndarray

## Movie reviews

### Reading word vector files

In [72]:
# test Perceptron classifier on word2vec embeddings

# get document representations
# results to be compared with retrofitted vectors

# code: faruqui et al. (2015)

# filename: txt file
def read_word_vecs(filename):
    """Read word vectors from a text file and normalize each to unit length.

    Every non-blank line is parsed as a word followed by its
    whitespace-separated vector components. The whole line is lowercased
    before parsing, so words that differ only in case share one key and the
    last occurrence in the file wins. Blank lines are skipped.

    Args:
        filename: path to a UTF-8 encoded text file of word vectors.

    Returns:
        dict mapping each word (str) to a 1-D float numpy array whose
        Euclidean length is approximately 1.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    word_vectors = {}
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # Tokenize once. A blank line yields no tokens and is skipped
            # rather than failing on the word lookup below.
            tokens = line.strip().lower().split()
            if not tokens:
                continue

            word, values = tokens[0], tokens[1:]
            vector = np.array([float(val) for val in values], dtype=float)

            # A normalized vector points in the same direction as the
            # original but has length 1. The 1e-6 term keeps the division
            # defined when every component is zero.
            vector /= math.sqrt((vector ** 2).sum() + 1e-6)
            word_vectors[word] = vector

    return word_vectors

In [85]:
# for word, vec in read_word_vecs("embeddings/sample_vec.txt").items():
#     print(f"word: {word}")
#     print(f"vector: {vec}\n")

### Converting movie reviews to averaged vectors

In [74]:
# extract reviews from files, convert to review vectors to be stored in numpy matrix (nb review, 250)

def reviews_to_vecs(word_vectors, filename, vec_size):
    """Turn labeled reviews from a file into averaged word-vector features.

    Every non-blank line is parsed as a gold label followed by the
    whitespace-tokenized review text; the line is lowercased first. A
    review's feature vector is the sum of the vectors of its in-vocabulary
    words divided by the total number of tokens in the review (tokens
    missing from word_vectors still count toward the denominator). Blank
    lines are skipped, and a line holding only a label produces an all-zero
    vector.

    Args:
        word_vectors: dict mapping a word to its numpy vector; the vectors
            are assumed to have length vec_size.
        filename: path to a UTF-8 encoded text file, one review per line.
        vec_size: dimensionality of the review vectors.

    Returns:
        (X, Y) where X is a numpy array with one averaged vector per review
        and Y is a numpy array of the labels, kept as strings.
    """
    review_vectors = []
    labels = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # Tokenize once. A blank line has neither label nor text, so
            # skip it instead of failing on the label lookup.
            tokens = line.strip().lower().split()
            if not tokens:
                continue

            # First token is the gold label, the rest is the review text.
            label, review = tokens[0], tokens[1:]
            labels.append(label)

            # Sum the vectors of all words found in the vocabulary.
            review_vec = np.zeros(vec_size, dtype=float)
            for word in review:
                if word in word_vectors:
                    review_vec += word_vectors[word]

            # Average over the number of tokens in the review. A label-only
            # line has no tokens: keep the zero vector rather than dividing
            # by zero, which would put NaNs into the feature matrix.
            if review:
                review_vec /= len(review)

            review_vectors.append(review_vec)

    # Convert to numpy arrays.
    X = np.array(review_vectors)
    Y = np.array(labels)

    return X, Y

In [75]:
# Load the original (non-retrofitted) embeddings.
word_vectors = read_word_vecs(
    "embeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean"
)

# Build the training features and gold labels from the raw review file.
X_train, Y_train = reviews_to_vecs(
    word_vectors,
    'stanford_raw_train.txt',
    vec_size=250,
)

# Preview the first five rows of each array together with its shape.
print(X_train[:5], f"Shape: {X_train.shape}\n", sep="\n")
print(Y_train[:5], f"Shape: {Y_train.shape}", sep="\n")

[[ 0.0389759  -0.01411923  0.00649187 ...  0.02806024  0.00548851
   0.03144906]
 [ 0.06330878  0.01469497  0.01734737 ...  0.02479514  0.02445976
   0.03876584]
 [ 0.05804808  0.00525672  0.01786497 ...  0.03130196  0.01622859
   0.04347507]
 [ 0.04500403  0.02146169  0.01359255 ...  0.04601991  0.04227773
   0.07036946]
 [ 0.04966516  0.00886954  0.01860892 ...  0.02242931  0.01846928
   0.05119996]]
Shape: (6920, 250)

['1' '1' '1' '1' '1']
Shape: (6920,)


In [76]:
# Build the test features and gold labels with the same embeddings.
X_test, Y_test = reviews_to_vecs(
    word_vectors,
    'stanford_raw_test.txt',
    vec_size=250,
)

# Preview the first five rows of each array together with its shape.
print(X_test[:5], f"Shape: {X_test.shape}\n", sep="\n")
print(Y_test[:5], f"Shape: {Y_test.shape}", sep="\n")

[[ 0.08391141  0.00493503  0.0007743  ...  0.02950124  0.00507889
   0.06420888]
 [ 0.05613793  0.02151759  0.00791124 ...  0.02658185  0.00703825
   0.05526056]
 [ 0.06949902  0.05778605 -0.00777598 ...  0.01285676  0.0061893
   0.06091437]
 [ 0.05763503 -0.00330982  0.00640642 ...  0.03454663  0.02482536
   0.07949061]
 [ 0.07521692  0.01980447  0.01480977 ...  0.01391576  0.02573809
   0.06107445]]
Shape: (1821, 250)

['1' '1' '1' '1' '1']
Shape: (1821,)


## Training a `sklearn` Perceptron

In [77]:
# Using pretrained vectors: fit on the training reviews, score on the test reviews.
# random_state pins the shuffling so the reported accuracy is reproducible.
clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train, Y_train)

print(f"Perceptron accuracy (original embeddings): {clf.score(X_test, Y_test): .2%}")

Perceptron accuracy (original embeddings):  76.94%


In [84]:
# Using retrofitted vectors: rebuild the train/test features from the
# retrofitted embeddings, then fit and score exactly as for the original ones.
retrofitted_word_vectors = read_word_vecs("embeddings/out_faruqui_250.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'stanford_raw_train.txt', vec_size=250)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'stanford_raw_test.txt', vec_size=250)

# Same hyperparameters and seed as the original-embeddings run, so the two
# accuracies are directly comparable.
clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy (retrofitted embeddings): {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")

Perceptron accuracy (retrofitted embeddings):  79.08%


## Remarks
- We see a ~2% increase in accuracy (76.94% with the original embeddings → 79.08% with the retrofitted ones) compared to Faruqui et al.'s best results
- These word vectors were retrofitted using the PPDB lexicon, which seems to improve results more than the other lexicons in this task
- We should see similar results with our own retrofitted vectors