# Training a perceptron classifier on movie reviews

## Testing `Perceptron()` using generic data

In [68]:
import numpy as np
import math
from sklearn.datasets import load_digits
from sklearn.linear_model import Perceptron

In [69]:
# Smoke-test sklearn's Perceptron on the bundled digits dataset before
# applying it to the movie reviews.
X, y = load_digits(return_X_y=True)

# Fixed random_state so the reported score is reproducible across runs.
clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X, y)

# Accuracy on the same data the model was fitted on (no held-out split here).
clf.score(X, y)

0.9393433500278241

In [70]:
# Dimensions of the feature matrix and of the label vector.
print(X.shape, y.shape, sep="\n")

# Peek at the first 20 samples and their labels.
print(X[:20], y[:20], sep="\n")

(1797, 64)
(1797,)
[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  0.  0.  0.]
 [ 0.  0. 10. ...  0.  0.  0.]
 [ 0.  0.  6. ... 13. 11.  1.]]
[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]


In [71]:
# Check which container type holds the feature matrix.
type(X)

numpy.ndarray

## Movie reviews

### Reading word vector files

In [72]:
# test Perceptron classifier on word2vec embeddings

# get document representations
# results to be compared with retrofitted vectors

# code: faruqui et al. (2015)

# filename: txt file
def read_word_vecs(filename):
    """Read word vectors from a text file and scale each one to unit length.

    Every non-blank line is expected to contain a word followed by its vector
    components, all separated by whitespace. Lines are lower-cased before
    parsing, so if two entries differ only by case the later one replaces the
    earlier one.

    Args:
        filename: path to a UTF-8 encoded text file of word vectors.

    Returns:
        dict mapping each (lower-cased) word to a 1-D float numpy array that
        has been divided by sqrt(sum(v**2) + 1e-6).

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    word_vectors = {}
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # Split once per line instead of once per use.
            fields = line.strip().lower().split()

            # Blank or whitespace-only lines (e.g. a trailing newline at the
            # end of the file) carry no entry; indexing them would raise
            # IndexError.
            if not fields:
                continue

            word = fields[0]
            vector = np.array([float(value) for value in fields[1:]], dtype=float)

            # Normalize: the result points in the same direction as the
            # original but has length ~1. The 1e-6 term keeps an all-zero
            # vector from causing a division by zero.
            vector /= math.sqrt((vector ** 2).sum() + 1e-6)
            word_vectors[word] = vector

    return word_vectors

In [83]:
# for word, vec in read_word_vecs("embeddings/sample_vec.txt").items():
#     print(f"word: {word}")
#     print(f"vector: {vec}\n")

word: biennials
vector: [-0.14307006  0.10847004 -0.03237701  0.09157304 -0.02972101  0.11704005
 -0.15579006  0.24033009 -0.08583103 -0.03929702  0.02585101 -0.01404701
  0.01675401  0.0091426  -0.20261008 -0.03194601 -0.12119005  0.08161403
  0.0080966   0.15618006 -0.32690013  0.17114007 -0.17094007 -0.19134008
  0.17856007 -0.22448009  0.22710009 -0.10867004  0.02586801  0.19715008
  0.21170008 -0.22244009  0.05631902 -0.09775404  0.14187006  0.08553403
 -0.03854802  0.23494009  0.13518005 -0.12680005 -0.13909005  0.27896011
  0.0025857  -0.17491007 -0.06347103  0.02757701  0.13556005 -0.0094128 ]

word: verplank
vector: [ 0.0619661   0.01386702  0.09789416  0.06929111  0.02512804 -0.02951505
 -0.00687561 -0.21864036 -0.02560104 -0.04749608 -0.09176715  0.08106213
  0.11705019  0.02564304 -0.05142108 -0.07437812  0.01258602 -0.02691504
  0.19103031 -0.15763026  0.05144808  0.13455022 -0.04002307  0.20770034
  0.01720403  0.22573037  0.08322814 -0.00795351 -0.11535019  0.17306028
  


word: mickle
vector: [ 1.21340182e-01  1.49090224e-02  1.62250244e-01  7.76041166e-02
 -7.16431076e-02  1.80420271e-02  1.07470161e-01 -4.62120694e-02
 -1.29300194e-01  5.12580770e-03  3.86330580e-03 -7.88091184e-02
  1.39170209e-01 -1.27480192e-01 -6.31420949e-02  8.58431290e-04
 -5.36420806e-03 -3.27840493e-02  4.65350699e-03  7.52371130e-02
 -1.13540171e-01 -2.43380366e-03  1.82250274e-01  1.08380163e-01
 -9.39541411e-02  3.04340457e-02  1.48350223e-01  1.35680204e-01
  2.49360375e-04  5.41530814e-02  1.47940222e-01  3.59090539e-02
 -1.20800181e-01 -5.22190784e-02  2.26530340e-01 -5.91200888e-02
  9.10541368e-02 -1.60840242e-01  2.20150331e-01  2.52460379e-01
 -1.56760235e-01 -2.12020319e-01 -3.73370561e-01  6.38270959e-02
  3.04520457e-01 -2.76510415e-01  3.45030518e-01 -1.01610153e-01]

word: kremlin
vector: [ 0.10872005 -0.26029011 -0.09297504 -0.06949603  0.11301005  0.10378004
  0.13769006  0.09068904 -0.17674007 -0.03874502 -0.20529009  0.09388604
 -0.01254201  0.21960009  0.


word: refinancings
vector: [-1.85409709e-01  5.09289200e-02  8.88818604e-02 -1.04449836e-01
 -3.32689477e-04  4.98559217e-02  1.53259759e-01 -1.90299701e-01
  1.20859810e-02 -3.18189500e-02 -2.50849606e-01  9.44338516e-02
 -1.23519806e-01  1.50379764e-01 -1.66009739e-01  9.41648521e-02
 -5.60569119e-01 -9.80248460e-02 -1.01839840e-01 -3.74409412e-01
  1.46979769e-01  2.57809595e-02  1.12749823e-02  2.83069555e-02
 -1.11039826e-01 -1.68439735e-01  1.19229813e-01 -1.39019782e-02
  1.98099689e-01 -1.41909777e-01  1.17269816e-01 -7.11538882e-02
  1.28669798e-01  1.44189773e-01 -8.62528645e-02 -5.87049078e-02
 -7.78418777e-02  5.25539174e-02  7.58048809e-02  6.42908990e-02
 -5.37749155e-02  2.60119591e-02  1.77059722e-01 -9.40388523e-03
  3.80949402e-02  1.84329710e-02 -7.95838750e-04  9.74648469e-03]

word: bailed-out
vector: [-0.07111997  0.01253799  0.06506997 -0.11407994 -0.03818198  0.07563796
  0.14010993 -0.06998997  0.10157995 -0.0032188  -0.22455989  0.09816895
 -0.11759994  0.084


word: instability
vector: [-0.16896944 -0.11314963 -0.14544952 -0.01155596  0.07186876 -0.24660919
  0.17591942 -0.04958784 -0.2436892  -0.11206963 -0.18846938  0.1531095
  0.01548495 -0.01587795  0.05047683 -0.34739886 -0.18492939 -0.1831694
 -0.00685218  0.24642919 -0.07910374  0.20925931 -0.14625952 -0.12422959
  0.02382292 -0.11983961 -0.15653948 -0.10256966  0.21474929  0.08234673
 -0.00307049 -0.05788181 -0.01101796  0.23914921 -0.13973954 -0.1508895
 -0.0310599  -0.05819281 -0.04503685 -0.05099483 -0.00299579  0.30866898
  0.01175196  0.04640885  0.18885938 -0.09341769 -0.05292383 -0.0914657 ]

word: personalizing
vector: [-0.21289073  0.05157918  0.16255056 -0.05505419  0.16552057 -0.13357046
 -0.04709216  0.14289049  0.01310104  0.23078079  0.08420429 -0.02192208
 -0.07537826  0.03338411 -0.13230045  0.13469046  0.09534933  0.04589116
  0.2347308   0.24205083  0.03508112  0.2336908   0.14398049 -0.10544036
 -0.14953051  0.00626072  0.06749423 -0.14105048  0.09915334  0.038250

### Converting movie reviews to averaged vectors

In [74]:
# extract reviews from files, convert to review vectors to be stored in numpy matrix (nb review, 250)

def reviews_to_vecs(word_vectors, filename, vec_size):
    """Convert labelled reviews in a text file to averaged word-vector rows.

    Every non-blank line is expected to hold a gold label followed by the
    whitespace-tokenized review. The line is lower-cased, the vectors of all
    tokens present in ``word_vectors`` are summed, and the sum is divided by
    the total number of tokens in the review (tokens without a vector still
    count towards that total).

    Args:
        word_vectors: mapping from word to a 1-D numpy array of length
            ``vec_size``.
        filename: path to a UTF-8 encoded text file, one review per line.
        vec_size: dimensionality of the word vectors.

    Returns:
        Tuple ``(X, Y)``: ``X`` is a float array with one row per review and
        ``vec_size`` columns, ``Y`` is an array of the label tokens (strings).
        A review with no tokens after its label yields an all-zero row.
    """
    review_vectors = []
    labels = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.strip().lower().split()

            # Blank lines carry neither a label nor a review; indexing them
            # would raise IndexError.
            if not tokens:
                continue

            # gold label
            labels.append(tokens[0])

            # normalized/tokenized movie review
            review = tokens[1:]

            # start from a zero vector of the given size
            review_vec = np.zeros(vec_size, dtype=float)
            for word in review:
                if word in word_vectors:
                    review_vec += word_vectors[word]

            # Average by the number of words in the review. A label-only line
            # has no words: keep the zero vector rather than computing 0/0,
            # which would put NaNs into the feature matrix.
            if review:
                review_vec /= len(review)

            review_vectors.append(review_vec)

    # convert to numpy arrays
    X = np.array(review_vectors)
    Y = np.array(labels)

    return X, Y

In [75]:
# Original (non-retrofitted) embeddings, keyed by lower-cased word.
word_vectors = read_word_vecs("embeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean")

# Training split: one averaged vector and one gold label per review.
X_train, Y_train = reviews_to_vecs(word_vectors, 'stanford_raw_train.txt', vec_size=250)

# Preview the first five rows of each array, followed by its shape.
print(X_train[:5], f"Shape: {X_train.shape}\n", sep="\n")
print(Y_train[:5], f"Shape: {Y_train.shape}", sep="\n")

[[ 0.0389759  -0.01411923  0.00649187 ...  0.02806024  0.00548851
   0.03144906]
 [ 0.06330878  0.01469497  0.01734737 ...  0.02479514  0.02445976
   0.03876584]
 [ 0.05804808  0.00525672  0.01786497 ...  0.03130196  0.01622859
   0.04347507]
 [ 0.04500403  0.02146169  0.01359255 ...  0.04601991  0.04227773
   0.07036946]
 [ 0.04966516  0.00886954  0.01860892 ...  0.02242931  0.01846928
   0.05119996]]
Shape: (6920, 250)

['1' '1' '1' '1' '1']
Shape: (6920,)


In [76]:
# Test split: built with the same embeddings and vector size as the training split.
X_test, Y_test = reviews_to_vecs(word_vectors, 'stanford_raw_test.txt', vec_size=250)

# Preview the first five rows of each array, followed by its shape.
print(X_test[:5], f"Shape: {X_test.shape}\n", sep="\n")
print(Y_test[:5], f"Shape: {Y_test.shape}", sep="\n")

[[ 0.08391141  0.00493503  0.0007743  ...  0.02950124  0.00507889
   0.06420888]
 [ 0.05613793  0.02151759  0.00791124 ...  0.02658185  0.00703825
   0.05526056]
 [ 0.06949902  0.05778605 -0.00777598 ...  0.01285676  0.0061893
   0.06091437]
 [ 0.05763503 -0.00330982  0.00640642 ...  0.03454663  0.02482536
   0.07949061]
 [ 0.07521692  0.01980447  0.01480977 ...  0.01391576  0.02573809
   0.06107445]]
Shape: (1821, 250)

['1' '1' '1' '1' '1']
Shape: (1821,)


## Training a `sklearn` Perceptron

In [77]:
# using pretrained vectors
# Fixed tolerance and seed so the score is reproducible and comparable with
# the retrofitted run, which uses the same settings.
clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train, Y_train)

# Accuracy on the held-out test reviews.
print(f"Perceptron accuracy (original embeddings): {clf.score(X_test, Y_test): .2%}")

Perceptron accuracy (original embeddings):  76.94%


In [84]:
# using retrofitted vectors
# Rebuild both splits from the same review files, swapping only the embeddings.
retrofitted_word_vectors = read_word_vecs("embeddings/out_faruqui_250.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'stanford_raw_train.txt', vec_size=250)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'stanford_raw_test.txt', vec_size=250)

# Same tolerance and seed as the run on the original embeddings, so the two
# accuracies are directly comparable.
clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

# Accuracy on the held-out test reviews.
print(f"Perceptron accuracy (retrofitted embeddings): {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")

Perceptron accuracy (retrofitted embeddings):  79.08%


## Remarks
- With the retrofitted embeddings, test accuracy rises by about 2 percentage points (76.94% → 79.08%), a gain comparable to the best results reported by Faruqui et al.
- These word vectors were retrofitted using the PPDB lexicon, which seems to improve results more than the other lexicons on this task
- We should see similar results with our own retrofitted vectors