# Training a perceptron classifier on movie reviews

In [1]:
import numpy as np
import math
import gzip
from sklearn.datasets import load_digits
from sklearn.linear_model import Perceptron

## Movie reviews

### Reading word vector files

In [2]:
# test Perceptron classifier on word2vec embeddings

# get document representations
# results to be compared with retrofitted vectors


# filename: txt file
''' Read and normalize the embeddings '''
def read_embeddings(filename):
    print("\nReading embeddings...")
    # keys: words (string)
    # values: normalized vectors (NumPy array)
    embeds = {} 
  
    # using encoding='utf-8' to avoid UnicodeEncodeError on some systems
    with (gzip.open(filename, 'rt', encoding='utf-8') if filename.endswith('.gz') else open(filename, 'r', encoding='utf-8')) as file:
        for line in file:
            elements = line.strip().split()
            word = elements[0]
            vector = np.array([float(value) for value in elements[1:]], dtype=float)
        
            # normalize vector (Euclidean norm)
            norm = np.linalg.norm(vector)
            embeds[word] = vector / norm  
  
    return embeds

In [47]:
# for word, vec in read_word_vecs("embeddings/sample_vec.txt").items():
#     print(f"word: {word}")
#     print(f"vector: {vec}\n")

### Converting movie reviews to averaged vectors

In [3]:
# extract reviews from files, convert to review vectors to be stored in numpy matrix (nb review, 250)

# word_vectors: dict; filename: txt file; vec_size: int
def reviews_to_vecs(word_vectors, filename, vec_size, avg=False):
    """extract review texts from a file and convert them to averaged word2vec embeddings"""
    
    review_vectors = []
    Y = []
    with open(filename, 'r', encoding='utf-8') as file:      
        for line in file:
            
            # initialize empty review vec of given size
            review_vec = np.zeros(vec_size, dtype=float)
            
            line = line.lower().strip().split()
            
            # normalized/tokenized moview review, gold label
            review, y = line[1:], line[0]
            Y.append(y)
            
            # normalized/tokenized moview review
            #review = line.split()[1:]
            
            for word in review:
                if word in word_vectors:
                    review_vec += word_vectors[word]
            
            if avg:
                # get average of word vectors by dividing sum by nb of words in review
                review_vec /= len(review)
            
            review_vectors.append(review_vec)
    
    # convert to numpy arrays
    X = np.array(review_vectors)
    Y = np.array(Y)
    
    return X, Y     

In [4]:
word_vectors = read_embeddings("embeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean.gz")

# getting train data
X_train, Y_train = reviews_to_vecs(word_vectors, 'datasets/stanford_raw_train.txt', vec_size=250, avg=True)

print(X_train[:5])
print(f"Shape: {X_train.shape}\n")

print(Y_train[:5])
print(f"Shape: {Y_train.shape}")


Reading embeddings...
Reading embeddings done!
[[ 0.0389759  -0.01411923  0.00649187 ...  0.02806024  0.00548851
   0.03144906]
 [ 0.06330879  0.01469497  0.01734737 ...  0.02479514  0.02445976
   0.03876584]
 [ 0.05804809  0.00525672  0.01786497 ...  0.03130196  0.01622859
   0.04347507]
 [ 0.04500404  0.02146169  0.01359256 ...  0.04601991  0.04227773
   0.07036946]
 [ 0.04966516  0.00886954  0.01860892 ...  0.02242932  0.01846929
   0.05119996]]
Shape: (6920, 250)

['1' '1' '1' '1' '1']
Shape: (6920,)


In [5]:
# getting test data
X_test, Y_test = reviews_to_vecs(word_vectors, 'datasets/stanford_raw_test.txt', vec_size=250, avg=True)

print(X_test[:5])
print(f"Shape: {X_test.shape}\n")

print(Y_test[:5])
print(f"Shape: {Y_test.shape}")

[[ 0.08391141  0.00493503  0.0007743  ...  0.02950124  0.00507889
   0.06420889]
 [ 0.05613794  0.02151759  0.00791124 ...  0.02658185  0.00703825
   0.05526056]
 [ 0.06949902  0.05778605 -0.00777598 ...  0.01285677  0.0061893
   0.06091437]
 [ 0.05763503 -0.00330982  0.00640642 ...  0.03454663  0.02482536
   0.07949062]
 [ 0.07521693  0.01980447  0.01480978 ...  0.01391576  0.02573809
   0.06107445]]
Shape: (1821, 250)

['1' '1' '1' '1' '1']
Shape: (1821,)


## Training a `sklearn` Perceptron

In [6]:
# using pretrained vectors (using average)

clf = Perceptron()
clf.fit(X_train, Y_train)

print(f"Perceptron accuracy (original averaged embeddings): {clf.score(X_test, Y_test): .2%}")

Perceptron accuracy (original averaged embeddings):  76.94%


In [7]:
# using pretrained vectors (using sum)
X_train, Y_train = reviews_to_vecs(word_vectors, 'datasets/stanford_raw_train.txt', vec_size=250, avg=False)
X_test, Y_test = reviews_to_vecs(word_vectors, 'datasets/stanford_raw_test.txt', vec_size=250, avg=False)

clf = Perceptron()
clf.fit(X_train, Y_train)

print(f"Perceptron accuracy (original summed embeddings): {clf.score(X_test, Y_test): .2%}")

Perceptron accuracy (original summed embeddings):  68.86%


## Using our retrofitted vectors from `shafiabadi-duignan-modified.py`

In [8]:
# using our vectors retrofitted with the ppdb (averaged)
retrofitted_word_vectors = read_embeddings("embeddings/out_retrofitted_ppdb_250.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'datasets/stanford_raw_train.txt', vec_size=250, avg=True)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'datasets/stanford_raw_test.txt', vec_size=250, avg=True)

clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy of retrofitted embeddings (averaged) with the PPDB: {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of retrofitted embeddings (averaged) with the PPDB:  79.90%


In [9]:
# using our vectors retrofitted with the ppdb (summed)
retrofitted_word_vectors = read_embeddings("embeddings/out_retrofitted_ppdb_250.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'datasets/stanford_raw_train.txt', vec_size=250, avg=False)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'datasets/stanford_raw_test.txt', vec_size=250, avg=False)

clf = Perceptron()

clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy of retrofitted embeddings (summed) with the PPDB: {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of retrofitted embeddings (summed) with the PPDB:  74.90%


In [10]:
# using our vectors retrofitted with wn synonyms (averaged)
retrofitted_word_vectors = read_embeddings("embeddings/out_retrofitted_wn_syn_250.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'datasets/stanford_raw_train.txt', vec_size=250, avg=True)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'datasets/stanford_raw_test.txt', vec_size=250, avg=True)


clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy of retrofitted embeddings (averaged) with WN synonyms: {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of retrofitted embeddings (averaged) with WN synonyms:  79.90%


In [12]:
# using our vectors retrofitted with wn synonyms (summed)
retrofitted_word_vectors = read_embeddings("embeddings/out_retrofitted_wn_syn_250.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'datasets/stanford_raw_train.txt', vec_size=250, avg=False)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'datasets/stanford_raw_test.txt', vec_size=250, avg=False)


clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy of retrofitted embeddings (summed) with WN synonyms: {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of retrofitted embeddings (summed) with WN synonyms:  78.03%


In [13]:
# using our vectors retrofitted with wn synonyms, hypernyms and hyponyms (averaged)
retrofitted_word_vectors = read_embeddings("embeddings/out_retrofitted_wn_all_250.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'datasets/stanford_raw_train.txt', vec_size=250, avg=True)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'datasets/stanford_raw_test.txt', vec_size=250, avg=True)


clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy of retrofitted embeddings (averaged) with WN (all relations): {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of retrofitted embeddings (averaged) with WN (all relations):  76.61%


In [14]:
# using our vectors retrofitted with wn synonyms, hypernyms and hyponyms (summed)
retrofitted_word_vectors = read_embeddings("embeddings/out_retrofitted_wn_all_250.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'datasets/stanford_raw_train.txt', vec_size=250, avg=False)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors, 'datasets/stanford_raw_test.txt', vec_size=250, avg=False)

clf = Perceptron()

clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy of retrofitted embeddings (summed) with WN (all relations): {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of retrofitted embeddings (summed) with WN (all relations):  77.27%


## Testing French word embeddings

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# tweets
french_data = pd.read_csv('datasets/french_tweets.csv')

In [60]:
X, y = french_data.iloc[:, 1], french_data.iloc[:, 0]
y[1526719]  
X[1526719]

"Oui, cela fonctionne mieux que de l'attendre à la fin, je me demande si j'ai le temps de suivre un bon blog."

In [61]:
import re


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    
# write data to file
def write_reviews(X, y, path):
    with open(path, 'w', encoding='utf-8') as file:
        for label, review in (zip(y, X)):
            # padding punct with whitespaces
            line = f"{label} {review}\n"
            line = re.sub('([:;\".,!?()])', r' \1 ', line)
            line = re.sub('\s{2,}', ' ', line)
            
            # add white space after apostrophe
            line = re.sub('([A-z]\')', r'\1 ', line)
            file.write(line)
   
    print(f"Reviews successfully written to {path}")

path_train = 'datasets/french_tweets_train.txt'
path_test = 'datasets/french_tweets_test.txt'

write_reviews(X_train, y_train, path_train)
write_reviews(X_test, y_test, path_test)

In [62]:
# french movie reviews
# french_movie_reviews_train = pd.read_csv('datasets/french_movie_reviews_train.csv')
# french_movie_reviews_test = pd.read_csv('datasets/french_movie_reviews_test.csv')

In [63]:
# french_movie_reviews_train

# X_train, y_train = french_movie_reviews_train.iloc[:, 2], french_movie_reviews_train.iloc[:, 3]
# X_test, y_test = french_movie_reviews_test.iloc[:, 2], french_movie_reviews_test.iloc[:, 3]

# path_train = 'datasets/french_movie_reviews_train.txt'
# path_test = 'datasets/french_movie_reviews_test.txt'

# write_reviews(X_train, y_train, path_train)
# write_reviews(X_test, y_test, path_test)

In [64]:
# testing pretrained vectors
word_vectors_pretrain_fr = read_embeddings("embeddings/vecs100-linear-frwiki")
X_train_pretrain, Y_train_pretrain = reviews_to_vecs(word_vectors_pretrain_fr, path_train, vec_size=100, avg=True)
X_test_pretrain, Y_test_pretrain = reviews_to_vecs(word_vectors_pretrain_fr, path_test, vec_size=100, avg=True)

clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_pretrain, Y_train_pretrain)

print(f"Perceptron accuracy of French pretrained embeddings (averaged): {clf.score(X_test_pretrain, Y_test_pretrain): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of French pretrained embeddings (averaged):  53.47%


In [65]:
# testing pretrained vectors
word_vectors_pretrain_fr = read_embeddings("embeddings/vecs100-linear-frwiki")
X_train_pretrain, Y_train_pretrain = reviews_to_vecs(word_vectors_pretrain_fr, path_train, vec_size=100, avg=False)
X_test_pretrain, Y_test_pretrain = reviews_to_vecs(word_vectors_pretrain_fr, path_test, vec_size=100, avg=False)

clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_pretrain, Y_train_pretrain)

print(f"Perceptron accuracy of French pretrained embeddings (summed): {clf.score(X_test_pretrain, Y_test_pretrain): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of French pretrained embeddings (summed):  63.70%


In [66]:
# using our french vectors retrofitted with wn synonyms (averaged)
retrofitted_word_vectors_fr = read_embeddings("embeddings/out_retrofitted_fr_wn_syn_100.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors_fr, path_train, vec_size=100, avg=True)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors_fr, path_test, vec_size=100, avg=True)


clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy of retrofitted embeddings (averaged) with WN (syn): {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of retrofitted embeddings (averaged) with WN (syn):  61.50%


In [67]:
# using our french vectors retrofitted with wn synonyms (summed)
retrofitted_word_vectors_fr = read_embeddings("embeddings/out_retrofitted_fr_wn_syn_100.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors_fr, path_train, vec_size=100, avg=False)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors_fr, path_test, vec_size=100, avg=False)


clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy of retrofitted embeddings (summed) with WN (syn): {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of retrofitted embeddings (summed) with WN (syn):  56.96%


In [68]:
# using our french vectors retrofitted with wn synonyms, hyponyms and hypernyms (averaged)
retrofitted_word_vectors_fr = read_embeddings("embeddings/out_retrofitted_fr_wn_all_100.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors_fr, path_train, vec_size=100, avg=True)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors_fr, path_test, vec_size=100, avg=True)


clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy of retrofitted embeddings (averaged) with WN (all): {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of retrofitted embeddings (averaged) with WN (all):  61.40%


In [69]:
# using our french vectors retrofitted with wn synonyms, hyponyms and hypernyms (summed)
retrofitted_word_vectors_fr = read_embeddings("embeddings/out_retrofitted_fr_wn_all_100.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors_fr, path_train, vec_size=100, avg=False)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors_fr, path_test, vec_size=100, avg=False)


clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy of retrofitted embeddings (summed) with WN (all): {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of retrofitted embeddings (summed) with WN (all):  62.48%


In [38]:
# using our french vectors retrofitted with the ppdb (averaged)
retrofitted_word_vectors = read_embeddings("embeddings/out_retrofitted_fr_ppdb_100.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors, path_train, vec_size=100, avg=True)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors, path_test, vec_size=100, avg=True)


clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy of retrofitted embeddings (averaged) with the PPDB: {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")


Reading embeddings...
Reading embeddings done!


NameError: name 'path_train' is not defined

In [71]:
# using our french vectors retrofitted with the ppdb (summed)
retrofitted_word_vectors = read_embeddings("embeddings/out_retrofitted_fr_ppdb_100.txt")
X_train_retrofit, Y_train_retrofit = reviews_to_vecs(retrofitted_word_vectors, path_train, vec_size=100, avg=False)
X_test_retrofit, Y_test_retrofit = reviews_to_vecs(retrofitted_word_vectors, path_test, vec_size=100, avg=False)


clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train_retrofit, Y_train_retrofit)

print(f"Perceptron accuracy of retrofitted embeddings (summed) with the PPDB: {clf.score(X_test_retrofit, Y_test_retrofit): .2%}")


Reading embeddings...
Reading embeddings done!
Perceptron accuracy of retrofitted embeddings (summed) with the PPDB:  64.40%
