In [78]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from collections import Counter

In [79]:
dev_data = pd.read_csv('./dev.csv')
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

In [80]:
def get_embeddings(data, word_vectors):
    embeddings = np.zeros((len(data), word_vectors.vector_size))

    for i, sentence in enumerate(data['sentence']):
        words = sentence.split()
        valid_vectors = [word_vectors[word] for word in words if word in word_vectors]

        if valid_vectors:
            embeddings[i] = np.mean(valid_vectors, axis=0)

    return embeddings

def get_embeddings_pruned(data, word_vectors, n):
    words = [word for sentence in data['sentence'] for word in sentence.split()]
    words_freq = Counter(words)
    pruned_words = {word for word, freq in words_freq.items() if freq > n}

    embeddings = np.zeros((len(data), word_vectors.vector_size))

    for i, sentence in enumerate(data['sentence']):
        words = sentence.split()
        valid_vectors = [word_vectors[word] for word in words if word in word_vectors and word in pruned_words]

        if valid_vectors:
            embeddings[i] = np.mean(valid_vectors, axis=0)

    return embeddings

def get_one_hot_ids(data):
    one_hot_ids = {}
    id = 0

    for dataset in data:
        for sentence in dataset['sentence']:
            for word in sentence.split(' '):
                if word not in one_hot_ids:
                    one_hot_ids[word] = id
                    id += 1

    return one_hot_ids

def get_one_hot(data, ids):
    one_hot = np.zeros((len(data), len(ids)))
    for i, sentence in enumerate(data['sentence']):
        for word in sentence.split(' '):
            if word not in ids:
                continue

            one_hot[i][ids[word]] += 1

    return one_hot

def train(X_train, y_train, X_dev, y_dev):
    best_model = None
    best_rate = (0, 101)

    for k in range(1, 101, 2):
        classifier = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
        model = classifier.fit(X_train, y_train)

        predictions = model.predict(X_train)
        train_error_rate = (1 - accuracy_score(y_train, predictions)) * 100
        train_positive_rate = np.mean(predictions == "+") * 100

        predictions = model.predict(X_dev)
        dev_error_rate = (1 - accuracy_score(y_dev, predictions)) * 100
        dev_positive_rate = np.mean(predictions == "+") * 100

        if dev_error_rate < best_rate[1]:
            best_model = model
            best_rate = (k, dev_error_rate)

        print(f"k={k}   train_err {train_error_rate:.1f}% (+: {train_positive_rate:.1f}%)   dev_err {dev_error_rate:.1f}% (+: {dev_positive_rate:.1f}%)")

    print(f"\nBest dev error rate is {best_rate[1]:.1f}, when k = {best_rate[0]}")

    return best_model

## 1 Word Embeddings
### 1.1 Load and Query


In [81]:
from gensim.models import KeyedVectors

In [82]:
wv = KeyedVectors.load('embs_train.kv')

### 1.2 Vector Similarity

In [83]:
wv.most_similar('wonderful', topn=10)

[('marvelous', 0.8188857436180115),
 ('fantastic', 0.8047919869422913),
 ('great', 0.7647868990898132),
 ('fabulous', 0.7614760398864746),
 ('terrific', 0.7420831918716431),
 ('lovely', 0.7320095896720886),
 ('amazing', 0.7263179421424866),
 ('beautiful', 0.6854085922241211),
 ('magnificent', 0.6633867025375366),
 ('delightful', 0.6574996113777161)]

In [84]:
wv.most_similar('awful', topn=10)

[('horrible', 0.7597667574882507),
 ('terrible', 0.7478911280632019),
 ('dreadful', 0.7218177318572998),
 ('horrid', 0.6720177531242371),
 ('atrocious', 0.6626645922660828),
 ('ugly', 0.6236302852630615),
 ('lousy', 0.6135216951370239),
 ('unbelievable', 0.6068726181983948),
 ('appalling', 0.6061566472053528),
 ('hideous', 0.5811460614204407)]

In [85]:
wv.most_similar('ai', topn=10)

[('te', 0.6197360754013062),
 ('mai', 0.5895475149154663),
 ('di', 0.561583399772644),
 ('cosa', 0.5554686784744263),
 ('sua', 0.5469498038291931),
 ('ga', 0.53242027759552),
 ('se', 0.529657244682312),
 ('sia', 0.5292683839797974),
 ('han', 0.5262174010276794),
 ('uma', 0.5229132175445557)]

In [86]:
wv.most_similar('artificial', topn=10)

[('synthetic', 0.48184075951576233),
 ('unnatural', 0.431232213973999),
 ('illusion', 0.4310288727283478),
 ('sterile', 0.3774666488170624),
 ('plastic', 0.36900731921195984),
 ('imaginary', 0.36322855949401855),
 ('prosthetic', 0.3631141483783722),
 ('artificiality', 0.3588675856590271),
 ('natural', 0.33741024136543274),
 ('stimulation', 0.3294525444507599)]

In [87]:
wv.most_similar('life', topn=10)

[('lives', 0.6027060151100159),
 ('lifestyle', 0.44911324977874756),
 ('lifetime', 0.42942190170288086),
 ('living', 0.4151720702648163),
 ('family', 0.41159167885780334),
 ('everyday', 0.40794867277145386),
 ('humanity', 0.40773284435272217),
 ('childhood', 0.4068482220172882),
 ('society', 0.4066685140132904),
 ('motherhood', 0.4046342372894287)]

### 1.3 Word Analogy

In [88]:
wv.most_similar(positive=['sister', 'man'], negative=['woman'], topn=10)

[('brother', 0.7966989874839783),
 ('uncle', 0.6753759980201721),
 ('nephew', 0.6596081852912903),
 ('son', 0.6472460031509399),
 ('father', 0.6398823857307434),
 ('brothers', 0.6266913414001465),
 ('dad', 0.5981076955795288),
 ('siblings', 0.5654128789901733),
 ('daughter', 0.5610913634300232),
 ('sons', 0.5580724477767944)]

In [89]:
wv.most_similar(positive=['harder', 'fast'], negative=['hard'], topn=10)

[('faster', 0.7064899206161499),
 ('rapidly', 0.5021132826805115),
 ('easier', 0.48843100666999817),
 ('slow', 0.4575234651565552),
 ('quickly', 0.4370786249637604),
 ('bigger', 0.4148872196674347),
 ('cheaper', 0.41006121039390564),
 ('louder', 0.409576416015625),
 ('slowly', 0.40936195850372314),
 ('smarter', 0.40232229232788086)]

In [90]:
wv.most_similar(positive=['breathe', 'death'], negative=['life'], topn=5)

[('suffocation', 0.4820758104324341),
 ('suffocated', 0.46953943371772766),
 ('suffocate', 0.4326365888118744),
 ('breathing', 0.4223504066467285),
 ('drowning', 0.3890712857246399)]

In [91]:
wv.most_similar(positive=['speed', 'snail'], negative=['horse'], topn=5)

[('speeds', 0.5445727705955505),
 ('slow', 0.4046669900417328),
 ('slowness', 0.36570635437965393),
 ('speedy', 0.36554884910583496),
 ('faster', 0.3575904071331024)]

In [92]:
wv.most_similar(positive=['army', 'gangster'], negative=['soldier'], topn=5)

[('mafia', 0.571354866027832),
 ('underworld', 0.5466997027397156),
 ('mob', 0.497639536857605),
 ('gangs', 0.4962001144886017),
 ('gang', 0.4387965500354767)]

## 2 Better Perceptron using Embeddings

In [93]:
best_native_model = None
best_one_hot_model = None
train_embeddings = get_embeddings(train_data, wv)
dev_embeddings = get_embeddings(dev_data, wv)

### 2.1 Sentence Embedding and k-NN
#### 2.1.1

In [94]:
distances = np.linalg.norm(train_embeddings - train_embeddings[0], axis=1)
distances[0] = np.inf

train_data.iloc[[0, np.argmin(distances)]]

Unnamed: 0,id,sentence,target
0,0,"it 's a tour de force , written and directed so quietly that it 's implosion rather than explosion you fear",+
2061,2061,a semi autobiographical film that 's so sloppily written and cast that you can not believe anyone more central to the creation of bugsy than the caterer had anything to do with it,-


#### 2.1.2

In [95]:
distances = np.linalg.norm(train_embeddings - train_embeddings[1], axis=1)
distances[1] = np.inf

train_data.iloc[[1, np.argmin(distances)]]

Unnamed: 0,id,sentence,target
1,1,"places a slightly believable love triangle in a difficult to swallow setting , and then disappointingly moves the story into the realm of an improbable thriller",-
4205,4205,the plan to make enough into an inspiring tale of survival wrapped in the heart pounding suspense of a stylish psychological thriller ' has flopped as surely as a souffl gone wrong,-


#### 2.1.3

In [96]:
best_native_model = train(train_embeddings, train_data['target'], dev_embeddings, dev_data['target'])

k=1   train_err 0.0% (+: 50.0%)   dev_err 37.0% (+: 52.2%)
k=3   train_err 16.1% (+: 50.5%)   dev_err 34.8% (+: 50.0%)
k=5   train_err 19.8% (+: 50.0%)   dev_err 34.5% (+: 48.3%)
k=7   train_err 21.2% (+: 49.9%)   dev_err 33.8% (+: 49.0%)
k=9   train_err 22.0% (+: 49.4%)   dev_err 31.9% (+: 49.1%)
k=11   train_err 22.5% (+: 49.3%)   dev_err 30.7% (+: 49.1%)
k=13   train_err 22.8% (+: 49.1%)   dev_err 31.3% (+: 48.9%)
k=15   train_err 23.6% (+: 49.0%)   dev_err 30.2% (+: 49.6%)
k=17   train_err 23.5% (+: 48.9%)   dev_err 31.3% (+: 47.5%)
k=19   train_err 23.6% (+: 48.1%)   dev_err 30.9% (+: 47.7%)
k=21   train_err 24.0% (+: 48.0%)   dev_err 30.9% (+: 47.5%)
k=23   train_err 24.1% (+: 48.1%)   dev_err 31.2% (+: 46.6%)
k=25   train_err 24.0% (+: 47.9%)   dev_err 30.2% (+: 45.8%)
k=27   train_err 24.4% (+: 47.7%)   dev_err 30.4% (+: 46.0%)
k=29   train_err 24.3% (+: 47.6%)   dev_err 29.3% (+: 46.5%)
k=31   train_err 24.3% (+: 47.3%)   dev_err 29.6% (+: 46.6%)
k=33   train_err 24.3% (+: 46.

#### 2.1.4

In [97]:
one_hot_ids = get_one_hot_ids([train_data, dev_data])
train_one_hot = get_one_hot(train_data, one_hot_ids)
dev_one_hot = get_one_hot(dev_data, one_hot_ids)

best_one_hot_model = train(train_one_hot, train_data['target'], dev_one_hot, dev_data['target'])

k=1   train_err 0.0% (+: 50.0%)   dev_err 42.8% (+: 46.8%)
k=3   train_err 22.5% (+: 48.0%)   dev_err 41.4% (+: 47.0%)
k=5   train_err 27.1% (+: 46.5%)   dev_err 41.8% (+: 45.8%)
k=7   train_err 30.8% (+: 46.7%)   dev_err 40.6% (+: 46.8%)
k=9   train_err 31.3% (+: 47.1%)   dev_err 40.3% (+: 49.1%)
k=11   train_err 32.3% (+: 48.3%)   dev_err 41.8% (+: 49.2%)
k=13   train_err 33.4% (+: 49.2%)   dev_err 41.7% (+: 50.7%)
k=15   train_err 33.7% (+: 51.0%)   dev_err 42.9% (+: 51.7%)
k=17   train_err 34.3% (+: 52.1%)   dev_err 42.9% (+: 54.1%)
k=19   train_err 34.5% (+: 52.6%)   dev_err 42.6% (+: 54.6%)
k=21   train_err 34.9% (+: 54.4%)   dev_err 43.0% (+: 57.6%)
k=23   train_err 35.0% (+: 55.1%)   dev_err 43.1% (+: 58.5%)
k=25   train_err 36.3% (+: 55.0%)   dev_err 43.5% (+: 57.1%)
k=27   train_err 36.2% (+: 55.7%)   dev_err 43.3% (+: 57.9%)
k=29   train_err 35.9% (+: 56.2%)   dev_err 42.2% (+: 59.0%)
k=31   train_err 35.9% (+: 57.3%)   dev_err 43.0% (+: 59.4%)
k=33   train_err 36.0% (+: 59.

#### 2.1.5

In [98]:
test_predictions = best_native_model.predict(get_embeddings(test_data, wv))

test_data_copy = test_data.copy()
test_data_copy['target'] = test_predictions

test_data_copy.to_csv('test_part2.predicted.csv', index=False)

### 2.2 Reimplement Perceptron
#### 2.2.1

In [99]:
best_rate = (0, 101)
weights = np.zeros(wv.vector_size)
bias = 0

for epoch in range(1, 11):
    train_predictions = np.empty(len(train_data), dtype=str)
    dev_predictions = np.empty(len(dev_data), dtype=str)

    for i, embedding in enumerate(train_embeddings):
        prediction = np.dot(weights, embedding) + bias
        label = 1 if train_data.iloc[i]['target'] == "+" else -1

        if label * prediction <= 0:
            weights += label * embedding
            bias += label

    for i, embedding in enumerate(train_embeddings):
        prediction = np.dot(weights, embedding) + bias
        label = '+' if prediction > 0 else '-'
        train_predictions[i] = label

    train_error_rate = (1 - accuracy_score(train_data['target'], train_predictions)) * 100
    train_positive_rate = np.mean(train_predictions == "+") * 100

    for i, embedding in enumerate(dev_embeddings):
        prediction = np.dot(weights, embedding) + bias
        label = '+' if prediction > 0 else '-'
        dev_predictions[i] = label

    dev_error_rate = (1 - accuracy_score(dev_data['target'], dev_predictions)) * 100
    dev_positive_rate = np.mean(dev_predictions == "+") * 100

    if dev_error_rate < best_rate[1]:
        best_rate = (epoch, dev_error_rate)

    print(f"Epoch={epoch}   train_err {train_error_rate:.1f}% (+: {train_positive_rate:.1f}%)   dev_err {dev_error_rate:.1f}% (+: {dev_positive_rate:.1f}%)")

print(f"\nBest dev error rate is {best_rate[1]:.1f}, when epoch = {best_rate[0]}")

Epoch=1   train_err 27.8% (+: 31.1%)   dev_err 32.5% (+: 28.5%)
Epoch=2   train_err 30.2% (+: 24.3%)   dev_err 36.0% (+: 21.4%)
Epoch=3   train_err 35.5% (+: 16.2%)   dev_err 39.2% (+: 15.0%)
Epoch=4   train_err 37.0% (+: 14.2%)   dev_err 41.4% (+: 12.2%)
Epoch=5   train_err 27.9% (+: 28.5%)   dev_err 33.7% (+: 24.9%)
Epoch=6   train_err 25.2% (+: 35.6%)   dev_err 31.3% (+: 32.3%)
Epoch=7   train_err 31.4% (+: 22.3%)   dev_err 36.2% (+: 19.6%)
Epoch=8   train_err 34.6% (+: 17.7%)   dev_err 39.1% (+: 15.7%)
Epoch=9   train_err 38.0% (+: 13.4%)   dev_err 41.8% (+: 11.8%)
Epoch=10   train_err 35.0% (+: 17.4%)   dev_err 39.0% (+: 16.0%)

Best dev error rate is 31.3, when epoch = 6


#### 2.2.2

In [104]:
def train_avg_perceptron(train_embeddings, dev_embeddings):
    best_rate = (0, 101)
    weights = np.zeros(wv.vector_size)
    bias = 0
    total_weights_updates = np.zeros(wv.vector_size)
    total_bias_updates = 0
    total_updates = 0

    for epoch in range(1, 11):
        train_predictions = np.empty(len(train_data), dtype=str)
        dev_predictions = np.empty(len(dev_data), dtype=str)

        for i, embedding in enumerate(train_embeddings):
            prediction = np.dot(weights, embedding) + bias
            label = 1 if train_data.iloc[i]['target'] == "+" else -1

            if label * prediction <= 0:
                weights += label * embedding
                bias += label
                total_weights_updates += total_updates * embedding * label
                total_bias_updates += total_updates * label

            total_updates += 1

        avg_weights = weights - (total_weights_updates / total_updates)
        avg_bias = bias - (total_bias_updates / total_updates)

        for i, embedding in enumerate(train_embeddings):
            prediction = np.dot(avg_weights, embedding) + avg_bias
            label = '+' if prediction > 0 else '-'
            train_predictions[i] = label

        train_error_rate = (1 - accuracy_score(train_data['target'], train_predictions)) * 100
        train_positive_rate = np.mean(train_predictions == "+") * 100

        for i, embedding in enumerate(dev_embeddings):
            prediction = np.dot(avg_weights, embedding) + avg_bias
            label = '+' if prediction > 0 else '-'
            dev_predictions[i] = label

        dev_error_rate = (1 - accuracy_score(dev_data['target'], dev_predictions)) * 100
        dev_positive_rate = np.mean(dev_predictions == "+") * 100

        if dev_error_rate < best_rate[1]:
            best_rate = (epoch, dev_error_rate)

        print(f"Epoch={epoch}   train_err {train_error_rate:.1f}% (+: {train_positive_rate:.1f}%)   dev_err {dev_error_rate:.1f}% (+: {dev_positive_rate:.1f}%)")

    print(f"\nBest dev error rate is {best_rate[1]:.1f}, when epoch = {best_rate[0]}")

    return (avg_weights, avg_bias)

avg_perceptron_model = train_avg_perceptron(train_embeddings, dev_embeddings)

Epoch=1   train_err 21.6% (+: 48.8%)   dev_err 25.4% (+: 49.2%)
Epoch=2   train_err 21.1% (+: 48.9%)   dev_err 25.3% (+: 48.3%)
Epoch=3   train_err 20.9% (+: 48.8%)   dev_err 25.2% (+: 48.6%)
Epoch=4   train_err 20.9% (+: 48.6%)   dev_err 25.0% (+: 47.4%)
Epoch=5   train_err 20.8% (+: 48.7%)   dev_err 24.2% (+: 47.8%)
Epoch=6   train_err 20.9% (+: 48.6%)   dev_err 24.3% (+: 47.9%)
Epoch=7   train_err 20.9% (+: 48.8%)   dev_err 24.5% (+: 47.7%)
Epoch=8   train_err 21.0% (+: 48.9%)   dev_err 24.4% (+: 48.0%)
Epoch=9   train_err 20.8% (+: 48.9%)   dev_err 24.3% (+: 48.1%)
Epoch=10   train_err 20.8% (+: 49.0%)   dev_err 24.6% (+: 47.8%)

Best dev error rate is 24.2, when epoch = 5


#### 2.2.4

In [105]:
train_embeddings_pruned = get_embeddings_pruned(train_data, wv, 1)
dev_embeddings_pruned = get_embeddings_pruned(dev_data, wv, 1)

avg_perceptron_pruned_model = train_avg_perceptron(train_embeddings_pruned, dev_embeddings_pruned)

Epoch=1   train_err 22.4% (+: 49.0%)   dev_err 28.1% (+: 47.5%)
Epoch=2   train_err 21.9% (+: 49.3%)   dev_err 27.3% (+: 46.7%)
Epoch=3   train_err 21.8% (+: 49.2%)   dev_err 27.4% (+: 47.2%)
Epoch=4   train_err 21.6% (+: 49.2%)   dev_err 26.8% (+: 47.2%)
Epoch=5   train_err 21.7% (+: 49.1%)   dev_err 26.8% (+: 47.2%)
Epoch=6   train_err 21.7% (+: 49.0%)   dev_err 27.0% (+: 46.6%)
Epoch=7   train_err 21.8% (+: 49.1%)   dev_err 26.9% (+: 47.1%)
Epoch=8   train_err 21.7% (+: 49.1%)   dev_err 26.6% (+: 47.2%)
Epoch=9   train_err 21.8% (+: 49.1%)   dev_err 26.7% (+: 47.5%)
Epoch=10   train_err 21.8% (+: 49.3%)   dev_err 26.7% (+: 47.3%)

Best dev error rate is 26.6, when epoch = 8


#### 2.2.5

In [108]:
one_hot_dev_predictions = best_native_model.predict(get_embeddings(dev_data, wv))
perceptron_predictions = np.empty(len(dev_data), dtype=str)

for i, embedding in enumerate(get_embeddings(dev_data, wv)):
    prediction = np.dot(avg_perceptron_model[0], embedding) + avg_perceptron_model[1]
    label = '+' if prediction > 0 else '-'
    perceptron_predictions[i] = label

for i, label in enumerate(dev_data['target']):
    if label == perceptron_predictions[i] and label != one_hot_dev_predictions[i]:
        print(dev_data.iloc[i]["sentence"])
        print(f'Word2vec guess {perceptron_predictions[i]} (correct)')
        print(f'Word2vec guess {one_hot_dev_predictions[i]} (incorrect)\n')

a real audience pleaser that will strike a chord with anyone who 's ever waited in a doctor 's office , emergency room , hospital bed or insurance company office
Word2vec guess + (correct)
Word2vec guess - (incorrect)

get out your pooper scoopers
Word2vec guess - (correct)
Word2vec guess + (incorrect)

you 've already seen city by the sea under a variety of titles , but it 's worth yet another visit
Word2vec guess + (correct)
Word2vec guess - (incorrect)

if signs is a good film , and it is , the essence of a great one is in there somewhere
Word2vec guess + (correct)
Word2vec guess - (incorrect)

although largely a heavy handed indictment of parental failings and the indifference of spanish social workers and legal system towards child abuse , the film retains ambiguities that make it well worth watching
Word2vec guess + (correct)
Word2vec guess - (incorrect)

suffers from unlikable characters and a self conscious sense of its own quirky hipness
Word2vec guess - (correct)
Word2vec gue

#### 2.2.6

In [113]:
predictions = np.empty(len(test_data), dtype=str)

for i, embedding in enumerate(get_embeddings(test_data, wv)):
    prediction = np.dot(avg_perceptron_model[0], embedding) + avg_perceptron_model[1]
    label = '+' if prediction > 0 else '-'
    predictions[i] = label

test_data_copy = test_data.copy()
test_data_copy['target'] = predictions

test_data_copy.to_csv('test_part2_perceptron.predicted.csv', index=False)

In [114]:
predictions = np.empty(len(test_data), dtype=str)

for i, embedding in enumerate(get_embeddings(test_data, wv)):
    prediction = np.dot(avg_perceptron_pruned_model[0], embedding) + avg_perceptron_pruned_model[1]
    label = '+' if prediction > 0 else '-'
    predictions[i] = label

test_data_copy = test_data.copy()
test_data_copy['target'] = predictions

test_data_copy.to_csv('test_part2_perceptron_pruned.predicted.csv', index=False)

## 3 Try some other learning algorithms with sklearn

In [116]:
from sklearn.svm import SVC
import time

start_time = time.time()

classifier = SVC(kernel='linear', C=1.0, random_state=42)
classifier.fit(train_embeddings, train_data['target'].apply(lambda x: 1 if x == "+" else -1))

train_predictions = classifier.predict(train_embeddings)
train_labels_pred = ["+" if label == 1 else "-" for label in train_predictions]

train_error_rate = (1 - accuracy_score(train_data['target'], train_labels_pred)) * 100
train_positive_rate = train_labels_pred.count("+") / len(train_labels_pred) * 100

dev_predictions = classifier.predict(dev_embeddings)
dev_labels_pred = ["+" if label == 1 else "-" for label in dev_predictions]

dev_error_rate = (1 - accuracy_score(dev_data['target'], dev_labels_pred)) * 100
dev_positive_rate = dev_labels_pred.count("+") / len(dev_labels_pred) * 100

print(f"train_err {train_error_rate:.1f}% (+: {train_positive_rate:.1f}%)   dev_err {dev_error_rate:.1f}% (+: {dev_positive_rate:.1f}%)")
print(f"Runtime: {time.time() - start_time:.2f} sec")

train_err 20.4% (+: 47.9%)   dev_err 23.7% (+: 47.3%)
Runtime: 6.76 sec


In [120]:
test_predictions = classifier.predict(get_embeddings(test_data, wv))
test_labels_pred = ["+" if label == 1 else "-" for label in test_predictions]

test_data_copy = test_data.copy()
test_data_copy['target'] = test_labels_pred

test_data_copy.to_csv("test_part3.predicted.csv", index=False)
