In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
from utils_capital import get_vectors

In [6]:
data = pd.read_csv('capitals.txt', delimiter=' ')
data.columns = ['city1', 'country1', 'city2', 'country2']

print(data.shape)
data.head()

(4951, 4)


Unnamed: 0,city1,country1,city2,country2
0,Athens,Greece,Bangkok,Thailand
1,Athens,Greece,Beijing,China
2,Athens,Greece,Berlin,Germany
3,Athens,Greece,Bern,Switzerland
4,Athens,Greece,Cairo,Egypt


In [13]:
import nltk
from gensim.models import KeyedVectors

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anubhav\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [16]:
embeddings = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [18]:
f = open('capitals.txt', 'r').read()

set_words = set(nltk.word_tokenize(f))

select_words = ['king', 'queen', 'oil', 'gas', 'happy', 'sad', 'city', 'town', 'village', 'country', 'continent', 'petroleum', 'joyful']

for w in select_words:
    set_words.add(w)

In [21]:
def get_word_embeddings(embeddings):
    word_embeddings = {}
    
    for word in embeddings.vocab:
        if word in set_words:
            word_embeddings[word] = embeddings[word]
    return word_embeddings

In [22]:
# Testing Function
word_embeddings = get_word_embeddings(embeddings)
print(len(word_embeddings))
pickle.dump(word_embeddings, open("word_embeddings_ak.p", "wb"))

243


In [23]:
word_embeddings = pickle.load(open("word_embeddings_ak.p", "rb"))
len(word_embeddings)

243

In [25]:
print(f"Dimension: {word_embeddings['Spain'].shape[0]}")

Dimension: 300


In [26]:
def cosine_similarity(A, B):
    dot = np.dot(A, B)
    norm_A = np.sqrt(np.dot(A, A))
    norm_B = np.sqrt(np.dot(B, B))
    cos = dot / (norm_A * norm_B)
    return cos

In [27]:
king = word_embeddings['king']
queen = word_embeddings['queen']

cosine_similarity(king, queen)

0.6510957

In [28]:
def euclidean(A, B):
    d = np.linalg.norm(A - B)
    return d

In [29]:
euclidean(king, queen)

2.4796925

In [30]:
def get_country(city1, country1, city2, embeddings):
    group = set((city1, country1, city2))
    
    city1_emb =  word_embeddings[city1]
    
    country1_emb = word_embeddings[country1]
    
    city2_emb = word_embeddings[city2]
    
    vec = country1_emb - city1_emb + city2_emb
    
    similarity = -1
    
    for word in embeddings.keys():
        if word not in group:
            word_emb = word_embeddings[word]
            cur_similarity = cosine_similarity(vec, word_emb)
            if cur_similarity > similarity:
                similarity = cur_similarity
                country = (word, similarity)
    return country

In [31]:
get_country('Athens', 'Greece', 'Cairo', word_embeddings)

('Egypt', 0.7626821)

In [36]:
def get_accuracy(word_embeddings, data):
    num_correct = 0 
    for i, row in data.iterrows():
        city1 = row['city1']
        country1 = row['country1']
        city2 = row['city2']
        country2 = row['country2']
        predicted_country2, _ = get_country(city1,country1,city2,word_embeddings)
        if predicted_country2 == country2:
            num_correct += 1
        m = len(data)
        accuracy = num_correct/m
    return accuracy

In [37]:
accuracy = get_accuracy(word_embeddings, data)
print(f"Accuracy is {accuracy:.2f}")

Accuracy is 0.92


In [None]:
def compute_pca(X, n_components= 2):
    X_demeaned = X - np.mean(X, axis = 0)
    covariance_matrix = np.cov(X_demeaned, rowvar=False)
    eigen_vals, eigen_vecs = np.linalg.eigh(covariance_matrix, UPLO='L')
    idx_sorted = np.argsort(eigen_vals)
    idx_sorted_decreasing = idx_sorted[::-1]
    eigen_vals_sorted = eigen_vals[:, idx_sorted_decreasing]
    eigen_vecs_sorted = eigen_vecs[:, idx_sorted_decreasing]
    
    eigen_vecs_subset = eigen_vecs_sorted[:,0:n_components]
    X_reduced = np.dot(eigen_vecs_subset.transpose(), X_demeaned.transpose()).transpose()