# Predicting Country Capitals using Word Embeddings

## Imports and Dataset 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [2]:
data = pd.read_csv('capitals.txt', delimiter=' ')
# This text file doesn't already contain the column titles
data.columns = ['city1', 'country1', 'city2', 'country2']
data.head(5)

Unnamed: 0,city1,country1,city2,country2
0,Athens,Greece,Bangkok,Thailand
1,Athens,Greece,Beijing,China
2,Athens,Greece,Berlin,Germany
3,Athens,Greece,Bern,Switzerland
4,Athens,Greece,Cairo,Egypt


In [3]:
word_embeddings = pickle.load(open("word_embeddings_subset.p", "rb"))
print(len(word_embeddings))

243


In [4]:
print(word_embeddings['France'].shape)

(300,)


In this notebook, we try to predict analogies, here using the analogy of city-country for a pair, given a city, find its country with the help of the word_embeddings data.

In [5]:
def cosine_similarity(A, B):
    '''
        Return cosine similarity between np arrays A, B.
        We know A.B = |A||B|cos(A, B); 
        So, cos(A, B) = (A.B)/|A||B|, where |A| means norm(A)
    '''
    
    return (np.dot(A, B)/(np.linalg.norm(A) * np.linalg.norm(B)))

In [6]:
print(cosine_similarity(word_embeddings['happy'], word_embeddings['sad']))

0.53546137


In [7]:
def euclidean_distance(A, B):
    return np.sqrt(np.sum((A - B) * (A - B), axis=0))

In [8]:
print(euclidean_distance(word_embeddings['king'], word_embeddings['queen']))

2.4796925


### Given city1, country1 and city2, find country2

In [9]:
def get_country(city1, country1, city2, word_embeddings):
    
    city1_emb    = word_embeddings[city1]
    country1_emb = word_embeddings[country1]
    city2_emb    = word_embeddings[city2]
    
    '''
        Country1 - City1 ~ Country2 - City2, where each term is the vector for the same.
        (Note: We can also multiply -1 obs, but then we get the vector for -Country2, so it's better this way itself)
        So,
        Country1 - City1 + City2 ~ Country2
        Let v = Country1 - City1 + City2, then from our data, find the vector that is closest to v. 
        This should be the best guess for Country2
    
    '''
    v = country1_emb - city1_emb + city2_emb
    
    best_similarity = -1
    country = ''
    
    for word in word_embeddings.keys():
        
        if(word == city1 or word == country1 or word == city2):
            continue
            
        word_emb = word_embeddings[word]
        similarity = cosine_similarity(word_emb, v)
        
        if(similarity > best_similarity): # Cosine similarity: Higher, the better (more similar)
            best_similarity = similarity
            country = (word, similarity)
        
    return country

In [10]:
print(get_country('Athens', 'Greece', 'Paris', word_embeddings))

('France', 0.6609893)


## Find the model's accuracy

In [11]:
def model_accuracy(word_embeddings, data):
    '''
        Input:
        word_embeddings: dictionary; dict[word] = its embedding Vector
        data: a pandas dataframe containing all the country and capital city pairs

        Output:
        accuracy: the accuracy of the model
        
    '''
    
    correct_predictions = 0
    
    for i, row in data.iterrows():
        city1, country1, city2, country2 = row[0], row[1], row[2], row[3]
        
        predicted_country, _ = get_country(city1, country1, city2, word_embeddings)
        
        if(predicted_country == country2):
            correct_predictions += 1
    
    accuracy = correct_predictions/len(data)
    return accuracy

In [12]:
accuracy = model_accuracy(word_embeddings, data)
print(f"Accuracy is {accuracy:.2f}")

Accuracy is 0.92
