### Q: What is the main difference between CBOW and Skip-Gram in comparison to previous models?
    Compared to feed-forward and recurrent neural networks, the new models are much simpler.


### Q: What is the benefit of this difference?

    Because of the much lower computational complexity, it is possible to compute very accurate high-dimensional word vectors from a much larger data set.

In [None]:
from scipy.stats import spearmanr
from random import randint
from nltk.corpus import brown
from random import shuffle
from gensim.models.keyedvectors import KeyedVectors

### Write a python method which reads the dataset into an appropriate format.

In [None]:
def filereader(textfile):
    """Read a tab-separated word-pair file into a list of tuples.

    The first line is treated as a header and skipped. Every later line with
    more than two tab-separated fields becomes ``(field0, field1, score)``,
    where ``score`` is the third field converted to ``float``. Lines with
    fewer fields (for example a trailing empty line) are ignored.

    Args:
        textfile: path of the file to read.

    Returns:
        A list of ``(str, str, float)`` tuples in file order.

    Raises:
        ValueError: if the third field of a data line is not a number.
    """
    #the context manager closes the file even if reading fails
    with open(textfile) as f:
        lines = f.read().split('\n')

    data = []
    #lines[1:] skips the header row
    for line in lines[1:]:
        fields = line.split('\t')
        if len(fields) > 2:
            data.append((fields[0], fields[1], float(fields[2])))

    return data

### Load the pretrained binary word2vec embeddings with gensim

In [None]:
#locations of the ws-353 word-pair file and the pretrained binary word2vec model
ws353_path = 'ws-353/combined.tab'
vectors_path = '/home/maximilian/Downloads/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

#parse the word pairs, then load the embeddings from the binary word2vec file
ws353 = filereader(ws353_path)
word_vectors = KeyedVectors.load_word2vec_format(vectors_path, binary=True)

### Compute the pairwise word similarity between each word pair using gensim's similarity method.

In [None]:
human_scores = []
ai_scores = []

#each ws353 entry is (word1, word2, human rating); score every pair with gensim
for word1, word2, rating in ws353:
    #the human rating is divided by 10 before it is stored
    scaled_rating = rating/10
    model_score = float(word_vectors.similarity(word1, word2))

    #keep both lists aligned so they can be compared entry by entry
    human_scores.append(scaled_rating)
    ai_scores.append(model_score)

### Compute the coefficient between the values assigned by humans and your results

In [None]:
#Spearman rank correlation between the human ratings and the gensim similarity scores
spearmanr(human_scores, ai_scores)

### Explain in two sentences what your resulting coefficient means.
    The resulting coefficient of about 0.7 indicates a strong positive monotonic
    relationship, i.e. gensim's similarity method tends to rank word pairs
    much like humans do.
    It is a rank correlation rather than a percentage, so it does not mean that
    humans and the method agree on exactly 70 percent of pairwise comparisons.

# Task 2.2 Gamified Word Intrusion (4P)

In [None]:
#load the Brown corpus with the universal tagset; words for the game are drawn from it
tagged_words = brown.tagged_words(tagset='universal')
#number of tagged tokens, used as the bound when drawing random indices
length = len(tagged_words)

In [None]:
#define constraints for a suitable 'parent word'
def pw_constraints(tpw):
    """Decide whether a tagged word is a suitable 'parent word'.

    A parent word is accepted when all of the following hold:
      * its tag is NOUN, ADJ, ADV or VERB,
      * the word is known to ``word_vectors``,
      * its third-ranked neighbour from ``most_similar`` scores above 0.65,
      * none of its neighbours is one of the listed function words or shares
        its first three letters (case-insensitively) with the word itself.

    Args:
        tpw: tagged parent word; ``tpw[0]`` is the word, ``tpw[1]`` its tag.

    Returns:
        True if the word satisfies every constraint, otherwise False.
    """
    word = tpw[0]
    tag = tpw[1]
    if tag not in ('NOUN', 'ADJ', 'ADV', 'VERB'):
        return False
    #membership test on the KeyedVectors object itself; does not rely on the
    #'.vocab' attribute, which newer gensim releases no longer provide
    if word not in word_vectors:
        return False

    #query the neighbours once; the lookup is reused for both checks below
    neighbours = word_vectors.most_similar(word)
    if not (neighbours[2][1] > 0.65):
        return False

    prefix = word[:3].lower()
    for neighbour in neighbours:
        candidate = neighbour[0]
        #reject function-word neighbours and neighbours with the same first three letters
        if candidate in ('is', 'be', 'have', 'can', 'would', 'do') or candidate[:3].lower() == prefix:
            return False
    return True

#define constraints for a suitable 'outlier word'
def ow_constrainsts(tpw, tow):
    """Decide whether a tagged word is a suitable 'outlier word' for a parent word.

    An outlier is accepted when all of the following hold:
      * its tag is NOUN, ADJ, ADV or VERB,
      * both the parent word and the outlier are known to ``word_vectors``,
      * the similarity between parent word and outlier is below 0.2,
      * none of the outlier's neighbours from ``most_similar`` is one of the
        listed function words.

    Args:
        tpw: tagged parent word; ``tpw[0]`` is the word.
        tow: tagged outlier candidate; ``tow[0]`` is the word, ``tow[1]`` its tag.

    Returns:
        True if the candidate satisfies every constraint, otherwise False.
    """
    word1 = tpw[0]
    word2 = tow[0]
    tag = tow[1]
    if tag not in ('NOUN', 'ADJ', 'ADV', 'VERB'):
        return False
    #membership test on the KeyedVectors object itself; does not rely on the
    #'.vocab' attribute, which newer gensim releases no longer provide
    if word1 not in word_vectors or word2 not in word_vectors:
        return False
    if not (float(word_vectors.similarity(word1, word2)) < 0.2):
        return False

    #every neighbour has to pass: accepting on the first neighbour that is not a
    #function word would make this check vacuous
    for neighbour in word_vectors.most_similar(word2):
        if neighbour[0] in ('is', 'be', 'have', 'can', 'would', 'do'):
            return False
    return True

In [None]:
#define constraints for user input    
def getSelectionInput():
    """Prompt until the user enters a whole number from 1 to 5 and return it.

    Input that cannot be parsed as an int prints a short notice; numbers
    outside 1..5 simply cause the prompt to be shown again.
    """
    while True:
        answer = input('Enter a number:')
        try:
            choice = int(answer)
        except ValueError:
            print("That's not an int!")
            continue
        #only values 1..5 are valid selections
        if 0 < choice <= 5:
            return choice

### Find four words which are similar to each other according to the pretrained word2vec embeddings, and one intruder word.

In [None]:
#build the questions: each quintuple holds a parent word, three similar words and one outlier (last)
quintuples = []
n_word_tuples = 5 #number of questions presented to the player

for _ in range(n_word_tuples):

    #draw random tagged words from the brown corpus until one qualifies as parent word
    while True:
        tagged_parent = tagged_words[randint(0, length-1)]
        if pw_constraints(tagged_parent) is True:
            break

    #draw random tagged words until one qualifies as outlier for that parent word
    while True:
        tagged_outlier = tagged_words[randint(0, length-1)]
        if ow_constrainsts(tagged_parent, tagged_outlier) is True:
            break

    #parent word, its three top-ranked neighbours, and the outlier in the last slot
    neighbours = word_vectors.most_similar(tagged_parent[0])
    quintuples.append([
        tagged_parent[0],
        neighbours[0][0],
        neighbours[1][0],
        neighbours[2][0],
        tagged_outlier[0],
    ])

### Print the five words in random order, then query a human (i.e. yourself) to spot the intruder.

In [None]:
print("Spot the intruder!")

#confusion counts for accuracy computation; each of the five shown words
#counts as one classification (intruder / not intruder)
TP = 0
TN = 0
FP = 0
FN = 0

for quintuple in quintuples:
    #the outlier is stored in the last slot of every quintuple
    outlier = quintuple[4]
    #shuffle a copy: shuffling the stored list in place would move the outlier
    #away from index 4 and break this cell when it is run a second time
    shown = list(quintuple)
    shuffle(shown)

    #print the five words, numbered from 1
    for position, word in enumerate(shown, 1):
        print(str(position)+":  ", word)
    print('\n')

    #query the user and update the counts based on the correctness of the answer
    if outlier == shown[getSelectionInput()-1]:
        #intruder found: one hit, the four other words correctly left alone
        TP += 1
        TN += 4
    else:
        #wrong word picked: one false alarm, one missed intruder, three words correctly left alone
        FP += 1
        FN += 1
        TN += 3

#calculate accuracy; report 0.0 instead of dividing by zero when no round was played
total = TP+TN+FP+FN
accuracy = (TP+TN)/total if total else 0.0

### Print the accuracy currently reached by the human.

In [None]:
#report the accuracy value computed from the confusion counts of the game
print('accuracy:', accuracy)