# Generating incorrect answers for questions


## Imports

In [1]:
import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

In [2]:
# Raw strings keep the backslash path separators literal; in a normal string
# literal '\g' and '\w' are invalid escape sequences and trigger a warning.
glove_file = r'embeddings\glove.6B.300d.txt'
tmp_file = r'embeddings\word2vec-glove.6B.300d.txt'

In [3]:
from gensim.scripts.glove2word2vec import glove2word2vec
# Run the GloVe -> word2vec conversion script from glove_file into tmp_file,
# then load tmp_file as the KeyedVectors instance used by the rest of the notebook.
# NOTE(review): newer gensim releases reportedly allow loading GloVe text directly
# via load_word2vec_format(..., no_header=True) - confirm against the installed version.
glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

## Basic cosine similarity on a single word

In [4]:
model.most_similar(positive=['robert'], topn=10)

[('william', 0.5641534328460693),
 ('james', 0.5441264510154724),
 ('j.', 0.5343754887580872),
 ('h.', 0.5330929756164551),
 ('d.', 0.5264898538589478),
 ('richard', 0.5263983011245728),
 ('john', 0.5211343765258789),
 ('henry', 0.5162783861160278),
 ('b.', 0.5118230581283569),
 ('l.', 0.508935809135437)]

In [5]:
def generate_distractors(answer, count):
    """Return up to ``count`` words the embedding model ranks closest to ``answer``.

    Args:
        answer (str): Word to look up; it is lower-cased before the lookup.
        count (int): Number of neighbours requested from the model.

    Returns:
        list: The neighbouring words without their similarity scores, or an
        empty list when the lookup fails for any reason.
    """
    answer = str.lower(answer)

    try:
        closestWords = model.most_similar(positive=[answer], topn=count)
    except Exception:
        # Deliberate best-effort: the word is not in the vocabulary, or the
        # embeddings could not be used. Unlike a bare ``except:`` this still
        # lets KeyboardInterrupt / SystemExit propagate.
        return []

    # Keep only the word of each (word, score) pair, capped at ``count`` entries.
    return [entry[0] for entry in closestWords][0:count]

In [6]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [7]:
model.most_similar(positive=['write'], topn=10)

[('writing', 0.6969849467277527),
 ('read', 0.6291235089302063),
 ('wrote', 0.6251993179321289),
 ('written', 0.6065735816955566),
 ('publish', 0.5670630931854248),
 ("'d", 0.5343195796012878),
 ('writes', 0.5341792702674866),
 ('tell', 0.5337096452713013),
 ('you', 0.5316603779792786),
 ('books', 0.5285096168518066)]

In [8]:
model.most_similar(positive=['1912', 'balkan', 'war'], topn=10)

[('1914', 0.6271713972091675),
 ('1918', 0.6049230098724365),
 ('wars', 0.5950419306755066),
 ('1919', 0.5936863422393799),
 ('1915', 0.592497706413269),
 ('1913', 0.5884469747543335),
 ('1920', 0.5811989307403564),
 ('balkans', 0.580984354019165),
 ('1916', 0.5671558380126953),
 ('1921', 0.5582135319709778)]

## Using most common words and different approach for numerical and non-numerical words
- used most-common words to tag importance

def generate_distractors_phrase(phrase, count = 4):
    """Early draft: tag each token of ``phrase`` with a 0/1 importance flag.

    A token gets 0 when its lower-cased lemma is in ``StopWords.SkipWords``
    and 1 otherwise. The stored occurrence count of every token is printed
    for inspection only. ``count`` is not used yet.

    Args:
        phrase (str): Phrase to analyse.
        count (int): Unused in this draft.

    Returns:
        list: One flag per token.
    """
    doc = nlp(phrase)

    # e.g. 'at around 180 years' - [1, 1, 0, 1]
    wordImportance = [
        0 if token.lemma_.lower() in StopWords.SkipWords else 1
        for token in doc
    ]

    # Occurrence counts stand in for tf/idf scores for now; they are only printed.
    occurrenceTable = load_pickle('../../data/squad-v1/idf/word-occurances-paragraph.pkl')
    for token in doc:
        text = token.text
        print(occurrenceTable[text] if text in occurrenceTable else 0)

    # Still to do: generate similar words for the meaningful tokens, keep the
    # original formatting (case), and mix the alternatives into phrases.
    return wordImportance

generate_distractors_phrase('at around 180 years six to seven months')

In [57]:
import _pickle as cPickle
from pathlib import Path
import os
import datetime

# TODO: rename method names with '_'
def dump_pickle(content, fileName: str):
    """Save a python object as a pickle.

    Args:
        content: Python object.
        fileName (str): File name and path, relative to the current executed file.
    """
    # The context manager closes the file even when pickling raises.
    with open(fileName, 'wb') as pickleFile:
        # A negative protocol number selects the highest protocol available.
        cPickle.dump(content, pickleFile, -1)


def load_pickle(fileName: str):
    """Load a python object saved as a pickle.

    Args:
        fileName (str): File name and path to the pickle object

    Returns:
        The python object saved in the pickle.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the file even when unpickling raises.
    with open(fileName, 'rb') as file:
        return cPickle.load(file)


def pickle_exists(fileName:str) -> bool:
    """Tell whether ``fileName`` points at an existing regular file.

    Args:
        fileName (str): File name and path to look for.

    Returns:
        bool: True when a file exists at that path, False otherwise.
    """
    return Path(fileName).is_file()

def save_pickle_dated(content, path: str,  revision: str, contentName: str):
    """ Save a pickle file with date information in the name.

    The file is named ``YYYYMMDD-HHMM-<revision>-<contentName>.pkl``. Hour and
    minute are zero-padded so that, for example, 01:23 and 12:03 no longer
    both render as ``123``.

    Args:
        content: Python object to be pickled.
        path (str): Path to the save directory. The file name is appended
            directly, so it must end with a path separator.
        revision (str): Notable change from the previous version.
        contentName (str): Object type that's being saved. (e.g. test, dev, model)
    """
    now = datetime.datetime.now()
    pickleName = "{0}{1:02d}{2:02d}-{3:02d}{4:02d}-{5}-{6}.pkl".format(
        now.year, now.month, now.day, now.hour, now.minute, revision, contentName)

    dump_pickle(content, path + pickleName)


- separate importance tagging for numerals and non-numerals

### Numerals and Non-numerals importance tagging
Decided upon having two separate paths for tagging important words. One for numerals and one for non-numerals.

#### Words

In [58]:
def get_most_important_words(phrase, count = 2):
    """Flag the rarest tokens of ``phrase`` as important.

    Each token is looked up in the stored occurrence table (0 when absent).
    The ``count``-th smallest occurrence value becomes the threshold; tokens
    above it are flagged 0, the rest 1.

    Args:
        phrase (str): Phrase to analyse.
        count (int): Number of tokens to keep. Values larger than the number
            of tokens keep every token instead of raising IndexError.

    Returns:
        list: One 0/1 flag per token; an empty list for an empty phrase.
        Ties at the threshold can yield more than ``count`` flagged tokens.
    """
    doc = nlp(phrase)
    if len(doc) == 0:
        # Nothing to rank; indexing the sorted occurrences would fail.
        return []

    wordOccurences = load_pickle('idf/word-occurances-paragraph.pkl')

    phraseOcc = [
        wordOccurences[token.text] if token.text in wordOccurences else 0
        for token in doc
    ]

    # Clamp so that asking for more words than the phrase has stays in range.
    threshold = sorted(phraseOcc)[min(count, len(phraseOcc)) - 1]

    return [0 if occurences > threshold else 1 for occurences in phraseOcc]
        

In [59]:
get_most_important_words('at around 180 years', 1)

[0, 0, 1, 0]

#### Numerical

In [60]:
def get_numerical_words(phrase):
    """Flag the number-like tokens of ``phrase``.

    Args:
        phrase (str): Phrase to analyse.

    Returns:
        list: One entry per token, 1 where the token's ``like_num`` is truthy
        and 0 elsewhere.
    """
    return [1 if token.like_num else 0 for token in nlp(phrase)]

In [61]:
get_numerical_words('at around 180 years')

[0, 0, 1, 0]

In [62]:
get_numerical_words('from six to seven months')

[0, 1, 0, 1, 0]

#### General method

In [63]:
def generate_distractors_phrase(phrase, count = 4):
    """Draft: collect similar words for every important token of ``phrase``.

    Phrases containing a number-like token are tagged with
    ``get_numerical_words``; all others with ``get_most_important_words``.
    Mixing the alternatives into whole phrases is not implemented here, and
    ``count`` is not used yet.

    Args:
        phrase (str): Phrase to generate alternatives for.
        count (int): Unused in this draft.

    Returns:
        list: One list per token, holding the similar words for important
        tokens and an empty list for the others.
    """
    doc = nlp(phrase)

    #Check type of phrase
    isNumerical = any(token.like_num for token in doc)

    if isNumerical:
        wordImportance = get_numerical_words(phrase)
    else:
        wordImportance = get_most_important_words(phrase, 4)

    #TODO: Decide upon amount of similars per word
    similarForWords = [
        generate_distractors(token.text, 3) if wordImportance[i] == 1 else []
        for i, token in enumerate(doc)
    ]

    #TODO: Mix the per-word alternatives into phrases (all combinations, ordered),
    # keep formatting / case sensitivity, consider relationships between mixed words.
    return similarForWords

In [64]:
similarForWords = generate_distractors_phrase('the koala inhabits north and south whales')

In [65]:
similarForWords

[[],
 ['probo', 'koalas', 'orangutan'],
 ['inhabit', 'inhabiting', 'habitats'],
 [],
 [],
 ['north', 'africa', 'korea'],
 ['whale', 'humpback', 'minke']]

## Generating all mixes

#### All combinations of mixes
- Recursively generating all possible mixes of distractors

In [54]:
def get_distractor_combinations(currPhrase, similarForWords, i):
    """Recursively extend ``currPhrase`` with one word from every non-empty slot.

    Starting at index ``i``, the first non-empty entry of ``similarForWords``
    is expanded: each of its words is appended (space-separated) to
    ``currPhrase`` and the search continues behind that slot.

    Args:
        currPhrase (str): Phrase built so far.
        similarForWords (list): Per-position lists of candidate words.
        i (int): Index of the first slot still to consider.

    Returns:
        ``currPhrase`` itself when no slot is left to expand, otherwise a list
        with one (possibly nested) result per candidate word.
    """
    # First slot at or after i that offers any choice.
    slot = next(
        (idx for idx in range(i, len(similarForWords)) if similarForWords[idx] != []),
        None,
    )
    if slot is None:
        return currPhrase

    nested = [
        get_distractor_combinations(currPhrase + ' ' + word, similarForWords, slot + 1)
        for word in similarForWords[slot]
    ]

    return currPhrase if nested == [] else nested

In [79]:
similarForWords = [
    ['the'],
    ['koala', 'kangaroo'],
    ['inhabits'],
    ['north'],
    ['and'],
    ['south', 'west', 'east'],
    ['whales', 'sydney']]

currPhrase = 'the koala inhabits north and south whales'

In [80]:
get_distractor_combinations('_', similarForWords, 0)

[[[[[[['_ the koala inhabits north and south whales',
       '_ the koala inhabits north and south sydney'],
      ['_ the koala inhabits north and west whales',
       '_ the koala inhabits north and west sydney'],
      ['_ the koala inhabits north and east whales',
       '_ the koala inhabits north and east sydney']]]]],
  [[[[['_ the kangaroo inhabits north and south whales',
       '_ the kangaroo inhabits north and south sydney'],
      ['_ the kangaroo inhabits north and west whales',
       '_ the kangaroo inhabits north and west sydney'],
      ['_ the kangaroo inhabits north and east whales',
       '_ the kangaroo inhabits north and east sydney']]]]]]]

#### Flattening nested arrays

In [81]:
# https://stackoverflow.com/questions/5286541/how-can-i-flatten-lists-without-splitting-strings

def flatten(phrases):
    """Lazily yield the leaves of an arbitrarily nested iterable.

    Strings are treated as leaves and are never split into characters.
    """
    for item in phrases:
        if isinstance(item, str) or not hasattr(item, '__iter__'):
            yield item
        else:
            yield from flatten(item)

In [82]:
x = get_distractor_combinations('_', similarForWords, 0)
x

[[[[[[['_ the koala inhabits north and south whales',
       '_ the koala inhabits north and south sydney'],
      ['_ the koala inhabits north and west whales',
       '_ the koala inhabits north and west sydney'],
      ['_ the koala inhabits north and east whales',
       '_ the koala inhabits north and east sydney']]]]],
  [[[[['_ the kangaroo inhabits north and south whales',
       '_ the kangaroo inhabits north and south sydney'],
      ['_ the kangaroo inhabits north and west whales',
       '_ the kangaroo inhabits north and west sydney'],
      ['_ the kangaroo inhabits north and east whales',
       '_ the kangaroo inhabits north and east sydney']]]]]]]

In [83]:
list(flatten(x))

['_ the koala inhabits north and south whales',
 '_ the koala inhabits north and south sydney',
 '_ the koala inhabits north and west whales',
 '_ the koala inhabits north and west sydney',
 '_ the koala inhabits north and east whales',
 '_ the koala inhabits north and east sydney',
 '_ the kangaroo inhabits north and south whales',
 '_ the kangaroo inhabits north and south sydney',
 '_ the kangaroo inhabits north and west whales',
 '_ the kangaroo inhabits north and west sydney',
 '_ the kangaroo inhabits north and east whales',
 '_ the kangaroo inhabits north and east sydney']

#### Creating new phrase

In [84]:
# replace word
#TODO add index or start/end of word
## v1
def create_phrase(phrase, replace, replaceWith):
    """Return ``phrase`` with every occurrence of ``replace`` swapped for ``replaceWith``.

    The swap is plain substring replacement: it is not limited to whole words
    or to a single position.
    """
    updated = phrase.replace(replace, replaceWith)
    return updated

In [85]:
create_phrase('koala is a good animal', 'good', 'great')

'koala is a great animal'

#### create phrase v2 

In [86]:
# replace word
#TODO add index or start/end of word
## v2
def create_phrase(phraseList, replaceIndex, replaceWith):
    """Return a copy of ``phraseList`` with one position replaced.

    Args:
        phraseList (list): Words of the phrase; left unmodified.
        replaceIndex (int): Position to overwrite in the copy.
        replaceWith: Word stored at that position.

    Returns:
        list: The modified copy.
    """
    words = phraseList.copy()
    words[replaceIndex] = replaceWith
    return words

In [87]:
create_phrase(['koala', 'is', 'a', 'good', 'animal'], 3, 'great')

['koala', 'is', 'a', 'great', 'animal']

### Custom flattening function

In [88]:
def join_distractor_lists(combinations, newCombinations):
    """Merge the result of a recursive call into ``combinations`` in place.

    A single phrase (a list whose first element is a string) is appended as
    one entry; a list of phrases is appended phrase by phrase. Input for
    which ``any()`` is false is ignored.

    Args:
        combinations (list): Accumulator; it is mutated.
        newCombinations (list): One phrase or a list of phrases.

    Returns:
        list: The same ``combinations`` object.
    """
    if not any(newCombinations):
        return combinations

    if isinstance(newCombinations[0], str):
        # Bottom of the recursion: a single phrase came back.
        combinations.append(newCombinations)
    else:
        # A list of phrases came back; add them one level up.
        combinations.extend(newCombinations)

    return combinations

### v2

In [89]:
def get_distractor_combinations(currPhrase, wordChoices, i = 0):
    """Build every phrase obtainable by substituting words from ``wordChoices``.

    Starting at index ``i``, the first non-empty slot of ``wordChoices`` is
    expanded: each of its words replaces that position of ``currPhrase`` and
    the search continues behind the slot.

    Args:
        currPhrase (list): Words of the phrase built so far.
        wordChoices (list): Per-position lists of replacement words.
        i (int): Index of the first slot still to consider.

    Returns:
        ``currPhrase`` itself when nothing was collected (so a call without
        any choices yields a flat word list), otherwise a list of phrases.
    """
    # First slot at or after i that offers any choice.
    slot = next(
        (idx for idx in range(i, len(wordChoices)) if wordChoices[idx] != []),
        None,
    )
    if slot is None:
        return currPhrase

    #TODO add correct answer in the mix as well
    collected = []
    for candidate in wordChoices[slot]:
        variant = create_phrase(currPhrase, slot, candidate)
        tail = get_distractor_combinations(variant, wordChoices, slot + 1)
        collected = join_distractor_lists(collected, tail)

    return currPhrase if collected == [] else collected

In [90]:
similarForWords = [[],
 ['kangaroo', 'whombat'],
 [],
 [],
 [],
 ['west', 'north'],
 ['whales', 'europe']]
currPhrase = ['the', 'koala', 'inhabits', 'north', 'and', 'south', 'australia']

In [91]:
x = get_distractor_combinations(currPhrase, similarForWords)
x

[['the', 'kangaroo', 'inhabits', 'north', 'and', 'west', 'whales'],
 ['the', 'kangaroo', 'inhabits', 'north', 'and', 'west', 'europe'],
 ['the', 'kangaroo', 'inhabits', 'north', 'and', 'north', 'whales'],
 ['the', 'kangaroo', 'inhabits', 'north', 'and', 'north', 'europe'],
 ['the', 'whombat', 'inhabits', 'north', 'and', 'west', 'whales'],
 ['the', 'whombat', 'inhabits', 'north', 'and', 'west', 'europe'],
 ['the', 'whombat', 'inhabits', 'north', 'and', 'north', 'whales'],
 ['the', 'whombat', 'inhabits', 'north', 'and', 'north', 'europe']]

##### BUG - No choices, returns [correct], rather than [[correct]]

In [93]:
similarForWords = [[],
 [],
 [],
 [],
 [],
 []]
currPhrase = ['Phascolarctidae', 'and', 'its', 'closest', 'living', 'relatives']

In [94]:
x = get_distractor_combinations(currPhrase, similarForWords)
x

['Phascolarctidae', 'and', 'its', 'closest', 'living', 'relatives']

In [95]:
def get_distractor_combinations(currPhrase, wordChoices, i = 0):
    """Build every phrase obtainable by substituting words from ``wordChoices``.

    Args:
        currPhrase (list): Words of the phrase built so far.
        wordChoices (list): Per-position lists of replacement words.
        i (int): Index of the first slot still to consider.

    Returns:
        ``[currPhrase]`` when no slot of ``wordChoices`` is truthy;
        ``currPhrase`` itself when no non-empty slot remains at or after
        ``i`` or nothing was collected; otherwise a list of phrases.
    """
    # Without any choices at all, still hand back a list of phrases.
    if not any(wordChoices):
        return [currPhrase]

    # Find the first slot at or after i that offers any choice.
    firstOpen = None
    for position in range(i, len(wordChoices)):
        if wordChoices[position] != []:
            firstOpen = position
            break

    if firstOpen is None:
        return currPhrase

    #TODO add correct answer in the mix as well
    gathered = []
    for option in wordChoices[firstOpen]:
        variant = create_phrase(currPhrase, firstOpen, option)
        deeper = get_distractor_combinations(variant, wordChoices, firstOpen + 1)
        gathered = join_distractor_lists(gathered, deeper)

    return gathered if gathered != [] else currPhrase

In [96]:
similarForWords = [[],
 [],
 [],
 [],
 [],
 []]

any(similarForWords)

False

In [97]:
[currPhrase]

[['Phascolarctidae', 'and', 'its', 'closest', 'living', 'relatives']]

### Main function

In [98]:
def generate_distractors_phrase(phrase, count = 4):
    """Work-in-progress pipeline: tag, expand and mix the words of ``phrase``.

    The mixed phrases are sampled but not returned yet; the function still
    returns the per-token importance flags. ``count`` is not used.

    Args:
        phrase (str): Phrase to generate alternatives for.
        count (int): Unused in this draft.

    Returns:
        list: One 0/1 importance flag per token.
    """
    import random

    doc = nlp(phrase)

    #Check type of phrase
    if any(token.like_num for token in doc):
        wordImportance = get_numerical_words(phrase)
    else:
        wordImportance = get_most_important_words(phrase, 2)

    #TODO: Decide upon amount of similars per word
    similarWords = [
        generate_distractors(token.text, 3) if wordImportance[idx] == 1 else []
        for idx, token in enumerate(doc)
    ]

    # Offer the original word as an option wherever alternatives were found.
    for token, options in zip(doc, similarWords):
        if any(options):
            options.append(token.text)

    # Mix
    phraseList = [token.text for token in doc]
    distractors = get_distractor_combinations(phraseList, similarWords)

    #TODO pick best distractors - remove worst(duplicates or some..)
    # NOTE(review): the sample is drawn but never returned.
    bestDistractors = random.sample(distractors, 4)

    #TODO format distractors according to case
    return wordImportance

In [99]:
x = generate_distractors_phrase('from six to seven months')
x

[0, 1, 0, 1, 0]

In [100]:
import random

def filter_distractors(distractors, doc, count):
    """Pick a random subset of the generated distractors.

    Args:
        distractors (list): Candidate distractors.
        doc: Parsed original phrase; not used yet.
        count (int): Number of distractors wanted.

    Returns:
        list: ``count`` randomly chosen candidates, or all of them (in random
        order) when fewer than ``count`` are available.
    """
    #TODO remove bad stuff and correct answer

    # Honour the requested amount and never ask for more than exists, which
    # would make random.sample raise ValueError.
    sampleSize = max(0, min(count, len(distractors)))

    return random.sample(distractors, sampleSize)

In [101]:
def format_distractor(wordsList, doc):
    """Join ``wordsList`` into a string that mirrors the layout of ``doc``.

    Each word is title-cased when the token at the same position in ``doc``
    is title-cased. Words are separated by whatever whitespace followed the
    corresponding token in the original text, so "Paris, France" keeps the
    space after the comma and "six-seven" stays unspaced.

    Args:
        wordsList (list): Replacement words, one per token of ``doc``.
        doc: Parsed original phrase; tokens must expose ``text`` and
            ``whitespace_``.

    Returns:
        str: The formatted distractor; an empty string for an empty ``doc``.
    """
    if len(doc) == 0:
        return ''

    pieces = []
    for i, token in enumerate(doc):
        if i > 0:
            # Reuse the separator that followed the previous original token.
            pieces.append(doc[i - 1].whitespace_)
        word = wordsList[i]
        pieces.append(word.title() if token.text.istitle() else word)

    return ''.join(pieces)

In [102]:
wordsList = ['from', 'five', '-', 'eight', 'months']
doc = nlp('From four-Seven months')

In [103]:
format_distractor(wordsList, doc)

'From five-Eight months'

## Final 

In [104]:
def generate_distractors_phrase(phrase, count = 4):
    """Generate formatted distractor phrases for ``phrase``.

    Important tokens (number-like ones if the phrase has any, otherwise the
    result of ``get_most_important_words``) are expanded with similar words,
    all substitutions are mixed, a subset is picked by ``filter_distractors``
    and each pick is formatted to follow the original phrase.

    Args:
        phrase (str): The correct answer phrase.
        count (int): Amount of distractors passed on to ``filter_distractors``.

    Returns:
        list: Formatted distractor strings.
    """
    doc = nlp(phrase)
    tokenTexts = [token.text for token in doc]

    #Check type of phrase
    hasNumber = any(token.like_num for token in doc)
    wordImportance = (
        get_numerical_words(phrase) if hasNumber
        else get_most_important_words(phrase, 1)
    )

    similarWords = []
    for idx, text in enumerate(tokenTexts):
        #TODO: Decide upon amount of similars per word
        options = generate_distractors(text, 3) if wordImportance[idx] == 1 else []

        # Offer the original word as an option wherever alternatives were found.
        if any(options):
            options.append(text)

        similarWords.append(options)

    # Mix
    distractors = get_distractor_combinations(tokenTexts, similarWords)

    #TODO pick best distractors - remove worst(duplicates or some..)
    chosen = filter_distractors(distractors, doc, count)

    #format distractors according to case
    return [format_distractor(candidate, doc) for candidate in chosen]

In [105]:
generate_distractors_phrase('In the first six-seven months.')

['In the first five-seven months.',
 'In the first eight-five months.',
 'In the first eight-eight months.',
 'In the first six-eight months.']

In [106]:
generate_distractors_phrase('Botanist Robert Green.')

['Zoologist James Green.',
 'Botanist James Green.',
 'Ornithologist J. Purple.',
 'Botanist Robert Purple.']