# Using the gqa code

Tests and example code on how to use the code from the [gqa repo](https://github.com/dorarad/gqa).

The use cases are:
* create questions
* downsample the questions

In [30]:
from collections import Counter, defaultdict
import random

## Load VG data


A few files need to be loaded before `createQuestions()` can run: x2imageFiles, topFiles, vocabFiles, ..., trainkeys.txt, vg_spatial_imgsInfo.json, ...

I extended `download.sh` to also download all the vg data from version 1.4

## Create questions

With `args.create` the question generation code is executed: `gen()` -> `genQuestionRep()` -> `generateQuestion()`

## Downsample the questions

With `args.normalize` the subsampling is executed: `select()`, `unbias()`, `toRatio()`, 2x `downsampleQuestions()`, 2x `typesampleQuestions()`, remove the same questions. The downsampling is always done on the image(!!) level!

* coin: `random.random() < prob`
* select: takes the `goodIDs` for each image and increases an `outCounter` (question["group"]) and a `pretypeCounter` (question["type"]) for each qID
* unbias: takes the `ratios`
  * ratios: from `unbiasRatios()`:
    * if there is only one answer, it gets removed
    * boolean questions: get a weight of `ratios["boolean"][cond]["yes"] = min(1, (pn / py) )` if e.g. yes is more predominant
    * open questions: each answer gets weighted by smoothed / count -> infrequent answers get a score close to 1, others less than 1
    * unclear with which values it is smoothed
    * differenziation per `cond`! 
* toRatio: `ratio = tnum / fnum` which are of itself sums of counters
  * unclear what `questionSubtypes` does
  * returns this ratio as probability if > 1, else it's inverse
* downsampleQuestions: accept if `(question["group"] not in gProb) or coin(gProb[question["group"]])`, i.e. outProb = gProb of `["categoryRelS", "categoryRelO"]` etc.
  * why is it on the group level and not in the answers?
* typesampleQuestions: accept if `(question["type"] not in typeSamples) or coin(typeSamples[question["type"]])`, i.e. accept all which are not 'verify', 'choose', 'logical' and those with the specified probability
  * what???

In [27]:
# understanding unbiasRatios():

opencounter = defaultdict(lambda: defaultdict(int))
booleancounter = defaultdict(lambda: defaultdict(int))

quest_inst = [("10c-snowboards_vposition", "yes"), ("10c-snowboards_vposition", "yes"), ("10c-snowboards_vposition", "yes"), ("10c-snowboards_vposition", "yes"),
              ("10c-snowboards_vposition", "no"),
              ("14-basket_contain,o", "red"), ("14-basket_contain,o", "red"), ("14-basket_contain,o", "red"), ("14-basket_contain,o", "white"), ("14-basket_contain,o", "white"),
              ("14-basket_contain,o", "green"), ("14-basket_contain,o", "green"), ("14-basket_contain,o", "green"), ("14-basket_contain,o", "green"), ("14-basket_contain,o", "green"),]
for cond, ans in quest_inst:
    if ans in ["yes", "no"]:
        booleancounter[cond][ans] += 1
    else:
        opencounter[cond][ans] += 1

counters = {'open':opencounter,
            'boolean':booleancounter}
ratios = {"open": defaultdict(dict), "boolean": defaultdict(dict)}

smallThr = 1.
def uniformSmoother(thr):
    def usmoother(counts):
        s = sum(counts)
        # ps = [float(sum(counts[:(i+1)])) for i in range(len(counts))]
        k = 1 
        for ck in range(1,len(counts))[::-1]:
            if ((ck + 1) * counts[ck]) / s > thr:
                k = ck
                break
        newCounts = [(counts[k] if i <= k else 0) for i in range(len(counts))]
        return newCounts
    return usmoother

lsmoother = [2, 1.25, 0.03, 1.4]
lsmoother_final = [1, 1.05, 0.02, 1.38]
def customSmoother(b, gamma, gup, maxGamma): # gamma = 1.3, gup = 0.05  nts, b = 2, gamma = 1.3)
    def csmoother(counts):
    # gamma = 1.2 #1.3
        s = sum(counts)
        probs = [c/s for c in counts] 
        for i in range(len(counts)):
            # print(counts)
            if i == 0:
                continue
            s = sum(counts)
            tail = sum([c for c in counts[i:]])
            head = s - tail
            p = (min(0.1*(i+b),0.85))
            newHead = (p * tail / (1-p)) if i > 1 else 0 # (i-1) * tail # (1 - 1/i) * s
            # print((min(0.1*(i+1),0.85)), newHead, tail)
            if (sum(probs[i:]) > 0.099 or i == 1) and (head > newHead): # tail / s > 0.1
                newGamma = min(gamma + (i-1) * gup, maxGamma) # 1.38 1.5
                newProbs = [(probs[j]/sum(probs[:i])) for j in range(i)]
                for j in range(i-1)[::-1]:
                    newProbs[j] = min(newProbs[j], newGamma * newProbs[j+1])
                n = sum(newProbs)
                newProbs = [p/n for p in newProbs] 
                agamma = (1.1 if i > 1 else gamma)
                if newProbs[i-1] * newHead < agamma * counts[i]: # gamma * 
                    newHead = (agamma * counts[i]) / newProbs[i-1]
                    # print(newHead)
                for j in range(i): #[::-1]:
                    counts[j] = min(newProbs[j] * newHead, counts[j]) # max((probs[j]/sum(probs[:i])) * newHead, newGamma * counts[j+1])
        return counts
    return csmoother
smtr_uni = uniformSmoother(smallThr) 
smtr_cust = customSmoother(b = lsmoother_final[0], gamma = lsmoother_final[1], gup = lsmoother_final[2], maxGamma = lsmoother_final[3])
counters

{'open': defaultdict(<function __main__.<lambda>()>,
             {'14-basket_contain,o': defaultdict(int,
                          {'red': 3, 'white': 2, 'green': 5})}),
 'boolean': defaultdict(<function __main__.<lambda>()>,
             {'10c-snowboards_vposition': defaultdict(int,
                          {'yes': 4, 'no': 1})})}

In [29]:
smoother = smtr_cust

for cond in counters["open"]:
    ansDist = counters["open"][cond]
    answers = list(ansDist.keys()) 
    if len(answers) == 1: # and cond not in ["age", ]
        print("removedOpen", cond)            
        ratios["open"][cond][answers[0]] = min(1, float(1) / ansDist[answers[0]]) # 0.0
    else:
        sansDist = list(ansDist.items())
        sansDist = sorted(sansDist, key = lambda x: x[1], reverse = True)
        counts = [float(c) for _,c in sansDist]
        newCounts = smoother(counts)
        for i,c in enumerate(newCounts):
            print(f"{cond} at {sansDist[i][0]}: {c} / {sansDist[i][1]} = {float(c) / sansDist[i][1]}")
            ratios["open"][cond][sansDist[i][0]] = float(c) / sansDist[i][1]

ratios

14-basket_contain,o at green: 2.354 / 5 = 0.4708
14-basket_contain,o at red: 2.2 / 3 = 0.7333333333333334
14-basket_contain,o at white: 2.0 / 2 = 1.0


{'open': defaultdict(dict,
             {'14-basket_contain,o': {'green': 0.4708,
               'red': 0.7333333333333334,
               'white': 1.0}}),
 'boolean': defaultdict(dict, {})}

In [32]:
# understanding typesampleQuestions():
def coin(prob):
    if prob == 1:
        return True
    if prob == 0:
        return False
    return random.random() < prob

typeSamples = {
    "verify": 0.75,   #newAll * 0.17, #0.75, 
    "choose": 0.865,  #newAll * , #0.865, 
    "logical": 0.95,  #newAll, #0.95
}

questions = [{"type": 'verify'}, {"type": 'choose'}, {"type": 'logical'}, {'type': 'verify'}, {"type": 'verify'}, {"type": 'choose'},
             {"type": 'compare'}, {"type": 'compare'}, {"type": 'compare'}, {'type': 'query'}, {"type": 'query'}, {"type": 'query'},]

for question in questions:
    coin_throw = coin(typeSamples[question["type"]])
    if (question["type"] not in typeSamples) or coin_throw:
        print("added", question["type"], coin_throw)
    else:
        print("not added", question["type"], coin_throw)

added verify True
added choose True
added logical True
added verify True
added verify True
added choose True


KeyError: 'compare'