In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
def loadRanks(trecks):
    filename = 'Data/ranks' + str(trecks) + '.txt'
    return pd.read_csv(filename, delimiter=' ', names=['QueryId', 'smth', 'document', 'tag'])

def loadMetadata(filename):
    lines = map(lambda x: x.strip().split(' '), open(filename, 'r').readlines())
    return dict(map(lambda x: (x[0], float(x[1])), lines))

def isMember(arr, to_remain):
    result, indexes = [], []
    for i, elem in enumerate(arr):
        result.append(elem in to_remain)
        if result[-1]:
            indexes.append(np.where(to_remain == elem)[0][0])
            
    return np.array(result), np.array(indexes)

def loadCode(ranks, qId, qIndex, trecks):
    metadata = loadMetadata('Data/Data/metadata' + str(trecks) + '.txt')
    
    doc_id_string = pd.read_csv('Data/Data{0}/doc_id_strings{1}.txt'.format(trecks, qIndex),
                               delimiter=' ',
                               names=['Name', 'Id'])
    
    term_doc_vars = pd.read_csv('Data/Data{0}/termvars{1}.txt'.format(trecks, qIndex),
                               delimiter=' ',
                               names=['Q1', 'Q2', 'Q3'])
    
    lengthForTerms = np.array(list(map(lambda x: int(x.strip()),
                         open('Data/Data{0}/lengthForTerms{1}.txt'.format(trecks, qIndex), 'r').readlines())))
    
    q_ranks = ranks[ranks['QueryId'] == qId]
    docs_to_retrive = q_ranks['document'].values 
    
    inds_to_retrive, _ = isMember(doc_id_string['Name'].values, docs_to_retrive)
    
    doc_id_retrive = doc_id_string[inds_to_retrive]
    ids_to_retrive = doc_id_retrive['Id'].values
    
    start = 0
    vecDelimCopy = np.ones(lengthForTerms.shape)
    matTermDocVarsCopy = None

    for i in np.arange(lengthForTerms.shape[0]):
        
        slice_to_work_with = term_doc_vars[start:start+lengthForTerms[i]]
        vec_from_slice = slice_to_work_with[slice_to_work_with.columns[0]].values
        
        inds_to_remain, _ = isMember(vec_from_slice, ids_to_retrive)
        slice_to_work_with = slice_to_work_with.iloc[inds_to_remain]
        
        if matTermDocVarsCopy is None:
            matTermDocVarsCopy = slice_to_work_with
        else:
            matTermDocVarsCopy = pd.concat([matTermDocVarsCopy, slice_to_work_with])
        vecDelimCopy[i] = len(inds_to_remain)        
        start += lengthForTerms[i]
    
    matTermDocVars = matTermDocVarsCopy
    vecDelim = vecDelimCopy.T

    return (metadata, doc_id_retrive, q_ranks, matTermDocVars, vecDelim)
    
def rewriteRanks(vecNamesDocRanks, matDocIdStr):
    vecNamesIdStr = matDocIdStr.iloc[:, 0].values
    vecIdsIdStr = matDocIdStr.iloc[:, 1].values
    
    _, indInNamesIdStr = isMember(vecNamesDocRanks, vecNamesIdStr)

    idsOfDocRanks = vecIdsIdStr[indInNamesIdStr]

    return np.sort(idsOfDocRanks)

def loadData(trecks):
    ranks = loadRanks(trecks)
    
    queries = pd.read_csv('Data/Data/queries' + str(trecks) + '.txt', 
                          delimiter='#',
                          names=['Id', 'Query'])

    modelCharacteristics = [None for i in range(queries.values.shape[0])]
    mat_doc_ranks = [[None, None, None, None, None] for i in range(queries.values.shape[0])]

    for i, query_id in enumerate(queries['Id'].values):

        print("Query : ", query_id)

        matMetaData, matDocIdStr, matDocRanks, matTermDocVars, vecDelim = loadCode(ranks, query_id, i + 1, trecks)
        
        AvDocLen = matMetaData['AverageDocumentLength']
        NumbDocs = matMetaData['NumberOfDocuments']

        ulabel, uindex = np.unique(matTermDocVars[matTermDocVars.columns[0]], return_inverse=True)
        modelCharacteristics[i] = [ulabel, uindex]
        
        xvars = matTermDocVars.iloc[:,1].values * np.log10((AvDocLen + matTermDocVars.iloc[:, 2].values) / 
                                                           matTermDocVars.iloc[:, 2].values)
        yvars = vecDelim / NumbDocs
        
        yvarsExt_ed = np.repeat(yvars, vecDelim.astype(np.int32))
        modelCharacteristics[i].append([xvars, yvarsExt_ed])
        
        for j in range(4):
            mat_doc_ranks[i][j] = matDocRanks.iloc[:, j]
        
        vecDocNamesEv  = matDocRanks.iloc[:, 2].values
        vecDocRanksEv  = matDocRanks.iloc[:, 3].values
        mat_doc_ranks[i][2]  = vecDocNamesEv[vecDocRanksEv == 1]
        mat_doc_ranks[i][4] = rewriteRanks(mat_doc_ranks[i][2], matDocIdStr)
    stop = True
    return (mat_doc_ranks, queries, modelCharacteristics, )


In [10]:
doc_ranks, queries, query_characteristics = loadData(6)

FileNotFoundError: File b'Data/Data/queries6.txt' does not exist

In [4]:
from enum import Enum
import numpy as np

class Domain:
    def __init__(self, low, high):
        self.low = min(low, high)
        self.high = max(low, high)
    
    def contain(self, nDomain):
        return self.low <= nDomain.low and nDomain.high <= self.high

class DOMAINS:
    REAL = Domain(-np.inf, np.inf)
    POSITIVE = Domain(0, np.inf)
    NEGATIVE = Domain(-np.inf, 0)
    TWICE_REAL = (REAL, REAL)


class DomainException(Exception):
    pass

class Primitive:

    def __init__(self, func, valency, domain, codomain, string):
        self.func = func
        self.valency = valency
        self.nodes = []
        self.domain = domain
        self.codomain = codomain
        self.str = string

    def add_nodes(self, nodes):
        self.nodes = nodes
        return self

    def calc(self, x, y):
        if self.valency == 0:
            return self.func(x, y)
        return self.func(*[node.calc(x, y) for node in self.nodes])

    def calc_domains(self):
        nodes_domains = [node.calc_domains() for node in self.nodes]

        if self.valency == 0:
            pass
        elif self.valency == 1:
            if not self.domain.contain(nodes_domains[0]):
                raise DomainException()
            self.domain = nodes_domains[0]
            self.codomain = Domain(self.func(self.domain.low), self.func(self.domain.high))

        elif self.valency == 2:
            
            # TODO избавиться от switch
            fr, sc = nodes_domains

            if self.func == np.add:
                self.codomain = Domain(fr.low + sc.low, fr.high + sc.high)
            elif self.func == np.subtract:
                self.codomain = Domain(fr.low - sc.high, fr.high - sc.low)
            elif self.func == np.multiply:
                vars = np.array([fr.low * sc.low, fr.low * sc.high, fr.high * sc.low, fr.high * sc.high])
                vars = vars[vars == vars] # TODO bug
                self.codomain = Domain(np.min(vars), np.max(vars))

            elif self.func == np.divide:
                
                # TODO сделать честное вычисление

                vars = np.array([fr.low * sc.low, fr.low * sc.high, fr.high * sc.low, fr.high * sc.high])
                vars = vars[vars == vars] # TODO bug
                tmin, tmax = np.min(vars), np.max(vars)
                if tmin < 0 and tmax > 0:
                    self.codomain = DOMAINS.REAL
                elif tmax <= 0:
                    self.codomain = DOMAINS.NEGATIVE
                elif tmin >= 0:
                    self.codomain = DOMAINS.POSITIVE
                else:
                    raise Exception('Comparison failed')
            else:
                raise Exception('Undefined function')
        else:
            raise Exception('Undefined valency')

        return self.codomain

    def get_tokens(self):
        if self.valency == 0:
            return 1
        return 1 + np.sum([node.get_tokens() for node in self.nodes])
    
    def get_kth(self, n):
        i = 0
        while self.nodes[i].get_tokens() <= n:
            n -= self.nodes[i].get_tokens()
            i += 1
        if n == 0:
            return self, i
        return self.nodes[i].get_kth(n - 1)
        
    def get_random(self):
        rnd = np.random.randint(0, self.get_tokens() - 1)
        return self.get_kth(rnd)
    
    def __str__(self):
        nodes_names = [str(node) for node in self.nodes]
        if self.valency == 0:
            return self.str
        elif self.valency == 1:
            return self.str + '(' + nodes_names[0] + ')'
        elif self.valency == 2:
            return self.str + '(' + nodes_names[0] + ', ' + nodes_names[1] + ')'

class Primitives:
    TF = Primitive(lambda x, y: x, 0, DOMAINS.POSITIVE, DOMAINS.POSITIVE, 'tf')
    IDF = Primitive(lambda x, y: y, 0, DOMAINS.POSITIVE, DOMAINS.POSITIVE, 'idf')
    ADD = Primitive(np.add, 2, DOMAINS.TWICE_REAL, DOMAINS.TWICE_REAL, 'add')
    SUB = Primitive(np.subtract, 2, DOMAINS.TWICE_REAL, DOMAINS.TWICE_REAL, 'substract')
    MUL = Primitive(np.multiply, 2, DOMAINS.TWICE_REAL, DOMAINS.TWICE_REAL, 'multiply')
    DIV = Primitive(np.divide, 2, DOMAINS.TWICE_REAL, DOMAINS.TWICE_REAL, 'divide')
    LOG = Primitive(np.log, 1, DOMAINS.POSITIVE, DOMAINS.REAL, 'log')
    EXP = Primitive(np.exp, 1, DOMAINS.REAL, DOMAINS.POSITIVE, 'exp')
    SQRT = Primitive(np.sqrt, 1, DOMAINS.POSITIVE, DOMAINS.POSITIVE, 'sqrt')
    
PRIMITIVES = [ 
    Primitives.TF,
    Primitives.IDF,
    Primitives.ADD,
    Primitives.SUB,
    Primitives.MUL,
    Primitives.DIV,
    Primitives.LOG,
    Primitives.EXP,
    Primitives.SQRT,
]

In [5]:
from copy import deepcopy

def create_random_model(depth):
    depth -= np.random.randint(0, 2) # construct not uniform binary trees

    if depth <= 0:
        return deepcopy(PRIMITIVES[np.random.randint(0, 2)]) # select randomly TF or IDF
    else:
        cur_primitive = deepcopy(PRIMITIVES[np.random.randint(0, len(PRIMITIVES))])
        nodes = [create_random_model(depth - 1) for _ in range(cur_primitive.valency)]
        cur_primitive.add_nodes(nodes)
        return cur_primitive


def create_population(size, max_depth):
    population = []
    for _ in range(size):
        model = create_random_model(max_depth)
        try:
            model.calc_domains()
            population.append(model)
        except DomainException as e:
            print(e)
            pass
        
    return population

In [6]:

def evaluate_model(model, query_characteristics):
    values = model.calc(*query_characteristics[2])
    result = np.zeros(query_characteristics[0].shape[0])
    for i, pos in enumerate(query_characteristics[1]):
        result[pos] += values[i]

    return [query_characteristics[0], result]

def evaluate_quality(query_id, doc_related, doc_ranks):
    
    vecDocIdEv = doc_ranks[4]
    indsWhichAppear, _ = isMember(doc_related[0], vecDocIdEv)
    doc_related.append(indsWhichAppear)

    sort_indexes = np.argsort(doc_related[1])[::-1]

    doc_related = np.array(doc_related)
    doc_related = doc_related[:, sort_indexes]

    ranksForRetrievedDocs = doc_related[2]
    cumRanksForRetrievedDocs = np.cumsum(ranksForRetrievedDocs)
    cutOffPrecision = cumRanksForRetrievedDocs / np.arange(1, ranksForRetrievedDocs.shape[0] + 1)
    qualValue = np.sum(cutOffPrecision * ranksForRetrievedDocs) / doc_ranks[2].shape[0]
    
    return qualValue

def get_quality(model, doc_ranks, queries, query_characteristics):

    vec_quality = []
    for i, query_id in enumerate(queries['Id'].values):
        doc_related = evaluate_model(model, query_characteristics[i])
        quality = evaluate_quality(query_id, doc_related, doc_ranks[i])
        vec_quality.append(quality)
    quality = np.sum(vec_quality)

    # TODO add regularization 
    return quality

In [7]:
def learn_population(population, doc_ranks, queries, model_charactericts):   

    qualities = []
    for model in population:
        quality = get_quality(model, doc_ranks, queries, model_charactericts)
        if quality != quality or abs(quality) == np.inf:
            quality = 0
            
        qualities.append(quality)

    return qualities

In [8]:
population = create_population(100, 10)







































In [9]:
learn_population(population, doc_ranks, queries, query_characteristics)

NameError: name 'doc_ranks' is not defined

In [19]:
def mutate_rand_tree(x):
    if x.get_tokens() <= 2:
        return x
    x_ = deepcopy(x)
    n1, id1 = x_.get_random()
    n1.nodes[id1] = create_random_model(3)
    return x_

In [20]:
def crossfit(x, y):
    if x.get_tokens() <= 2 or y.get_tokens() <= 2:
        return x
    x_ = deepcopy(x)
    y_ = deepcopy(y)
    n1, id1 = x_.get_random()
    n2, id2 = y_.get_random()
    n1.nodes[id1], n2.nodes[id2] = n2.nodes[id2], n1.nodes[id1]
    return x_

In [21]:
reit_ = 1e9
last_reit = 1e9
SIMILARITY = 1e-5

for iteration in range(1000):
    new_population = []
    
    sz = len(population)
    
    new_population.append(population[0])
    
    for i in range(sz // 30):
        new_population.append(mutate_rand_tree(population[0]))
        new_population.append(mutate_rand_tree(population[1]))
        new_population.append(mutate_rand_tree(population[2]))
    for i in range(sz // 10):
        new_population.append(create_random_model(4))
    for i in range(sz):
        t1 = population[np.random.randint(0, sz / 2)]
        t2 = population[np.random.randint(0, sz / 2)]
        new_population.append(mutate_rand_tree(crossfit(t1, t2)))
    
    population = np.array(population + new_population)
    values = learn_population(population, doc_ranks, queries, query_characteristics)
    values = -np.array(values)
    
    indexes = np.arange(len(population))
    indexes = sorted(indexes, key=lambda i: values[i])
    
    population = population[indexes]
    values = values[indexes]
    
    ids = []
    for i in range(20):
        for q in range(i + 1, 20):
            if values[i] - values[q] < SIMILARITY:
                ids.append(q)
                
    ids = np.unique(ids)
    population = [population[i] for i in range(100 + len(ids)) if i not in ids]
    
    print(-values[0])

  


12.97688689023271




13.643119583882207




13.699753885846793
13.699753885846793
13.699753885846793
13.699753885846793
13.699753885846793
13.699753885846793
13.699753885846793
13.700005043200493
13.700247822367583
13.700247822367583
13.700941903499787
13.700941903499787
13.700941903499787
13.80013948679012
13.80013948679012
13.80013948679012
13.80013948679012
13.80013948679012


  


13.80013948679012
13.80013948679012
13.80013948679012
13.80013948679012
13.80013948679012
13.80013948679012
13.80013948679012
13.80013948679012
13.80013948679012


KeyboardInterrupt: 

In [23]:
str(population[0])

'exp(divide(tf, exp(sqrt(tf))))'

In [24]:
str(population[1])

'exp(idf)'

In [25]:
learn_population([population[0]], doc_ranks, queries, query_characteristics)

[13.80013948679012]

In [22]:
learn_population(population, doc_ranks, queries, query_characteristics)



[13.80013948679012,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,
 11.67598795595149,


In [15]:
tmodel = deepcopy(Primitives.SQRT).add_nodes([
    deepcopy(Primitives.SQRT).add_nodes([
        deepcopy(Primitives.EXP).add_nodes([
            deepcopy(Primitives.TF)
        ])
    ])
])
print(str(tmodel))
learn_population([tmodel], doc_ranks, queries, query_characteristics)

sqrt(sqrt(exp(tf)))


[10.600929915956014]

In [35]:
tmodel = deepcopy(PRIMITIVES[7]).add_nodes([
    deepcopy(PRIMITIVES[8]).add_nodes([
        deepcopy(PRIMITIVES[6]).add_nodes([
            deepcopy(PRIMITIVES[5]).add_nodes([
                deepcopy(PRIMITIVES[0]),
                deepcopy(PRIMITIVES[1])
            ])
        ])
    ])
])
print(str(tmodel))
learn_population([tmodel], doc_ranks, queries, query_characteristics)

exp(sqrt(log(divide(tf, idf))))




[13.696360763323469]