In [11]:
import synonyms as syn
import json
import operator

In [41]:
class Retrieval:
    """Match a query against document key terms and conversation phrases.

    The dataset is a dict with two entries: ``Document_Key`` maps each
    document id to a sequence of key terms, and ``Conversation_Key`` maps each
    conversation id to one phrase.  Every key term / phrase is compared with
    every segmented query word through ``syn.compare``.
    """
    Document_Key = "d"
    Conversation_Key = "c"
    # Canned replies, looked up by the id of the best matching entry.
    __answer_sheet = {
        'd0': '进水解决方案 ... 买过',
        'd1': 'some stories',
        'c0': '你好',
        'c1': '早上好'
    }
    # Accepted query types: ``str`` plus ``unicode`` on Python 2
    # (``type(u'')`` is simply ``str`` again on Python 3).
    __string_types = (str, type(u''))

    def __init__(self, dataset=None, household_d=0.9, household_c=0.9):
        """Create a retriever, optionally loading ``dataset`` right away.

        ``household_d`` / ``household_c`` are the scores a document /
        conversation must strictly exceed to be reported by ``evalQuery``.

        Raises:
            TypeError: ``dataset`` is given but is not a dict.
            ValueError: ``dataset`` lacks the document or conversation key.
        """
        # Always define both attributes so evalQuery can report a missing
        # dataset with its own ValueError instead of an AttributeError.
        self.docs = None
        self.convs = None
        if dataset is not None:
            if not isinstance(dataset, dict):
                raise TypeError("must be initialized by a object of dict type")
            if self.Document_Key not in dataset or self.Conversation_Key not in dataset:
                raise ValueError("the object must contain both document key and conversation key")
            self.docs = dataset[self.Document_Key]
            self.convs = dataset[self.Conversation_Key]
        self.household_doc = household_d
        self.household_conv = household_c

    def setDataset(self, dataset):
        """Replace the documents and conversations used for matching.

        Raises:
            TypeError: ``dataset`` is not a dict.
            KeyError: the document or conversation key is missing; in that
                case the previously loaded dataset is left untouched.
        """
        if not isinstance(dataset, dict):
            raise TypeError("must be initialized by a object of dict type")
        # Both lookups happen before either attribute is assigned, so a
        # missing key cannot leave docs and convs from different datasets.
        self.docs, self.convs = dataset[self.Document_Key], dataset[self.Conversation_Key]

    def __mean(self, numbers):
        """Return the arithmetic mean of ``numbers`` (0.0 for an empty list)."""
        return float(sum(numbers)) / max(len(numbers), 1)

    def __max_id(self, items):
        """Return the key of ``items`` holding the largest value."""
        return max(items.items(), key=operator.itemgetter(1))[0]

    def evalQuery(self, q):
        """Find the best matching document and conversation for one query.

        A document's score is the mean, over its key terms, of each term's
        best similarity to any query word; a conversation's score is its
        phrase's best similarity to any query word.  The per-word similarities
        are kept in ``self.doc_matches`` / ``self.conv_matches``.

        Returns:
            dict that contains ``Document_Key`` and/or ``Conversation_Key``
            mapped to the best id whose score exceeds the matching threshold;
            empty when nothing qualifies.

        Raises:
            ValueError: no dataset has been loaded yet.
            TypeError: ``q`` is not a string.
        """
        if self.docs is None or self.convs is None:
            raise ValueError("setDataset shall be called first")
        if not isinstance(q, self.__string_types):
            raise TypeError("query shall be type string")
        query_words = syn.seg(q)[0]

        doc_scores = {}
        conv_scores = {}
        self.doc_matches = {}
        self.conv_matches = {}
        for doc_id, keys in self.docs.items():
            matches = [[syn.compare(key, word, seg=False) for word in query_words]
                       for key in keys]
            self.doc_matches[doc_id] = matches
            # A query without words yields empty rows; score those 0.0
            # instead of letting max() fail on an empty list.
            best_per_key = [max(row) if row else 0.0 for row in matches]
            doc_scores[doc_id] = self.__mean(best_per_key)

        for conv_id, conv_s in self.convs.items():
            matches = [syn.compare(conv_s, word, seg=False) for word in query_words]
            self.conv_matches[conv_id] = matches
            conv_scores[conv_id] = max(matches) if matches else 0.0

        result = {}
        # An empty docs/convs mapping simply contributes no match.
        if doc_scores:
            doc_id_best = self.__max_id(doc_scores)
            if doc_scores[doc_id_best] > self.household_doc:
                result[self.Document_Key] = doc_id_best
        if conv_scores:
            conv_id_best = self.__max_id(conv_scores)
            if conv_scores[conv_id_best] > self.household_conv:
                result[self.Conversation_Key] = conv_id_best
        return result

    def evalQuerys(self, qs):
        """Run ``evalQuery`` on one query string or on a list of them.

        Returns a list with one result dict per query.

        Raises:
            TypeError: ``qs`` is neither a string nor a list.
        """
        if isinstance(qs, self.__string_types):
            qs = [qs]
        elif not isinstance(qs, list):
            raise TypeError("the argument shall be a list object constituted by strings or a string")
        return [self.evalQuery(query) for query in qs]

    def __ans(self, qs):
        """Turn the matches for ``qs`` into reply texts.

        Each reply is the conversation answer (if any) followed by either the
        document answer or a fixed apology, joined by a comma.

        Returns:
            ``{'a': [reply, ...]}`` with one reply per query.

        Raises:
            KeyError: a matched id has no entry in the answer sheet.
        """
        ans_ss = []
        # Iterate over the results, not over ``qs``: a bare string query is
        # one query, not one query per character.
        for ans in self.evalQuerys(qs):
            parts = []
            if self.Conversation_Key in ans:
                parts.append(self.__answer_sheet[ans[self.Conversation_Key]])
            if self.Document_Key in ans:
                parts.append(self.__answer_sheet[ans[self.Document_Key]])
            else:
                # NOTE(review): the apology is appended even when a
                # conversation matched -- kept as in the original; confirm
                # this is the intended reply.
                parts.append('There is nothing i can help you, Sorry')
            ans_ss.append(','.join(parts))
        return {'a': ans_ss}

In [43]:
class HttpRequestParser:
    """Incrementally assemble one HTTP request from received fragments.

    Call ``feed`` with each fragment; it returns True once the header section
    and ``Content-Length`` characters of body have arrived.  The body is then
    in ``data`` and the header lines in ``header``.  Call ``reset`` before
    parsing the next request.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Discard all parsing state so a new request can be fed."""
        # First token of each line -> rest of the line.  Header names keep
        # their trailing colon (e.g. 'Content-Length:'); the request line is
        # stored under its method token.
        self.header = {}
        self.data = ''    # body received so far
        self.wait = ''    # incomplete header line carried over to the next feed
        self.got = False  # True once the blank line ending the headers was seen

    def check(self):
        """Return True when the body has exactly the announced length.

        A request without a Content-Length header is treated as having an
        empty body.  The header name is matched case-insensitively.

        Raises:
            ValueError: more body data arrived than was announced, or the
                Content-Length value is not an integer.
        """
        expected = 0
        for name, value in self.header.items():
            if name.lower() == 'content-length:':
                expected = int(value)
                break
        received = len(self.data)
        if received > expected:
            raise ValueError('unexpected data')
        return received == expected

    def _store_line(self, line):
        """Record one non-empty line from the header section in ``header``."""
        name, _, value = line.partition(' ')
        colon = name.find(':')
        if 0 <= colon < len(name) - 1:
            # "Name:value" sent without a space after the colon: split at the
            # colon instead so the value is not swallowed into the name.
            value = name[colon + 1:] + (' ' + value if value else '')
            name = name[:colon + 1]
        # Keep the whole value; header values may themselves contain spaces.
        self.header[name] = value.strip()

    def feed(self, data):
        """Consume one fragment; return True when the request is complete.

        Raises:
            ValueError: see ``check``.
        """
        if self.got:
            self.data += data
            return self.check()

        data = self.wait + data
        while True:
            pos = data.find('\r\n')
            if pos < 0:
                # No complete line yet; keep the remainder for the next call.
                self.wait = data
                return False
            if pos == 0:
                # Blank line: headers are done, everything after is body.
                self.got = True
                self.wait = ''
                self.data = data[2:]
                return self.check()
            self._store_line(data[:pos])
            data = data[pos + 2:]

In [3]:
import socket
import sys

# An empty host binds the listening socket to every local interface.
HOST = ''
PORT = 44442

s = socket.socket(
    socket.AF_INET, socket.SOCK_STREAM
)
# Let the cell be re-run right after a kernel restart: without this option the
# bind can fail with "address already in use" while old connections linger.
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

try:
    s.bind((HOST, PORT))
except socket.error as msg:
    # Release the descriptor before giving up; read errno/strerror as
    # attributes because indexing the exception only works on Python 2.
    s.close()
    print('Bind failed. Error Code: ' + str(msg.errno) + ' Message ' + str(msg.strerror))
    sys.exit()
# Queue at most 5 pending connections.
s.listen(5)
print('Socket listening up to 5')

Socket listening up to 5


In [49]:
def _http_response(status, body):
    """Build a complete HTTP/1.1 response string with a Content-Length header."""
    return ('HTTP/1.1 ' + status + '\r\nContent-Length: ' + str(len(body))
            + '\r\nConnection: close\r\n' + '\r\n' + body)


parser = HttpRequestParser()
ret = Retrieval()
# Serve one request per connection until the kernel is interrupted.
while 1:
    conn, addr = s.accept()
    try:
        complete = False
        while not complete:
            print('Collecting fragment')
            frag = conn.recv(4096)
            if not frag:
                # recv returns an empty string once the peer has closed; the
                # request can never complete, so drop the connection instead
                # of spinning forever.
                break
            complete = parser.feed(frag)
        if complete:
            print('Connected with ' + addr[0] + ':' + str(addr[1]))
            # The request body is a JSON object that carries the dataset
            # (document / conversation entries) together with the query 'q'.
            dataset = json.loads(parser.data)
            query = dataset['q']
            ret.setDataset(dataset)
            res_data = json.dumps(ret._Retrieval__ans(query))
            conn.sendall(_http_response('200 OK', res_data))
    except (ValueError, KeyError, TypeError) as err:
        # Malformed request (bad framing, invalid JSON, missing or mistyped
        # fields): answer with 400 rather than letting one client stop the
        # whole server.
        print('Bad request from ' + addr[0] + ': ' + repr(err))
        conn.sendall(_http_response('400 Bad Request', json.dumps({'error': str(err)})))
    finally:
        # Always leave the shared parser clean and release the socket, even
        # when handling failed half-way.
        parser.reset()
        conn.close()


KeyboardInterrupt: 