In [1]:
navigator = """
recom = 连词 方向 动词 距离 单位
方向 = 东 | 南 | 西 | 北  
距离 = 100 | 500 | 2 | 1000
单位 = 米 | 公里 
动词 = 跑 | 走 | 跳 
连词 = 朝 | 向 | 往
"""

daily = """
activity = 人物 连词 地点 活动
人物 = 我 | 你 | 我们 | 她 | 他们
连词 = 去 | 在 
地点 = 超市 | 咖啡馆 | 购物中心 | 教室 | 医院
活动 = 买东西 | 看病 | 看书 | 跳舞 | 聊天
"""

import random
import pandas as pd
import jieba
from collections import Counter
import re

def create_grammar(grammar_str, stmt_split,  or_split):
    rules={}
    for line in grammar_str.split('\n'):
        if not line: continue
          # skip the empty line
        stmt,expr = line.split(stmt_split)
        rules[stmt.strip()] = [s.strip() for s in expr.split(or_split)]
    return rules


def generate(grammar_rule, target):    
    if target in grammar_rule: # names 
        candidates = grammar_rule[target]  # ['name names', 'name']
        candidate = random.choice(candidates) #'name names', 'name'
        return ''.join(generate(grammar_rule, target=c) for c in candidate.split() if c!='null')
    else:
        return target

def generate_n(gram_name,stmt_split,or_split,target_name,size):
    sents=[]
    sen=""
    for i in range(size):
        sen=generate(create_grammar(gram_name,stmt_split,or_split), target_name)
        sents.append(sen)
    return(sents)


def token(string):
     return re.findall('\w+', string)
    
def cut(string):
    return list(jieba.cut(string))


def prob_2(word1, word2):
    if word1 + word2 in words_count_2: 
        return words_count_2[word1+word2] / len(TOKEN_2_GRAM)
    else:
        return 1 / len(TOKEN_2_GRAM)



def get_probability(sentence):
    words=cut(sentence)
    sentence_pro=1
    for i, word in enumerate(words[:-1]):
        next_=words[i+1]
        probability= prob_2(word,next_)
        sentence_pro*=probability
    return sentence_pro


def generate_best(sents):
    sent_lib={}
    for sen in sents:
        sent_lib[sen]= get_probability(sen)
   
    result=sorted(sent_lib,key=lambda x: sent_lib[x],reverse=True)
    print('{} is the best one with probability {}'.format(result[0],sent_lib[result[0]]))
    print('{} is the worst one with probability {}'.format(result[-1],sent_lib[result[-1]]))  
 
    return(result[0],sent_lib[result[0]],result[-1],sent_lib[result[-1]])


def clean_file(train_file,clean_file):
    content = pd.read_csv(train_file, encoding='UTF-8')
    articles = content['comment'].tolist()
    articles_clean = [''.join(token(str(a)))for a in articles]
    with open(clean_file, 'w',encoding='utf-8') as f:
        for a in articles_clean:
            f.write(a + '\n')
            f.flush()
    return()

if __name__=='__main__':
    original_file='c:\\Users\\zengluci\\jupyters_and_slides\\2019-autumn\\deliverable\\movie_comments.csv'
    target_file='c:\\Users\\zengluci\\jupyters_and_slides\\2019-autumn\\deliverable\\article_clean.csv'
    clean_file(original_file,target_file)
    FILE = open(target_file,'r',encoding='UTF-8').read()
    max_length=1000
    sub_file = FILE[:max_length]
    TOKENS = cut(sub_file)
    TOKEN_2_GRAM = [''.join(TOKENS[i:i+2]) for i in range(len(TOKENS[:-2]))]
    words_count_2 = Counter(TOKEN_2_GRAM)
    generate_best(generate_n(gram_name=daily,stmt_split='=', or_split='|', target_name='activity',size=10))
    generate_best(generate_n(gram_name=navigator,stmt_split='=', or_split='|', target_name='recom',size=10))

  if (await self.run_code(code, result,  async_=asy)):
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zengluci\AppData\Local\Temp\jieba.cache
Loading model cost 1.235 seconds.
Prefix dict has been built succesfully.


你去咖啡馆看书 is the best one with probability 5.232780885631001e-09
你在教室买东西 is the worst one with probability 9.08468903755382e-12
向东跳1000米 is the best one with probability 5.232780885631001e-09
向南跳500米 is the worst one with probability 9.08468903755382e-12
