# 1. Rule Based: Syntax Tree

In [2]:
import random

In [3]:
# Toy grammar, one rule per line in the form `lhs => alt | alt ...`.
# Symbols inside an alternative are separated by whitespace (see create_grammar).
# `Adj*` is an ordinary symbol name; the literal `null` marks an empty
# expansion and is dropped from the output by generate().
simple_grammar = """
sentence => noun_phrase verb_phrase
noun_phrase => Article Adj* noun
Adj* => null | Adj Adj*
verb_phrase => verb noun_phrase
Article =>  一个 | 这个
noun =>   女人 |  篮球 | 桌子 | 小猫
verb => 看着   |  坐在 |  听着 | 看见
Adj =>  蓝色的 | 好看的 | 小小的
"""

In [4]:
def create_grammar(grammar_str, split='=>', line_split='\n'):
    """Parse a textual grammar into a dict of production rules.

    Args:
        grammar_str: Rules separated by ``line_split``; blank or
            whitespace-only lines are ignored.
        split: Separator between a rule's left-hand side and its alternatives.
        line_split: Separator between rules.

    Returns:
        A dict mapping each stripped left-hand side to a list of alternatives,
        where every alternative is the list of whitespace-separated symbols
        found between ``|`` characters.

    Raises:
        ValueError: If a non-blank line does not split into exactly two parts
            around ``split``.
    """
    rules = {}
    non_blank_rows = (row for row in grammar_str.split(line_split) if row.strip())
    for row in non_blank_rows:
        # Unpacking into two names enforces "exactly one separator per rule".
        lhs, rhs = row.split(split)
        alternatives = []
        for alternative in rhs.split('|'):
            alternatives.append(alternative.split())
        rules[lhs.strip()] = alternatives
    return rules

In [5]:
# Parse the toy grammar text into a dict of production rules.
grammar = create_grammar(simple_grammar)

In [6]:
# Inspect the parsed rules: each left-hand side maps to a list of alternatives,
# and each alternative is a list of symbols.
grammar

{'sentence': [['noun_phrase', 'verb_phrase']],
 'noun_phrase': [['Article', 'Adj*', 'noun']],
 'Adj*': [['null'], ['Adj', 'Adj*']],
 'verb_phrase': [['verb', 'noun_phrase']],
 'Article': [['一个'], ['这个']],
 'noun': [['女人'], ['篮球'], ['桌子'], ['小猫']],
 'verb': [['看着'], ['坐在'], ['听着'], ['看见']],
 'Adj': [['蓝色的'], ['好看的'], ['小小的']]}

In [7]:
def generate(gram, target):
    """Randomly expand ``target`` into a string using the rules in ``gram``.

    Args:
        gram: Mapping from a symbol to its list of alternatives, each
            alternative being a list of symbols (the shape produced by
            create_grammar).
        target: Symbol to expand.

    Returns:
        ``target`` itself when it has no rule in ``gram``; otherwise the
        concatenation of the expansions of one randomly chosen alternative,
        with any piece equal to the literal string 'null' left out.
    """
    if target in gram:
        chosen = random.choice(gram[target])
        pieces = []
        for symbol in chosen:
            piece = generate(gram, symbol)
            # 'null' is the grammar's marker for "expands to nothing".
            if piece != 'null':
                pieces.append(piece)
        return ''.join(pieces)
    # No rule for this symbol: it is emitted as-is.
    return target

In [8]:
# Print ten random sentences. The grammar parsed above is reused instead of
# re-parsing simple_grammar on every iteration; generate() only reads it.
for i in range(10):
    print(generate(gram=grammar, target='sentence'))

一个桌子坐在一个篮球
这个女人坐在这个蓝色的好看的篮球
一个桌子看着一个小猫
这个好看的小小的蓝色的小小的桌子坐在这个好看的蓝色的好看的桌子
这个桌子坐在一个小猫
一个小小的蓝色的女人看着这个小猫
这个女人听着这个蓝色的篮球
这个小猫听着一个小小的桌子
这个女人看见一个桌子
这个小猫看着这个女人


# 2. Probability Based: Language Model

In [25]:
import pandas as pd
import re, jieba
from collections import Counter

In [26]:
# Load sqlResult.csv (decoded as GB18030) into a DataFrame; its `content`
# column is the text the language model below is built from.
data = pd.read_csv('sqlResult.csv', encoding='gb18030')

In [27]:
# Preview the first ten rows to see the available columns.
data.head(10)

Unnamed: 0,id,author,source,content,feature,title,url
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm
2,89615,,快科技@http://www.kkj.cn/,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""18""...",一加手机5细节曝光：3300mAh、充半小时用1天,http://www.cnbeta.com/articles/tech/623601.htm
3,89614,,新华社,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n,"{""type"":""国际新闻"",""site"":""环球"",""commentNum"":""0"",""j...",葡森林火灾造成至少62人死亡 政府宣布进入紧急状态（组图）,http://world.huanqiu.com/hot/2017-06/10866126....
4,89613,胡淑丽_MN7479,深圳大件事,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...,"{""type"":""新闻"",""site"":""网易热门"",""commentNum"":""978"",...",44岁女子约网友被拒暴雨中裸奔 交警为其披衣相随,http://news.163.com/17/0618/00/CN617P3Q0001875...
5,89612,张怡,中国证券报?中证网,受到A股被纳入MSCI指数的利好消息刺激，A股市场从周三开始再度上演龙马行情，周四上午金...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",金融股一枝独秀 配置价值犹存,http://www.cs.com.cn/gppd/201706/t20170623_533...
6,89611,,威锋网@http://www.feng.com/,虽然至今夏普智能手机在市场上无法排得上号，已经完全没落，并于 2013 年退出中国市场，但是...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""21""...",配骁龙660 全面屏鼻祖夏普新机酝酿中,http://www.cnbeta.com/articles/tech/623603.htm
7,89610,申玉彬 整理,中国证券报?中证网,沙漠雄鹰：震荡有利消化套牢筹码\r\n 周四开盘上证50在银行券商大蓝筹带动下一度涨近...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",博友早评：震荡有利消化套牢筹码,http://www.cs.com.cn/gppd/201706/t20170623_533...
8,89609,李杭_BJS4645,荆楚网-楚天都市报,（原标题：武汉警方一下子抓了808人，还都是俊男靓女！原来他们每天偷偷摸摸干这事！）\r\n...,"{""type"":""新闻"",""site"":""网易热门"",""commentNum"":""1600""...",武汉千余警察出动 抓获808名俊男靓女全是诈骗犯,http://news.163.com/17/0614/14/CMT9N8G80001899...
9,89608,吴瞬,中国证券报?中证网,6月21日，A股纳入MSCI指数尘埃落定，但当天被寄予厚望的券商股并未扛起反弹大旗。22...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",纳入MSCI指数 A股长期配置价值提升,http://www.cs.com.cn/gppd/201706/t20170623_533...


In [28]:
# Number of rows in the loaded table.
len(data)

89611

In [37]:
# Keep only word characters (\w) from each article body, glued back together.
# - The pattern is a raw string so that \w is not treated as a (invalid)
#   string escape sequence.
# - Missing bodies are replaced with '' first; otherwise str(NaN) would inject
#   the bogus token 'nan' into the corpus.
# NOTE(review): the token frequency table computed from this list contains very
# frequent 'n' / 'nn' tokens, which looks like literal "\n" escapes in the raw
# text surviving this step — confirm against the data before relying on counts.
content_clean = [''.join(re.findall(r'\w+', str(c))) for c in data['content'].fillna('')]

In [38]:
# Tokenise every cleaned article with jieba and flatten the results into one
# corpus-wide token list, keeping article order.
words = [token for article in content_clean for token in jieba.cut(article)]

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\ZP\AppData\Local\Temp\jieba.cache
Loading model cost 0.770 seconds.
Prefix dict has been built successfully.


In [39]:
# Unigram frequency table over the whole corpus, followed by a look at the
# 100 most frequent tokens.
words_count = Counter(words)
words_count.most_common(100)

[('的', 703716),
 ('n', 382020),
 ('在', 263597),
 ('月', 189330),
 ('日', 166300),
 ('新华社', 142462),
 ('和', 134061),
 ('年', 123106),
 ('了', 121938),
 ('是', 100909),
 ('１', 88187),
 ('０', 84945),
 ('外代', 83268),
 ('中', 73926),
 ('中国', 71179),
 ('２', 70521),
 ('2017', 69894),
 ('记者', 62147),
 ('二线', 61998),
 ('将', 61420),
 ('与', 58309),
 ('等', 58162),
 ('为', 57019),
 ('5', 54578),
 ('照片', 52271),
 ('4', 51626),
 ('对', 50317),
 ('上', 47452),
 ('也', 47401),
 ('有', 45767),
 ('５', 40857),
 ('说', 39017),
 ('发展', 37632),
 ('他', 37194),
 ('３', 36906),
 ('以', 36867),
 ('国际', 35842),
 ('nn', 35330),
 ('４', 34659),
 ('比赛', 32232),
 ('６', 30575),
 ('到', 30109),
 ('人', 29572),
 ('从', 29489),
 ('6', 29002),
 ('都', 28027),
 ('不', 27963),
 ('后', 27393),
 ('当日', 27186),
 ('就', 26684),
 ('并', 26568),
 ('国家', 26439),
 ('７', 26386),
 ('企业', 26147),
 ('进行', 25987),
 ('3', 25491),
 ('美国', 25485),
 ('举行', 25389),
 ('被', 25277),
 ('北京', 25245),
 ('体育', 24873),
 ('2', 24376),
 ('1', 24182),
 ('这', 24118),
 ('新', 2

In [41]:
def prob_1(word):
    """Return the relative frequency of ``word`` among all corpus tokens.

    Reads the notebook-level ``words`` list and ``words_count`` counter.
    A word that never occurs yields 0.
    """
    total_tokens = len(words)
    occurrences = words_count[word]
    return occurrences / total_tokens

19982

In [43]:
# Build the bigram list: every token concatenated with its successor.
# range(len(words) - 1) visits each adjacent pair without slicing or copying
# the (very large) token list; an empty or single-token corpus yields [].
# NOTE(review): keys are plain concatenations, so different splits of the same
# characters (e.g. 'ab' + 'c' and 'a' + 'bc') share one key. prob_2 looks keys
# up the same way, so changing the key format has to be done in both places.
words_2_gram = [words[i] + words[i + 1] for i in range(len(words) - 1)]

In [46]:
# Frequency of every concatenated adjacent-token pair; prob_2 reads this table.
words_count_2 = Counter(words_2_gram)

In [47]:
def prob_2(word1, word2):
    """Return the relative frequency of the bigram ``word1 + word2``.

    Reads the notebook-level ``words_2_gram`` list and ``words_count_2``
    counter. The result is count / number-of-bigrams, i.e. a joint frequency
    of the pair rather than a probability conditioned on ``word1``. A bigram
    that is absent from the counter is treated as if it had been seen once.
    """
    total_bigrams = len(words_2_gram)
    bigram = word1 + word2
    # Unseen pairs get a floor count of 1 so the result is never zero.
    hits = words_count_2[bigram] if bigram in words_count_2 else 1
    return hits / total_bigrams

In [57]:
def probablity(sentence):
    """Score ``sentence`` as the product of prob_2 over adjacent token pairs.

    The sentence is tokenised with ``jieba.cut``; every neighbouring pair of
    tokens contributes one ``prob_2`` factor, multiplied left to right.

    NOTE(review): with fewer than two tokens there are no pairs, so the
    function returns the initial value 1. Callers comparing scores should be
    aware that a one-word input therefore outranks any longer sentence.
    """
    tokens = list(jieba.cut(sentence))
    score = 1
    # zip pairs each token with its successor: (t0, t1), (t1, t2), ...
    for current, following in zip(tokens, tokens[1:]):
        score *= prob_2(current, following)
    return score

In [58]:
# Score one example sentence with the bigram model.
probablity('我们今天抽奖抽到一台苹果手机')

6.826145844334016e-41

In [60]:
# Generate ten sentences from the rule-based grammar and score each with the
# bigram model. The already-parsed `grammar` is reused instead of re-parsing
# simple_grammar for every sentence; generate() only reads it.
for i in range(10):
    sen = generate(gram=grammar, target='sentence')
    print('sentence: {} with Prb: {}'.format(sen, probablity(sen)))

sentence: 这个女人看着这个桌子 with Prb: 2.5422494196088368e-28
sentence: 这个小小的小猫看着一个蓝色的女人 with Prb: 4.3368129919794165e-47
sentence: 这个女人看着这个好看的蓝色的桌子 with Prb: 2.6175280051580948e-51
sentence: 一个蓝色的小小的桌子听着一个桌子 with Prb: 3.7302691414386515e-53
sentence: 一个蓝色的女人看见一个女人 with Prb: 3.0407376942942425e-39
sentence: 一个桌子看着一个桌子 with Prb: 1.059270591503682e-29
sentence: 这个女人看着这个好看的蓝色的好看的桌子 with Prb: 4.089178153264865e-64
sentence: 一个好看的篮球看见一个小猫 with Prb: 3.044874752381718e-40
sentence: 这个桌子看着这个女人 with Prb: 2.5422494196088368e-28
sentence: 这个好看的好看的好看的蓝色的桌子看见一个蓝色的蓝色的篮球 with Prb: 1.0505561048606603e-99


In [63]:
# Each entry holds two candidate sentences separated by a single space; the
# loop below scores both and reports which one the bigram model prefers.
need_compared = [
    "今天晚上请你吃大餐，我们一起吃日料 明天晚上请你吃大餐，我们一起吃苹果",
    "真事一只好看的小猫 真是一只好看的小猫",
    "今晚我去吃火锅 今晚火锅去吃我",
    "洋葱奶昔来一杯 养乐多绿来一杯"
]

for s in need_compared:
    s1, s2 = s.split()
    p1 = probablity(s1)
    p2 = probablity(s2)

    # The second sentence wins ties, because the first needs a strictly higher score.
    if p1 > p2:
        better = s1
    else:
        better = s2

    print('{} is more possible'.format(better))
    for candidate, score in ((s1, p1), (s2, p2)):
        print('-'*4 + ' {} with probility {}'.format(candidate, score))

今天晚上请你吃大餐，我们一起吃日料 is more possible
---- 今天晚上请你吃大餐，我们一起吃日料 with probility 1.9877949356148648e-66
---- 明天晚上请你吃大餐，我们一起吃苹果 with probility 1.590235948491892e-66
真是一只好看的小猫 is more possible
---- 真事一只好看的小猫 with probility 4.641088649507633e-34
---- 真是一只好看的小猫 with probility 8.135198142748278e-27
今晚我去吃火锅 is more possible
---- 今晚我去吃火锅 with probility 3.453568946204909e-20
---- 今晚火锅去吃我 with probility 5.508207075819147e-28
养乐多绿来一杯 is more possible
---- 洋葱奶昔来一杯 with probility 1.8567574979596284e-22
---- 养乐多绿来一杯 with probility 3.254643746255006e-15
