# 导入模块

In [5]:
import numpy as np
import pandas as pd
import re
import jieba
from collections import Counter

# 构建语料库
1. 读取数据
2. 将data转为list
3. 利用正则表达式(python re库) 去除无效字符（比如 '\t' '\n'），每个字符串变成一个个字符token
4. 将每个划分后token重新组合成字符串
5. 将list的所有字符串 组合成 一个字符串text。 将text称为 语料库

In [None]:
# Preprocess the data
data = pd.read_csv("./datasource/export_sql_1558435.zip", encoding='gb18030')
# Raw-string pattern: `\w` already covers digits, and a `|` inside `[...]`
# is a literal pipe rather than alternation, so r'\w+' is the intended
# "word characters only" matcher.
word_pattern = re.compile(r'\w+')
# Missing values (if any) become '' first; str(NaN) would otherwise put the
# literal token "nan" into the corpus.
content = [word_pattern.findall(str(sentence))
           for sentence in data.content.fillna('')]
# Re-join the tokens of each row with single spaces.
content = [' '.join(word) for word in content]

# Concatenate every row (no separator between rows) into the corpus string.
text = ''.join(content)

# 构建一个词的词频
1. 切割语料库 text，得到所有 token
2. 删除 All_tokens 中的无效字符串：' ' 和 'n'
3. 利用Counter统计每个词的词频 word_frequency
    
    word_frequency的内容是 （词， 词的总个数）

In [None]:
def cut(string):
    """Segment ``string`` with ``jieba.lcut`` and return the resulting tokens."""
    tokens = jieba.lcut(string)
    return tokens

# Segment the corpus string `text` (built from the CSV content above).
All_tokens = cut(text)
# Drop the tokens ' ' and 'n', which are treated as invalid strings.
All_tokens = [token for token in All_tokens if token not in (' ', 'n')]
# Map each token to its number of occurrences in the corpus.
word_frequency = Counter(All_tokens)
# To inspect the most frequent tokens: word_frequency.most_common(10)
# Total number of tokens left after filtering.
number_token = len(All_tokens)

# 构建两个词组成的词组的词频
1. 将词库中每相邻的两个词结合起来
2. 利用Counter统计两个词组成的词组的词频 phrase_frequency 。用于计算 P(w1, w2)

In [None]:
# Total number of tokens in the corpus.
length_word = len(All_tokens)
# Concatenate every token with its right-hand neighbour to form the 2-grams.
all_2gram_tokens = [left + right for left, right in zip(All_tokens, All_tokens[1:])]
# Occurrence count of each 2-gram; used for the joint probability.
phrase_frequency = Counter(all_2gram_tokens)

In [62]:
def get_pro(word, word_frequency, all_sum):
    """
    Estimate the probability p(w1) of a single word.

    Returns the word's count in ``word_frequency`` divided by ``all_sum``;
    a word that is absent from ``word_frequency`` is counted as 1.
    """
    count = word_frequency[word] if word in word_frequency else 1
    return count / all_sum


In [None]:
def joint_prob(word, phrase_frequency, length_word):
    """
    Estimate the joint probability p(w1, w2) of a two-word phrase.

    ``word`` is the phrase key looked up in ``phrase_frequency``. Its count
    is divided by ``length_word``; a phrase that is absent is counted as 1.
    """
    if word not in phrase_frequency:
        return 1 / length_word
    return phrase_frequency[word] / length_word

In [None]:
def condition_prob(prior, post, word_frequency, phrase_frequency, length_word):
    """
    Compute the conditional probability p(w2|w1) as p(w1, w2) / p(w1).

    ``prior`` is w1 and ``post`` is w2; the pair is looked up in
    ``phrase_frequency`` under the concatenated key ``prior + post``.
    Both probabilities use ``length_word`` as their denominator.
    """
    pair_probability = joint_prob(prior + post, phrase_frequency, length_word)
    prior_probability = get_pro(prior, word_frequency, length_word)
    return pair_probability / prior_probability

# 1-Gram

In [65]:
def language_one_gram(string, word_frequency, all_sum):
    """
    1-gram model: P(sentence) = P(w1, w2, w3, ...) ~ P(w1) * P(w2) * P(w3) ...

    The sentence is segmented with ``cut`` and the per-word probabilities
    from ``get_pro`` are multiplied together, starting from 1.
    """
    probability = 1
    for token in cut(string):
        probability = probability * get_pro(token, word_frequency, all_sum)
    return probability

In [93]:
# Example: score a sentence with the 1-gram model, using the corpus word
# counts and the total token count as the denominator.
language_one_gram('大家好，enenen是hehehehehe', word_frequency, number_token)

1.6841579371481411e-31

# 2-Gram

In [91]:
def language_two_gram(string, word_frequency, phrase_frequency, length_word):
    """
    2-gram model: P(sentence) = P(w1, w2, w3, ...) ~ P(w1) * P(w2|w1) * P(w3|w2) ...

    Args:
        string: Sentence to score; it is segmented with ``cut``.
        word_frequency: Mapping from a word to its count.
        phrase_frequency: Mapping from a concatenated two-word phrase to
            its count.
        length_word: Denominator used for both the single-word and the
            joint probabilities.

    Returns:
        The product P(w1) * P(w2|w1) * ..., or 1 (the empty product) when
        segmentation yields no tokens.
    """
    tokens = cut(string)
    # Guard: with no tokens there is no first word to index.
    if not tokens:
        return 1

    result = get_pro(tokens[0], word_frequency, length_word)
    # Walk over each adjacent (w_i, w_{i+1}) pair of the sentence.
    for prior, post in zip(tokens, tokens[1:]):
        result *= condition_prob(prior, post, word_frequency, phrase_frequency, length_word)
    return result

In [92]:
# Example: score a sentence with the 2-gram model, using the corpus word
# counts, the 2-gram counts and the total token count.
language_two_gram("真是一只好看的小猫", word_frequency, phrase_frequency, length_word)

3.542582815457654e-16