In [1]:
from nltk.corpus import brown

brown.categories() brown语料库文本分类

In [2]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

brown.sents()句子

In [5]:
brown.sents()

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [6]:
len(brown.sents()) # 句子总数

57340

In [9]:
brown.words()#包含的单词列表

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [7]:
len(brown.words())#单词总数

1161192

In [8]:
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

#### 文本处理流程

文档导入-->文本预处理-->分词-->特征提取-->ML-->Label/Targets

#### Tokenize分词

In [11]:
import nltk

In [12]:
sentence = "hello, world"
tokens = nltk.word_tokenize(sentence)
tokens

['hello', ',', 'world']

#### 分词的两种方式: 启发式Heuristic 和机器学习/统计方法:HMM,CRF

启发式: 如 今天天气不错--> 今天/天天/天气/不错
统计方法: 是依据统计方法统计词出现频率来分词.

#### 中英文分词差异

中文分词

#### 中文分词的三种模式: 全模式,精确模式,搜索模式

In [None]:
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode:", "/".join(seg_list)) #全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("False Mode:", "/".join(seg_list)) #精确模式
seg_list = jieba.cut_for_search("我来到北京清华大学")
print("Search Mode:", "/".join(seg_list)) #全模式

In [16]:
#Full Mode: 我/来到/北京/清华/清华大学/华大/大学
#False Mode: 我/来到/北京/清华大学
#Search Mode: 我/来到/北京/清华/华大/大学/清华大学

社交网络语言的tokenize

#### 正则表达式
对照表http://www.regexlab.com/zh/regref.htm

In [49]:
import re
import nltk
from nltk.tokenize import word_tokenize


tweet = 'RT @angelababy: love you baby! :D http://ah.love #168cm'
#print(word_tokenize(tweet))

emoticons_str = r"""
    (?:
            [:=;] # 眼睛
            [oO\-]? # ⿐鼻⼦子
            [D\)\]\(\]/\\OpP] # 嘴
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @某⼈人
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # 话题标签
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',# URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # 数字
    r"(?:[a-z][a-z'\-_]+[a-z])", # 含有 - 和 ‘ 的单词
    r'(?:[\w_]+)', # 其他
    r'(?:\S)' # 其他
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
    return tokens_re.findall(s)
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

print(preprocess(tweet))

['RT', '@angelababy', ':', 'love', 'you', 'baby', '!', ':D', 'http://ah.love', '#168cm']


#### 纷繁复杂的词形

Inflection变化: walk => walking => walked
不影响词性

derivation 引申: nation (noun) => national (adjective) => nationalize (verb)
影响词性

词形归⼀化

##### stem词干提取

Stemming 词⼲提取：⼀般来说，就是把不影响词性的inflection的⼩尾巴砍掉
walking 砍ing = walk
walked 砍ed = walk

In [60]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer

p = PorterStemmer()
l = LancasterStemmer()
s = SnowballStemmer(language="english") # 这个必须指定语言language
words = ['maximum','presumably','multiply','provision']
for word in words:
    print(p.stem(word))
    print(l.stem(word))
    print(s.stem(word))

maximum
maxim
maximum
presum
presum
presum
multipli
multiply
multipli
provis
provid
provis


#### lemmatization词形归一

Lemmatization 词形归⼀：把各种类型的词的变形，都归为⼀个形式
went 归⼀ = go
are 归⼀ = be

In [63]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
words = ['dogs','churches','aardwolves','abaci','hardrock']
for word in words:
    print(wordnet_lemmatizer.lemmatize(word))

dog
church
aardwolf
abacus
hardrock


#### Lemma的小问题

Went 有可能是go的过去式;也有可能是温特的名字
因此,更好地实现Lemma词形归一需要运用到时态

In [65]:
# 没有POS Tag,默认是NN名词
wordnet_lemmatizer.lemmatize("are")

'are'

In [66]:
wordnet_lemmatizer.lemmatize("is")

'is'

In [67]:
# 加上POS Tag
wordnet_lemmatizer.lemmatize("are", pos='v')

'be'

In [69]:
wordnet_lemmatizer.lemmatize("is", pos='v')

'be'

#### NLTK标注POS Tag词性

In [86]:
text = nltk.word_tokenize('what does the fox say')
text

['what', 'does', 'the', 'fox', 'say']

In [87]:
nltk.pos_tag(text) #有错误

[('what', 'WDT'),
 ('does', 'VBZ'),
 ('the', 'DT'),
 ('fox', 'NNS'),
 ('say', 'VBP')]

#### StopWord停用词
全体stopwords列表 http://www.ranks.nl/stopwords

⾸先记得在console⾥⾯下载⼀下词库
或者 nltk.download(‘stopwords’)

In [88]:
from nltk.corpus import stopwords

In [89]:
text = """Oh, baby with your pretty face　　
Drop a tear in my wineglass　　
Look at those big eyes"""

In [90]:
word_list = nltk.word_tokenize(text)

In [91]:
filtered_words = [word for word in word_list if word not in stopwords.words("english")]
filtered_words

['Oh',
 ',',
 'baby',
 'pretty',
 'face',
 'Drop',
 'tear',
 'wineglass',
 'Look',
 'big',
 'eyes']

### ⼀条typical的⽂本预处理流⽔线

In [92]:
# Raw_Text
#   |
# Tokenize  --->POS Tag
#   |       /
# Lemma/Stemming
#   |
# stopwords
#   |
# Word_List

#### NLTK在NLP上的经典应⽤

情感分析,⽂本相似度,⽂本分类

#### 情感分析
最简单的 sentiment dictionary
如:
like 1 
good 2 
bad -2 
terrible -3

类似于关键词打分机制

⽐如：AFINN-111
http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010

#### NLTK完成简单的情感分析

In [110]:
sentiment_dictionary = {}
for line in open(r'C:\study\datasets\NLP\imm6010\AFINN\AFINN-111.txt'):
    word, score = line.split('\t')
    sentiment_dictionary[word] = int(score)
# 把这个打分表记录在⼀一个Dict上以后
# 跑⼀一遍整个句句⼦子，把对应的值相加
#print(sentiment_dictionary)
words = """Oh, baby with your pretty face　　
Drop a tear in my wineglass　　
Look at those big eyes
"""
words = nltk.word_tokenize(words)
total_score = sum(sentiment_dictionary.get(word, 0) for word in words)
# 有值就是Dict中的值，没有就是0
# 于是你就得到了了⼀一个 sentiment score
total_score

2

#### 配上ML的情感分析

In [112]:
from nltk.classify import NaiveBayesClassifier
# 随⼿手造点训练集
s1 = 'this is a good book'
s2 = 'this is a awesome book'
s3 = 'this is a bad book'
s4 = 'this is a terrible book'

def preprocess(s):
    # Func: 句句⼦子处理理
    # 这⾥里里简单的⽤用了了split(), 把句句⼦子中每个单词分开
    # 显然 还有更更多的processing method可以⽤用
    return {word: True for word in s.lower().split()}
    # return⻓长这样:
    # {'this': True, 'is':True, 'a':True, 'good':True, 'book':True}
    # 其中, 前⼀一个叫fname, 对应每个出现的⽂文本单词;
    # 后⼀一个叫fval, 指的是每个⽂文本单词对应的值。
    # 这⾥里里我们⽤用最简单的True,来表示,这个词『出现在当前的句句⼦子中』的意义。
    # 当然啦, 我们以后可以升级这个⽅方程, 让它带有更更加⽜牛逼的fval, ⽐比如 word2vec
    # 把训练集给做成标准形式
training_data = [[preprocess(s1), 'pos'],
[preprocess(s2), 'pos'],
[preprocess(s3), 'neg'],
[preprocess(s4), 'neg']]
# 喂给model吃
model = NaiveBayesClassifier.train(training_data)
# 打出结果
print(model.classify(preprocess('this is a good book')))

pos


#### 文本相似度

⽤元素频率表⽰⽂本特征

#### 余弦定理

cosβ=A*B/(|A| |B|)

#### Frequency频率统计

In [115]:
import nltk
from nltk import FreqDist

# 做个词库先
corpus = 'this is my sentence' \
'this is my life' \
'this is the day'

# 随便分词tokenize一下
# 显然,正如上文提到,这里可以根据需要做任何的preprocessing: stopwords,lemma, stemming, etc.
tokens = nltk.word_tokenize(corpus)
print(tokens)

['this', 'is', 'my', 'sentencethis', 'is', 'my', 'lifethis', 'is', 'the', 'day']


In [123]:
# 借用NLTK的FreqDist统计一下文字出现的频率,并没有排序
fdist = FreqDist(tokens)
fdist
# 它就类似于一个Dist
# 带上某个单词,可以看到它在整个文章中出现的次数
#print(fdist['is'])

FreqDist({'day': 1,
          'is': 3,
          'lifethis': 1,
          'my': 2,
          'sentencethis': 1,
          'the': 1,
          'this': 1})

#### 把最常用的n个单词拿出来FreqDist.most_common(n)

In [121]:
standard_freq_vector = fdist.most_common(5) # 排序好的
size = len(standard_freq_vector)
print(standard_freq_vector)

[('is', 3), ('my', 2), ('this', 1), ('sentencethis', 1), ('lifethis', 1)]


In [127]:
# Func: 按照出现频率大小,记录下每一个单词的位置
def position_lookup(v):
    res = {}
    counter = 0
    for word in v:
        res[word[0]] = counter
        counter += 1
    return res

# 把标准的单词位置记录下来
standard_position_dict = position_lookup(standard_freq_vector)
print(standard_position_dict) #{'is': 0, 'lifethis': 4, 'my': 1, 'sentencethis': 3, 'this': 2}

{'is': 0, 'my': 1, 'this': 2, 'sentencethis': 3, 'lifethis': 4}


In [129]:
# 这时, 如果我们有一个新句子:
sentence = 'this is cool'
# 先新建一个跟我们的标准vector同样大小的向量
freq_vector = [0] * size
# 简单的Preprocessing
tokens = nltk.word_tokenize(sentence)
# 对于这个新句子里的每一个单词
for word in tokens:
    try:
        # 如果在我们的词库里出现过
        # 那么久在标准位置上+1
        print(standard_position_dict[word])
        freq_vector[standard_position_dict[word]] += 1
    except KeyError:
        # 如果是个新词
        # 就pass掉
        continue
print(freq_vector)#[1, 0, 1, 0, 0]
# 第一个位置代表is 出现了一次
# 第三个位置代表this,出现了一次

2
0


[1, 0, 1, 0, 0]

#### 文本分类
#### TF-IDF

####  TF: TremFrequency, 衡量一个term在文档中出现的多频繁.
TF(t) = (t出现在文档中的次数)/(文档中的term总数)
IDF: Inverse Document Frequency, 衡量一个term有多重要.有些词出现的很多,但是明显不是很有卵用,比如is,the之类.
为了平衡,我们把罕见的词的重要性(权重weight)提高,把常见词的重要性降低.
IDF(t) = log_e(文档总数/含有t的文档总数).

#### TF-IDF = TF* IDF

In [130]:
# 举个栗子
# TF: 一个文档中有100个单词,其中单词baby出现了3次, 那么TF(baby)=3/100 = 0.03
# IDF: 我们现在有10 million个文档,baby出现在其中的1000个文档中.
# 那么IDF(baby) = log(10,000,000/1,000)=4
# 所以TF-IDF(baby) = TF(baby)*IDF(baby) = 0.12

#### NLTK 实现TF-IDF

#### TextCollection 断句,统计,计算

In [150]:
from nltk.text import TextCollection
# 首先, 把所有的文档放到TextCollection类中.
# 这个类会自动帮你断句,做统计, 做计算
corpus = TextCollection(['this is'])
print(len(corpus))
# 直接就能算出tfidf
# (term: 一句话中某个term, text:这句话)
print(corpus.tf_idf('is','this is sentence four'))
corpus.idf('is')

7
0.0


0.0

#### 注意TextCollection源码中tfidf的计算公式

In [159]:
# def tf(self, term, text):
#         " The frequency of the term in text. "
#         return text.count(term) / len(text)


# def idf(self, term):
#         """ The number of texts in the corpus divided by the
#         number of texts that the term appears in.
#         If a term does not appear in the corpus, 0.0 is returned. """
#         # idf values are cached for performance.
#         idf = self._idf_cache.get(term)
#         if idf is None:
#             matches = len([True for text in self._texts if term in text])
#             if len(self._texts) == 0:
#                 raise ValueError('IDF undefined for empty document collection')
#             idf = log(len(self._texts) / matches) if matches else 0.0
#             self._idf_cache[term] = idf
#         return idf


# def tf_idf(self, term, text):
#         return self.tf(term, text) * self.idf(term)

#### TextCollection可以直接计算tf,idf,tf_idf值.

In [161]:
from nltk.text import TextCollection
corpus = TextCollection(['this is sentence one','this is sentence two','this is sentence three'])

In [165]:
corpus.tf("this is","this is sentence four")

0.047619047619047616

In [166]:
corpus.idf("this is") # 结果是0, log(3/3)=0

0.0

In [167]:
corpus.tf_idf("this is","this is sentence four")

0.0

#### ML可能的ML模型: SVM LR RF MLP LSTM RNN