# NLP NLKT的使用

## 实例：分析语法树

In [None]:
import nltk

示例字符串："GE data science is all about real-time collaboration. Our teams are scattered around the world, and we want to be able to collaborate in real time. We want to have a common machine learning platform that everyone can use for modern analytics, and set standards that everyone can follow. Anaconda Enterprise allows us to do this in a very efficient way." 

In [None]:
s = """GE data science is all about real-time collaboration. Our teams are scattered around the world, 
and we want to be able to collaborate in real time. 
We want to have a common machine learning platform that everyone can use for modern analytics, 
and set standards that everyone can follow. Anaconda Enterprise allows us to do this in a very efficient way."""

In [None]:
#分词
words = nltk.word_tokenize(s)

words

In [None]:
#标签
tags = nltk.pos_tag(words)
tags

In [None]:
#标签含义
nltk.help.upenn_tagset()

In [None]:
#生成语法树
chunk = nltk.chunk.ne_chunk(tags)
chunk.draw()

## 实例：寻找近似词

In [None]:
import nltk

In [None]:
#布朗语料库
l = nltk.corpus.brown.words()
l

In [None]:
#全部变成小写
l2 = [w.lower() for w in l]
l2

In [None]:
#合成整段文本
text = nltk.Text(l2)
text

In [None]:
#the的近似词（不是近义词）
text.similar('the')

## 实例：通过姓名识别性别

In [None]:
from nltk.corpus import names
import nltk
import random
import math

In [None]:
names.words('male.txt')

In [None]:
names.words('female.txt')

In [None]:
#读取名字，标注性别
l = [(name,'男') for name in names.words('male.txt')] + [(name,'女') for name in names.words('female.txt')]
l

In [None]:
#标注特征————注：这里使用结尾字母为特征
l2 = [({'feature':name[-1]},gender) for (name,gender) in l]
l2

In [None]:
#随机排序
random.shuffle(l2)
l2

In [None]:
#分成两半
train_set = l2[:math.floor(len(l2)/2)]  #floor()下取整
test_set = l2[math.ceil(len(l2)/2):]    #ceil()上取整

In [None]:
#训练数据（朴素贝叶斯分类器）
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.classify({"feature":'candy'[-1]})

In [None]:
#测试
res = sum([1 if classifier.classify(features)==gender else 0 for(features,gender) in test_set])/len(test_set)
res

## 其他方法

试一试：使用倒数两个字符

In [None]:
from nltk.corpus import names
import nltk
import random
import math

In [None]:
#读取名字，标注性别  使用结尾2字母为特征
l = [({"feature":name[-2:]},'男') for name in names.words('male.txt')] + [({"feature":name[-2:]},'女') for name in names.words('female.txt')]

In [None]:
#随机排序
random.shuffle(l)

In [None]:
#分成两半
train_set = l[:math.floor(len(l)/2)]  #floor()下取整
test_set = l[math.ceil(len(l)/2):]    #ceil()上取整

In [None]:
#训练数据（朴素贝叶斯分类器）
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
#测试
res = sum([1 if classifier.classify(features)==gender else 0 for(features,gender) in test_set])/len(test_set)
res

## 另一种方法

In [None]:
from nltk.corpus import names
import nltk
import random
import math 

In [None]:
def get_features(s):
    res = {}
    for w in "abcdefghijklmnopqrstuvwxyz":
        res[w] = 0
    
    for w in s:
        w = w.lower()
        
        if w in res:
            res[w]+=1
    return res

In [None]:
#读取名字，标注性别  使用结尾2字母为特征
l = [(get_features(name),'男') for name in names.words('male.txt')] + [(get_features(name),'女') for name in names.words('female.txt')]

In [None]:
#随机排序
random.shuffle(l)

In [None]:
#分成两半
train_set = l[:math.floor(len(l)/2)]  #floor()下取整
test_set = l[math.ceil(len(l)/2):]    #ceil()上取整

In [None]:
#训练数据（朴素贝叶斯分类器）
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
#测试
res = sum([1 if classifier.classify(features)==gender else 0 for(features,gender) in test_set])/len(test_set)
res

## 使用首字母与末字母

In [None]:
from nltk.corpus import names
import nltk
import random
import math 

In [None]:
def get_features(s):
    res = {}
    for w in "abcdefghijklmnopqrstuvwxyz":
        res[w] = 0
    
    for w in s:
        w = w.lower()
        
        if w in res:
            res[w]+=1
    return res

In [None]:
#读取名字，标注性别  使用结尾2字母为特征
l = [({"last":name[-1],"first":name[0]},'男') for name in names.words('male.txt')] + [({"last":name[-1],"first":name[0]},'女') for name in names.words('female.txt')]

In [None]:
#随机排序
random.shuffle(l)

In [None]:
#分成两半
train_set = l[:math.floor(len(l)/2)]  #floor()下取整
test_set = l[math.ceil(len(l)/2):]    #ceil()上取整

In [None]:
#训练数据（朴素贝叶斯分类器）
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
#测试
res = sum([1 if classifier.classify(features)==gender else 0 for(features,gender) in test_set])/len(test_set)
res

## 构建中文语法树

In [None]:
import nltk
import jieba
import jieba.posseg as posseg

示例字符串："美国司法部表示，华特迪士尼获准以713亿美元收购二十一世纪福克斯的娱乐资产，前提条件是迪士尼出售福克斯的22个地方体育电视网络。迪士尼同意出售福克斯的22个地区性体育频道，以解决司法部对该交易可能导致地方市场有线电视体育节目涨价的担心。迪士尼称，很高兴与美国司法部达成这项协议，称希望带来更吸引人的用户体验。此外，美国最大的有线电视运营商康卡斯特仍有意竞购21世纪福克斯的娱乐资产。"

In [None]:
s = """美国司法部表示，华特迪士尼获准以713亿美元收购二十一世纪福克斯的娱乐资产，前提条件是迪士尼出售福克斯的22个地方体育电视网络。迪士尼同意出售福克斯的22个地区性体育频道，以解决司法部对该交易可能导致地方市场有线电视体育节目涨价的担心。迪士尼称，很高兴与美国司法部达成这项协议，称希望带来更吸引人的用户体验。此外，美国最大的有线电视运营商康卡斯特仍有意竞购21世纪福克斯的娱乐资产。"""

In [None]:
#dir() 返回模块的属性列表。
dir(list(posseg.cut(s))[1])

In [None]:
tags = [(item.word,item.flag) for item in list(posseg.cut(s))]

In [None]:
chunk = nltk.chunk.ne_chunk(tags)
chunk.draw()

## 实例：新闻分类

In [2]:
import os
import jieba
import nltk

In [6]:
path = '\text\'

SyntaxError: EOL while scanning string literal (<ipython-input-6-9ba3ec9194d7>, line 1)

In [7]:
#1.读文件
types = os.listdir(os.getcwd() + path)

FileNotFoundError: [WinError 3] 系统找不到指定的路径。: 'C:\\Users\\MrLen\\AnacondaProjects/text/'

In [8]:
datas = {}

In [9]:
for name in types:
    datas[name] = []
    print(name)
    
    for filename in os.listdir(os.getcwd() + path + name):
        fp = open(os.getcwd() + path + name + '/' + filename,'r',encoding = "utf-8")
        content = fp.read()
        fp.close()
        
        datas[name].append(content)

NameError: name 'types' is not defined