# 3 处理原始文本

In [1]:
import nltk, re, pprint
from nltk import word_tokenize

## 3.1 从网络和硬盘访问文本

### 1、从网络上下载文本

In [57]:
from urllib import request

# Plain-text file fetched from Project Gutenberg (the output identifies it as
# "Crime and Punishment").
url = "https://www.gutenberg.org/files/2554/2554-0.txt"

# Use the response as a context manager so the connection is closed as soon as
# the body has been read, instead of lingering until garbage collection.
with request.urlopen(url) as response:
    raw = response.read().decode("utf8")

# NOTE: the decoded text starts with a byte-order mark ('\ufeff'). It is kept on
# purpose: decoding with "utf-8-sig" would drop it but shift every character
# offset by one.
print(type(raw),"\n", len(raw),"\n")
raw[:80]

<class 'str'> 
 1176967 



'\ufeffThe Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\r\n\r\nTh'

### 2、分词

In [40]:
# Split the downloaded text into a list of word and punctuation tokens.
tokens = nltk.word_tokenize(raw)
# Report the container type and the number of tokens, then preview the first 15.
print(type(tokens), len(tokens), sep=" \n ")
tokens[0:15]

<class 'list'> 
 257727


['\ufeffThe',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Crime',
 'and',
 'Punishment',
 ',',
 'by',
 'Fyodor',
 'Dostoevsky',
 'This',
 'eBook',
 'is']

### 3、从这个列表创建一个NLTK 文本

In [41]:
# Wrap the token list in an nltk.Text so that corpus helpers such as
# collocations() become available.
text = nltk.Text(tokens)
# Show the wrapper type and a slice of tokens, each followed by a blank line.
for shown in (type(text), text[1024:1062]):
    print(shown, "\n")
# Print the collocations (frequently co-occurring word pairs) of the text.
text.collocations()

<class 'nltk.text.Text'> 

['an', 'exceptionally', 'hot', 'evening', 'early', 'in', 'July', 'a', 'young', 'man', 'came', 'out', 'of', 'the', 'garret', 'in', 'which', 'he', 'lodged', 'in', 'S.', 'Place', 'and', 'walked', 'slowly', ',', 'as', 'though', 'in', 'hesitation', ',', 'towards', 'K.', 'bridge', '.', 'He', 'had', 'successfully'] 

Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
great deal; young man; Nikodim Fomitch; Ilya Petrovitch; Project
Gutenberg; Andrey Semyonovitch; Hay Market; Dmitri Prokofitch; Good
heavens


### 4、手工检查文件以发现标记内容开始和结尾的独特的字符串

In [58]:
# Offsets of the marker strings that bracket the novel inside the downloaded file.
# The apostrophe in the end marker is the typographic right single quotation
# mark ’ (U+2019), not the ASCII apostrophe.
start_offset = raw.find("PART I")
# rfind() searches backwards, i.e. it returns the position of the last occurrence.
end_offset = raw.rfind("End of Project Gutenberg’s Crime")
print(start_offset,"\n")
print(end_offset)

5336 

1157812


In [59]:
# Derive the slice bounds from the marker strings instead of hard-coding the
# offsets of one particular download (5336 and 1157812): if the hosted file
# changes, fixed numbers would silently cut the text in the wrong place.
body_start = raw.find("PART I")
body_end = raw.rfind("End of Project Gutenberg’s Crime")
if body_start == -1 or body_end == -1:
    # find()/rfind() return -1 when the marker is absent; slicing with -1 would
    # produce a wrong result without any error, so fail loudly instead.
    raise ValueError("start/end marker not found in raw; inspect the text and update the markers")
raw1 = raw[body_start:body_end]
# Sanity check: the trimmed text should now begin with the start marker (prints 0).
raw1.find("PART I")

0

### 5、处理HTML

In [60]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# Close the HTTP response deterministically once the page has been read.
with request.urlopen(url) as page:
    html = page.read().decode("utf8")
# Peek at the start of the document to confirm that HTML markup was received.
html[:60]

'<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN'

In [62]:
from bs4 import BeautifulSoup

# Name the parser explicitly. Without it Beautiful Soup picks whichever parser
# happens to be installed (and warns about it), so the extracted text, and with
# it the token positions used further down, could differ between machines.
# NOTE(review): "html.parser" is the stdlib parser; if the original run used
# lxml, token positions may shift slightly. Re-check any hard-coded slice.
raw = BeautifulSoup(html, "html.parser").get_text()
tokens = word_tokenize(raw)
tokens

['BBC',
 'NEWS',
 '|',
 'Health',
 '|',
 'Blondes',
 "'to",
 'die',
 'out',
 'in',
 '200',
 "years'",
 'NEWS',
 'SPORT',
 'WEATHER',
 'WORLD',
 'SERVICE',
 'A-Z',
 'INDEX',
 'SEARCH',
 'You',
 'are',
 'in',
 ':',
 'Health',
 'News',
 'Front',
 'Page',
 'Africa',
 'Americas',
 'Asia-Pacific',
 'Europe',
 'Middle',
 'East',
 'South',
 'Asia',
 'UK',
 'Business',
 'Entertainment',
 'Science/Nature',
 'Technology',
 'Health',
 'Medical',
 'notes',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Talking',
 'Point',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Country',
 'Profiles',
 'In',
 'Depth',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Programmes',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'SERVICES',
 'Daily',
 'E-mail',
 'News',
 'Ticker',
 'Mobile/PDAs',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Text',
 'Only',
 'Feedback',
 'Help',
 'EDITIONS',
 'Change',
 'to',
 'UK',
 'Friday',
 ',',
 '27',
 'September',
 ',',
 '2002',
 ',',
 '11:51',
 'GMT',
 '12:51'

In [63]:
# Keep only the token range 110..389 (presumably chosen by inspecting the token
# list so that navigation and footer text is dropped; TODO confirm).
# The slice is bound to a new name rather than back onto `tokens`: re-running a
# cell of the form `tokens = tokens[110:390]` would slice the already-sliced
# list again and silently shrink the data.
article_tokens = tokens[110:390]
text = nltk.Text(article_tokens)
# Print every occurrence of 'gene' together with its surrounding context.
text.concordance('gene')

Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin


### 6、处理搜索引擎的结果

In [67]:
import feedparser

# RSS feed to parse (the output title shows it is the cnblogs front page).
feed_url = "http://feed.cnblogs.com/blog/sitehome/rss"
llog = feedparser.parse(feed_url)
# Display the parsed result: a dict-like object with 'feed' and 'entries' keys.
llog

{'feed': {'title': '博客园_首页',
  'title_detail': {'type': 'text/plain',
   'language': None,
   'base': 'http://feed.cnblogs.com/blog/sitehome/rss',
   'value': '博客园_首页'},
  'subtitle': '代码改变世界',
  'subtitle_detail': {'type': 'text/plain',
   'language': None,
   'base': 'http://feed.cnblogs.com/blog/sitehome/rss',
   'value': '代码改变世界'},
  'id': 'uuid:386a934f-6ca4-419e-8428-9296c17f29e8;id=457741',
  'guidislink': True,
  'link': 'uuid:386a934f-6ca4-419e-8428-9296c17f29e8;id=457741',
  'updated': '2019-04-23T05:02:19Z',
  'updated_parsed': time.struct_time(tm_year=2019, tm_mon=4, tm_mday=23, tm_hour=5, tm_min=2, tm_sec=19, tm_wday=1, tm_yday=113, tm_isdst=0),
  'generator_detail': {'name': 'feed.cnblogs.com'},
  'generator': 'feed.cnblogs.com'},
 'entries': [{'id': 'http://www.cnblogs.com/V1haoge/p/10755412.html',
   'guidislink': True,
   'link': 'http://www.cnblogs.com/V1haoge/p/10755412.html',
   'title': 'Java设计模式系列-抽象工厂模式 - 唯一浩哥',
   'title_detail': {'type': 'text/plain',
    'lang

In [69]:
# Title of the feed itself (attribute access is equivalent to the key lookup).
llog.feed.title

'博客园_首页'

In [70]:
# Number of posts contained in the feed.
len(llog["entries"])

20

In [72]:
# Select the third post of the feed and show its title.
post = llog["entries"][2]
post["title"]

'Python爬虫入门教程 62-100 30岁了，想找点文献提高自己，还被反爬了，Python搞起，反爬第2篇 - 梦想橡皮擦'

In [73]:
# Take the body of the post's first content element and preview 70 characters.
content = post["content"][0]["value"]
content[0:70]

'【摘要】学术搜索 学习理论的知识少不了去检索文献，好多文献为你的实操提供了合理的支撑，我所在的大学内网默认是有知网账户的，非常NICE 今天'

In [74]:
# Strip any markup from the post body. The parser is named explicitly so the
# result does not depend on which optional parsers are installed (and so that
# Beautiful Soup does not warn about having to guess one).
raw = BeautifulSoup(content, "html.parser").get_text()
# NOTE(review): as the output shows, word_tokenize leaves runs of Chinese text
# as single tokens; proper Chinese word segmentation needs a dedicated tool.
word_tokenize(raw)

['【摘要】学术搜索',
 '学习理论的知识少不了去检索文献，好多文献为你的实操提供了合理的支撑，我所在的大学内网默认是有知网账户的，非常NICE',
 '今天要完成的网站是',
 'http',
 ':',
 '//ac.scmor.com/',
 'Google学术搜索是一个文献检索服务，目前主要是提供维普资讯、万方数据等几个学术文献资源库的检索服务。通过G',
 '阅读全文']

### 7、读取本地文件

In [83]:
# Open the file read-only ('r' is the default mode) and let the context manager
# close the handle as soon as the contents have been read.
# The legacy 'U' (universal newlines) flag is not needed: Python 3 text mode
# already normalises line endings by default.
# NOTE(review): no encoding is given, so the platform default is used; pass
# encoding=... explicitly once the file's encoding is confirmed.
with open("3.document.txt",'r') as f:
    raw = f.read()
raw

' 沁园春·雪\n作者：毛泽东\n北国风光，千里冰封，万里雪飘。\n望长城内外，惟余莽莽；大河上下，顿失滔滔。\n山舞银蛇，原驰蜡象，欲与天公试比高。\n须晴日，看红装素裹，分外妖娆。\n江山如此多娇，引无数英雄竞折腰。\n惜秦皇汉武，略输文采；唐宗宋祖，稍逊风骚。 '

In [79]:
import os

# List the entries of the current working directory.
os.listdir(os.curdir)

['3.deal_with_text.ipynb', '.ipynb_checkpoints', '3.document.txt']

In [88]:
# Iterate over the file line by line; the context manager guarantees the handle
# is closed afterwards, even if printing fails part-way through.
with open("3.document.txt","r") as f:
    for line in f:
        # strip() removes leading and trailing whitespace, which includes the
        # newline at the end of each line.
        print(line.strip())

沁园春·雪
作者：毛泽东
北国风光，千里冰封，万里雪飘。
望长城内外，惟余莽莽；大河上下，顿失滔滔。
山舞银蛇，原驰蜡象，欲与天公试比高。
须晴日，看红装素裹，分外妖娆。
江山如此多娇，引无数英雄竞折腰。
惜秦皇汉武，略输文采；唐宗宋祖，稍逊风骚。


### 8、从PDF、MS Word 及其他二进制格式中提取文本

文字常常以二进制格式出现，如 PDF 和 MS Word，只能使用专门的软件打开。第三方函数库如 pypdf 和 pywin32 提供了对这些格式的访问。

### 9、NLP 的流程

![3.1.png](3.1.png)

In [90]:
# Widths of the successive rows: grows from 1 to 7 and shrinks back to 1.
a = [1, 2, 3, 4, 5, 6, 7, 6, 5, 4, 3, 2, 1]
# Each row is 'very' repeated i times, indented by two spaces for every step it
# is narrower than the widest row (7).
b = [' ' * 2 * (7 - i) + 'very' * i for i in a]
# The pasted interpreter prompts (">>>" / "...") were removed so the cell is
# plain Python and also runs outside IPython.
for line in b:
    print(line)

            very
          veryvery
        veryveryvery
      veryveryveryvery
    veryveryveryveryvery
  veryveryveryveryveryvery
veryveryveryveryveryveryvery
  veryveryveryveryveryvery
    veryveryveryveryvery
      veryveryveryvery
        veryveryvery
          veryvery
            very


In [93]:
# Print the built-in help page for the str class (the output is very long).
help(str)

Help on class str in module builtins:

class str(object)
 |  str(object='') -> str
 |  str(bytes_or_buffer[, encoding[, errors]]) -> str
 |  
 |  Create a new string object from the given object. If encoding or
 |  errors is specified, then the object must expose a data buffer
 |  that will be decoded using the given encoding and error handler.
 |  Otherwise, returns the result of object.__str__() (if defined)
 |  or repr(object).
 |  encoding defaults to sys.getdefaultencoding().
 |  errors defaults to 'strict'.
 |  
 |  Methods defined here:
 |  
 |  __add__(self, value, /)
 |      Return self+value.
 |  
 |  __contains__(self, key, /)
 |      Return key in self.
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __format__(...)
 |      S.__format__(format_spec) -> str
 |      
 |      Return a formatted version of S as described by format_spec.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getatt

列表中的元素可以很大也可以很小，只要我们喜欢：例如，它们可能是段落、句子、短语、单词、字符。

因此，我们在一段NLP 代码中可能做的第一件事情就是将一个字符串分词放入一个字符串列表中。

相反，当我们要将结果写入到一个文件或终端，我们通常会将它们格式化为一个字符串。

## 3.4 使用正则表达式检测词组搭配

In [97]:
import re

# Keep only the all-lowercase entries of the NLTK "en" word list.
wordlist = list(filter(str.islower, nltk.corpus.words.words("en")))
wordlist

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aardvark',
 'aardwolf',
 'aba',
 'abac',
 'abaca',
 'abacate',
 'abacay',
 'abacinate',
 'abacination',
 'abaciscus',
 'abacist',
 'aback',
 'abactinal',
 'abactinally',
 'abaction',
 'abactor',
 'abaculus',
 'abacus',
 'abaff',
 'abaft',
 'abaisance',
 'abaiser',
 'abaissed',
 'abalienate',
 'abalienation',
 'abalone',
 'abampere',
 'abandon',
 'abandonable',
 'abandoned',
 'abandonedly',
 'abandonee',
 'abandoner',
 'abandonment',
 'abaptiston',
 'abarthrosis',
 'abarticular',
 'abarticulation',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasedness',
 'abasement',
 'abaser',
 'abash',
 'abashed',
 'abashedly',
 'abashedness',
 'abashless',
 'abashlessly',
 'abashment',
 'abasia',
 'abasic',
 'abask',
 'abastardize',
 'abatable',
 'abate',
 'abatement',
 'abater',
 'abatis',
 'abatised',
 'abaton',
 'abator',
 'abattoir',
 'abature',
 'abave',
 'abaxial',
 'abaxile',
 'abaze',
 'abb',
 'abbacomes',
 'abbacy',
 'abbas',
 'abbasi',
 'abbassi',


### 1、使用基本的元字符

In [98]:
# Words ending in "ed": $ anchors the match at the end of the word.
list(filter(re.compile("ed$").search, wordlist))

['abaissed',
 'abandoned',
 'abased',
 'abashed',
 'abatised',
 'abed',
 'aborted',
 'abridged',
 'abscessed',
 'absconded',
 'absorbed',
 'abstracted',
 'abstricted',
 'accelerated',
 'accepted',
 'accidented',
 'accoladed',
 'accolated',
 'accomplished',
 'accosted',
 'accredited',
 'accursed',
 'accused',
 'accustomed',
 'acetated',
 'acheweed',
 'aciculated',
 'aciliated',
 'acknowledged',
 'acorned',
 'acquainted',
 'acquired',
 'acquisited',
 'acred',
 'aculeated',
 'addebted',
 'added',
 'addicted',
 'addlebrained',
 'addleheaded',
 'addlepated',
 'addorsed',
 'adempted',
 'adfected',
 'adjoined',
 'admired',
 'admitted',
 'adnexed',
 'adopted',
 'adossed',
 'adreamed',
 'adscripted',
 'aduncated',
 'advanced',
 'advised',
 'aeried',
 'aethered',
 'afeared',
 'affected',
 'affectioned',
 'affined',
 'afflicted',
 'affricated',
 'affrighted',
 'affronted',
 'aforenamed',
 'afterfeed',
 'aftershafted',
 'afterthoughted',
 'afterwitted',
 'agazed',
 'aged',
 'agglomerated',
 'aggri

In [99]:
# Eight-letter words whose third letter is j and whose sixth letter is t:
# "." matches any single character, ^ and $ pin the pattern to the whole word.
[word for word in wordlist if re.search("^..j..t..$", word) is not None]

['abjectly',
 'adjuster',
 'dejected',
 'dejectly',
 'injector',
 'majestic',
 'objectee',
 'objector',
 'rejecter',
 'rejector',
 'unjilted',
 'unjolted',
 'unjustly']

In [101]:
# Same pattern without the ^ and $ anchors: it may now match anywhere inside a
# word, so many words longer than eight characters are returned as well.
list(filter(re.compile("..j..t..").search, wordlist))

['abjectedness',
 'abjection',
 'abjective',
 'abjectly',
 'abjectness',
 'adjection',
 'adjectional',
 'adjectival',
 'adjectivally',
 'adjective',
 'adjectively',
 'adjectivism',
 'adjectivitis',
 'adjustable',
 'adjustably',
 'adjustage',
 'adjustation',
 'adjuster',
 'adjustive',
 'adjustment',
 'antejentacular',
 'antiprojectivity',
 'bijouterie',
 'coadjustment',
 'cojusticiar',
 'conjective',
 'conjecturable',
 'conjecturably',
 'conjectural',
 'conjecturalist',
 'conjecturality',
 'conjecturally',
 'conjecture',
 'conjecturer',
 'coprojector',
 'counterobjection',
 'dejected',
 'dejectedly',
 'dejectedness',
 'dejectile',
 'dejection',
 'dejectly',
 'dejectory',
 'dejecture',
 'disjection',
 'guanajuatite',
 'inadjustability',
 'inadjustable',
 'injectable',
 'injection',
 'injector',
 'injustice',
 'insubjection',
 'interjection',
 'interjectional',
 'interjectionalize',
 'interjectionally',
 'interjectionary',
 'interjectionize',
 'interjectiveness',
 'interjector',
 'interje

In [102]:
# Count the words that are exactly "email" or "e-mail": "?" makes the preceding
# character (the hyphen) optional, i.e. it may occur zero times or once.
len([word for word in wordlist if re.search("^e-?mail$", word)])

0

### 2、范围与闭包


![3.2](3.2.png)

In [105]:
# Phone-keypad text entry: which words are produced by the key sequence 4653?
# Each character class lists the letters on one key: 4 = ghi, 6 = mno, 5 = jkl,
# 3 = def. The class for key 5 previously read [hkl]; "h" belongs to key 4 and
# "j" was missing, so it is corrected to [jkl].
[w for w in wordlist if re.search("^[ghi][mno][jkl][def]$", w)]

['gold', 'golf', 'hold', 'hole']

In [107]:
# Words typed using only the middle-row keys 4, 5 and 6, i.e. the letters g to o.
# Inside [...] a hyphen denotes a range; "+" means one or more repetitions.
list(filter(re.compile("^[g-o]+$").search, wordlist))

['g',
 'ghoom',
 'gig',
 'giggling',
 'gigolo',
 'gilim',
 'gill',
 'gilling',
 'gilo',
 'gim',
 'gin',
 'ging',
 'gingili',
 'gink',
 'ginkgo',
 'ginning',
 'gio',
 'glink',
 'glom',
 'glonoin',
 'gloom',
 'glooming',
 'gnomon',
 'go',
 'gog',
 'gogo',
 'goi',
 'going',
 'gol',
 'goli',
 'gon',
 'gong',
 'gonion',
 'goo',
 'googol',
 'gook',
 'gool',
 'goon',
 'h',
 'hi',
 'high',
 'hill',
 'him',
 'hin',
 'hing',
 'hinoki',
 'ho',
 'hog',
 'hoggin',
 'hogling',
 'hoi',
 'hoin',
 'holing',
 'holl',
 'hollin',
 'hollo',
 'hollong',
 'holm',
 'homo',
 'homologon',
 'hong',
 'honk',
 'hook',
 'hoon',
 'i',
 'igloo',
 'ihi',
 'ilk',
 'ill',
 'imi',
 'imino',
 'immi',
 'in',
 'ing',
 'ingoing',
 'inion',
 'ink',
 'inkling',
 'inlook',
 'inn',
 'inning',
 'io',
 'ion',
 'j',
 'jhool',
 'jig',
 'jing',
 'jingling',
 'jingo',
 'jinjili',
 'jink',
 'jinn',
 'jinni',
 'jo',
 'jog',
 'johnin',
 'join',
 'joining',
 'joll',
 'joom',
 'k',
 'kiki',
 'kil',
 'kilhig',
 'kilim',
 'kill',
 'killing',

In [109]:
# Sorted vocabulary (unique tokens) of the NPS chat corpus.
chat_words = sorted(set(nltk.corpus.nps_chat.words()))
# "+" requires each of m, i, n, e to occur at least once, in that order.
list(filter(re.compile("^m+i+n+e+$").search, chat_words))

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee',
 'miiiiiinnnnnnnnnneeeeeeee',
 'mine',
 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']

In [111]:
# "*" allows zero or more repetitions, so every letter becomes optional and
# even the empty string matches.
[word for word in chat_words if re.search("^m*i*n*e*$", word) is not None]

['',
 'e',
 'i',
 'in',
 'm',
 'me',
 'meeeeeeeeeeeee',
 'mi',
 'miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee',
 'miiiiiinnnnnnnnnneeeeeeee',
 'min',
 'mine',
 'mm',
 'mmm',
 'mmmm',
 'mmmmm',
 'mmmmmm',
 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee',
 'mmmmmmmmmm',
 'mmmmmmmmmmmmm',
 'mmmmmmmmmmmmmm',
 'n',
 'ne']

In [110]:
# [ha] is a character set: either letter may appear, in any order, so this
# finds words made up solely of h and a.
list(filter(re.compile("^[ha]+$").search, chat_words))

['a',
 'aaaaaaaaaaaaaaaaa',
 'aaahhhh',
 'ah',
 'ahah',
 'ahahah',
 'ahh',
 'ahhahahaha',
 'ahhh',
 'ahhhh',
 'ahhhhhh',
 'ahhhhhhhhhhhhhh',
 'h',
 'ha',
 'haaa',
 'hah',
 'haha',
 'hahaaa',
 'hahah',
 'hahaha',
 'hahahaa',
 'hahahah',
 'hahahaha',
 'hahahahaaa',
 'hahahahahaha',
 'hahahahahahaha',
 'hahahahahahahahahahahahahahahaha',
 'hahahhahah',
 'hahhahahaha']

In [120]:
# Sorted vocabulary (unique tokens) of the Penn Treebank sample.
wsj = sorted(set(nltk.corpus.treebank.words()))
# Decimal numbers: "\." matches a literal dot instead of "any character".
# The pattern is written as a raw string so the backslash reaches the regex
# engine untouched; in a normal string "\." is an invalid escape sequence that
# newer Python versions warn about.
[w for w in wsj if re.search(r"^[0-9]+\.[0-9]+$", w)]

['0.0085',
 '0.05',
 '0.1',
 '0.16',
 '0.2',
 '0.25',
 '0.28',
 '0.3',
 '0.4',
 '0.5',
 '0.50',
 '0.54',
 '0.56',
 '0.60',
 '0.7',
 '0.82',
 '0.84',
 '0.9',
 '0.95',
 '0.99',
 '1.01',
 '1.1',
 '1.125',
 '1.14',
 '1.1650',
 '1.17',
 '1.18',
 '1.19',
 '1.2',
 '1.20',
 '1.24',
 '1.25',
 '1.26',
 '1.28',
 '1.35',
 '1.39',
 '1.4',
 '1.457',
 '1.46',
 '1.49',
 '1.5',
 '1.50',
 '1.55',
 '1.56',
 '1.5755',
 '1.5805',
 '1.6',
 '1.61',
 '1.637',
 '1.64',
 '1.65',
 '1.7',
 '1.75',
 '1.76',
 '1.8',
 '1.82',
 '1.8415',
 '1.85',
 '1.8500',
 '1.9',
 '1.916',
 '1.92',
 '10.19',
 '10.2',
 '10.5',
 '107.03',
 '107.9',
 '109.73',
 '11.10',
 '11.5',
 '11.57',
 '11.6',
 '11.72',
 '11.95',
 '112.9',
 '113.2',
 '116.3',
 '116.4',
 '116.7',
 '116.9',
 '118.6',
 '12.09',
 '12.5',
 '12.52',
 '12.68',
 '12.7',
 '12.82',
 '12.97',
 '120.7',
 '1206.26',
 '121.6',
 '126.1',
 '126.15',
 '127.03',
 '129.91',
 '13.1',
 '13.15',
 '13.5',
 '13.50',
 '13.625',
 '13.65',
 '13.73',
 '13.8',
 '13.90',
 '130.6',
 '130.7',
 '

In [113]:
# Upper-case letters followed by a literal dollar sign at the end of the token:
# "\$" is the escaped, literal "$", while the final "$" is the end anchor.
# A raw string avoids the invalid "\$" escape sequence of a normal string.
[w for w in wsj if re.search(r"^[A-Z]+\$$", w)]

['C$', 'US$']

In [115]:
# Four-digit tokens: {4} repeats the preceding character or set exactly 4 times.
list(filter(re.compile("^[0-9]{4}$").search, wsj))

['1614',
 '1637',
 '1787',
 '1901',
 '1903',
 '1917',
 '1925',
 '1929',
 '1933',
 '1934',
 '1948',
 '1953',
 '1955',
 '1956',
 '1961',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1971',
 '1972',
 '1973',
 '1975',
 '1976',
 '1977',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2005',
 '2009',
 '2017',
 '2019',
 '2029',
 '3057',
 '8300']

In [116]:
# A number, a literal hyphen, then a lowercase word of 3 to 5 letters:
# {3,5} means "at least 3 and at most 5 repetitions" of the preceding set.
[token for token in wsj if re.search("^[0-9]+-[a-z]{3,5}$", token) is not None]

['10-day',
 '10-lap',
 '10-year',
 '100-share',
 '12-point',
 '12-year',
 '14-hour',
 '15-day',
 '150-point',
 '190-point',
 '20-point',
 '20-stock',
 '21-month',
 '237-seat',
 '240-page',
 '27-year',
 '30-day',
 '30-point',
 '30-share',
 '30-year',
 '300-day',
 '36-day',
 '36-store',
 '42-year',
 '50-state',
 '500-stock',
 '52-week',
 '69-point',
 '84-month',
 '87-store',
 '90-day']

In [118]:
# Three hyphen-separated lowercase parts: {5,} means 5 or more repetitions,
# {2,3} two to three, and {,6} at most six (including none).
list(filter(re.compile("^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$").search, wsj))

['black-and-white',
 'bread-and-butter',
 'father-in-law',
 'machine-gun-toting',
 'savings-and-loan']

In [119]:
# Tokens ending in "ed" or "ing": the parentheses group the alternatives so the
# $ anchor applies to both of them.
[token for token in wsj if re.search("(ed|ing)$", token) is not None]

['62%-owned',
 'Absorbed',
 'According',
 'Adopting',
 'Advanced',
 'Advancing',
 'Alfred',
 'Allied',
 'Annualized',
 'Anything',
 'Arbitrage-related',
 'Arbitraging',
 'Asked',
 'Assuming',
 'Atlanta-based',
 'Baking',
 'Banking',
 'Beginning',
 'Beijing',
 'Being',
 'Bermuda-based',
 'Betting',
 'Boeing',
 'Broadcasting',
 'Bucking',
 'Buying',
 'Calif.-based',
 'Change-ringing',
 'Citing',
 'Concerned',
 'Confronted',
 'Conn.based',
 'Consolidated',
 'Continued',
 'Continuing',
 'Declining',
 'Defending',
 'Depending',
 'Designated',
 'Determining',
 'Developed',
 'Died',
 'During',
 'Encouraged',
 'Encouraging',
 'English-speaking',
 'Estimated',
 'Everything',
 'Excluding',
 'Exxon-owned',
 'Faulding',
 'Fed',
 'Feeding',
 'Filling',
 'Filmed',
 'Financing',
 'Following',
 'Founded',
 'Fracturing',
 'Francisco-based',
 'Fred',
 'Funded',
 'Funding',
 'Generalized',
 'Germany-based',
 'Getting',
 'Guaranteed',
 'Having',
 'Heating',
 'Heightened',
 'Holding',
 'Housing',
 'Illumin

In [124]:
# Show what the parentheses change. Without them "ed|ing$" reads as
# "ed anywhere" OR "ing at the end", so any token merely containing "ed" matches.
# Print the tokens matched by the ungrouped pattern but not by the grouped one.
# Both patterns are checked in a single pass over wsj; the earlier version
# rebuilt the full list of grouped matches for every candidate token.
loose = re.compile("ed|ing$")
strict = re.compile("(ed|ing)$")
for word in wsj:
    if loose.search(word) and not strict.search(word):
        print (word)

Biedermann
Breeden
Cathedral
Cedric
Confederation
Credit
Federal
Federalist
Federation
Freddie
Frederick
Friedrichs
Impediments
Intermediate
Kennedy
Media
Medical
Medicine
Mercedes
Montedison
Nederlanden
Needham
Proceeds
Reddington
Redevelopment
Roederer
Speedway
Sweden
Teddy
Toledo
Wednesday
Wedtech
acknowledge
acknowledges
agreed-upon
allegedly
beds
buttoned-down
closed-end
comedies
concede
concedes
credentials
credibility
credit
creditor
creditors
credits
creditworthiness
deeds
discredit
edition
editions
editor
editorial
editorially
editors
education
educational
educators
exceedingly
exceeds
federal
federally
feeds
fixed-income
fixed-price
fixed-rate
freedom
freedoms
greedy
hundreds
immediate
immediately
impede
incredible
ingredients
intermediate
knowledge
knowledgeable
limited-partnership
medallions
media
medical
medicine
mediocre
needle-like
needs
needy
obedient
pediatrician
pianist-comedian
precedent
precedes
predecessor
predict
predictable
predictably
predicts
predispose
procedu

In [126]:
# Tokens containing "wit", "wet", "wait" or "woot" anywhere inside them.
list(filter(re.compile("w(i|e|ai|oo)t").search, wsj))

['Hymowitz',
 'Switzerland',
 'awaits',
 'bellwether',
 'notwithstanding',
 'switch',
 'switched',
 'wait',
 'waited',
 'waiting',
 'wherewithal',
 'witches',
 'with',
 'withdraw',
 'withdrawal',
 'withdrawn',
 'withdrew',
 'withhold',
 'within',
 'without',
 'withstand',
 'witness',
 'witnesses']