# 英文文本预处理

1、英文缩写替换

In [2]:
text = "The story loses its bite in a last-minute happy ending that's even less plausible than the rest of the picture . It's funny ."

# Map each contraction found in the sample sentence to its expanded form.
# Matching is case-sensitive, which is why "It's" is listed with a capital I.
contractions = {"that's": "that is", "It's": "It is"}

expanded = text
for short_form, long_form in contractions.items():
    expanded = expanded.replace(short_form, long_form)

# Bare last expression so the notebook displays the expanded sentence.
expanded

'The story loses its bite in a last-minute happy ending that is even less plausible than the rest of the picture . It is funny .'

2、大写字母转换为小写字母

In [3]:
text = 'The story loses its bite in a last-minute happy ending that is even less plausible than the rest of the picture . It is funny .'

# Normalise case so that "The" and "the" are treated as the same token later on.
lowercased = text.lower()

# Bare last expression so the notebook displays the lower-cased sentence.
lowercased

'the story loses its bite in a last-minute happy ending that is even less plausible than the rest of the picture . it is funny .'

3、删除标点符号、数字及其它特殊字符

In [5]:
import re

text = 'the story loses its bite in a 12 last-minute happy ending %@ that is even less plausible #$ than the rest of the picture . it is funny .'

# Replace every character that is not an ASCII letter (digits, punctuation,
# symbols, and the existing spaces too) with a single space.
non_letter = re.compile("[^a-zA-Z]")
text = non_letter.sub(" ", text)

# The substitution leaves runs of spaces behind; splitting on whitespace and
# re-joining with one space collapses them and trims both ends.
words = text.split()
" ".join(words)

'the story loses its bite in a last minute happy ending that is even less plausible than the rest of the picture it is funny'

4、分词

In [9]:
from nltk.tokenize import word_tokenize

text = 'the story loses its bite in a last minute happy ending that is even less plausible than the rest of the picture it is funny'

# Tokenise the same sentence two ways so the results can be compared side by side.
nltk_tokens = word_tokenize(text)  # NLTK tokenizer
split_tokens = text.split()        # plain whitespace split

print("使用nltk库进行分词：\n", nltk_tokens)
print("使用split函数进行分词：\n", split_tokens)

使用nltk库进行分词：
 ['the', 'story', 'loses', 'its', 'bite', 'in', 'a', 'last', 'minute', 'happy', 'ending', 'that', 'is', 'even', 'less', 'plausible', 'than', 'the', 'rest', 'of', 'the', 'picture', 'it', 'is', 'funny']
使用split函数进行分词：
 ['the', 'story', 'loses', 'its', 'bite', 'in', 'a', 'last', 'minute', 'happy', 'ending', 'that', 'is', 'even', 'less', 'plausible', 'than', 'the', 'rest', 'of', 'the', 'picture', 'it', 'is', 'funny']


5、词干提取

In [16]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer

# Words used to compare the two stemmers.
sample_words = ('happy', 'country')

# Approach 1: Porter stemmer.
stem_porter = PorterStemmer()
for word in sample_words:
    print(stem_porter.stem(word))

# Approach 2: Snowball stemmer; the language must be given explicitly.
snowball_stem = SnowballStemmer("english")
for word in sample_words:
    print(snowball_stem.stem(word))

happi
countri
happi
countri


6、词形还原

In [15]:
from nltk.stem.wordnet import WordNetLemmatizer

lemma = WordNetLemmatizer()

# Each pair is (word, part-of-speech tag handed to the lemmatizer via `pos`).
examples = (('was', 'v'), ('cats', 'n'))

for word, pos_tag in examples:
    print(lemma.lemmatize(word, pos=pos_tag))

be
cat


7、删除停用词

In [17]:
from nltk.corpus import stopwords

# English stop-word list shipped with the NLTK corpus.
stop_words = stopwords.words("english")

text = 'the story loses its bite in a last minute happy ending that is even less plausible than the rest of the picture it is funny'

# Walk the whitespace-separated tokens and keep those absent from the stop-word list.
del_stopwords = []
for token in text.split():
    if token in stop_words:
        continue
    del_stopwords.append(token)

print(" ".join(del_stopwords))

story loses bite last minute happy ending even less plausible rest picture funny


# 中文文本预处理

1、删除标点符号、数字、字母及其它字符

In [19]:
import re

text = "$$北京时间5月21日，2019赛季亚洲冠军联赛G、H组将展开小组赛第六轮争夺。%&\
中超两强北京国安和上海上港都面临生死大战，好在两支队伍都掌握着晋级主动权，只要取胜即可锁定小组出线权。@#\
在国内赛场风生水起的北京国安，渴望改写在亚冠联赛从未有过连续从小组中脱颖而出的历史；\
伤兵满营的上海上港，则希望延续只要参加亚冠比赛就必晋级淘汰赛阶段的纪录。$$"

text = re.sub("[，。？‘’“”《》；：！——……（）【】、a-zA-Z0-9@#$%&*/\,.?!""''{}+<>~]", "", text)

print(text)

北京时间月日赛季亚洲冠军联赛组将展开小组赛第六轮争夺中超两强北京国安和上海上港都面临生死大战好在两支队伍都掌握着晋级主动权只要取胜即可锁定小组出线权在国内赛场风生水起的北京国安渴望改写在亚冠联赛从未有过连续从小组中脱颖而出的历史伤兵满营的上海上港则希望延续只要参加亚冠比赛就必晋级淘汰赛阶段的纪录


2、jieba分词

In [22]:
import jieba

text = "他毕业于北京大学计算机学院"

# Segment the same sentence with each of the three jieba modes.
precise_tokens = list(jieba.cut(text, cut_all=False))  # precise mode
full_tokens = list(jieba.cut(text, cut_all=True))      # full mode
search_tokens = list(jieba.cut_for_search(text))       # search-engine mode

# Print in the order precise -> full -> search engine.
for tokens in (precise_tokens, full_tokens, search_tokens):
    print(tokens)

['他', '毕业', '于', '北京大学', '计算机', '学院']
['他', '毕业', '于', '北京', '北京大学', '大学', '计算', '计算机', '算机', '学院']
['他', '毕业', '于', '北京', '大学', '北京大学', '计算', '算机', '计算机', '学院']


3、删除停用词

In [25]:
import jieba

text = "北京时间月日赛季亚洲冠军联赛组将展开小组赛第六轮争夺中超两强北京国安和上海上港都面临生死大战\
好在两支队伍都掌握着晋级主动权只要取胜即可锁定小组出线权在国内赛场风生水起的北京国安渴望改写在亚冠联赛从未有过连续从小组中脱颖而出的历史\
伤兵满营的上海上港则希望延续只要参加亚冠比赛就必晋级淘汰赛阶段的纪录"

# Hand-written stop-word list for this example.
# NOTE: this rebinds the name `stopwords`, which the English stop-word cell
# imported from nltk.corpus; that cell re-imports it when it is run again.
# NOTE: the recorded segmentation splits "风生水起" into "风生水" + "起", so the
# "风生水起" entry never matches a token and both pieces survive the filter.
stopwords = ["月", "日", "的", "在", "只要" , "风生水起", "则", "好", "和", "都", "就", "必", "着", "从"]

# Segment with jieba's default (precise) mode.
text_cut = list(jieba.cut(text))
print("精准模式：\n", text_cut)

# Keep only the tokens that are not listed as stop words.
del_stopwords = []
for token in text_cut:
    if token in stopwords:
        continue
    del_stopwords.append(token)

print("删除停用词：\n", del_stopwords)

精准模式：
 ['北京', '时间', '月', '日', '赛季', '亚洲', '冠军联赛', '组将', '展开', '小组赛', '第六轮', '争夺', '中超', '两强', '北京国安', '和', '上海', '上港', '都', '面临', '生死', '大战', '好', '在', '两支', '队伍', '都', '掌握', '着', '晋级', '主动权', '只要', '取胜', '即可', '锁定', '小组', '出线权', '在', '国内', '赛场', '风生水', '起', '的', '北京国安', '渴望', '改写', '在', '亚冠', '联赛', '从未有过', '连续', '从', '小组', '中', '脱颖而出', '的', '历史', '伤兵', '满营', '的', '上海', '上港', '则', '希望', '延续', '只要', '参加', '亚冠', '比赛', '就', '必', '晋级', '淘汰赛', '阶段', '的', '纪录']
删除停用词：
 ['北京', '时间', '赛季', '亚洲', '冠军联赛', '组将', '展开', '小组赛', '第六轮', '争夺', '中超', '两强', '北京国安', '上海', '上港', '面临', '生死', '大战', '两支', '队伍', '掌握', '晋级', '主动权', '取胜', '即可', '锁定', '小组', '出线权', '国内', '赛场', '风生水', '起', '北京国安', '渴望', '改写', '亚冠', '联赛', '从未有过', '连续', '小组', '中', '脱颖而出', '历史', '伤兵', '满营', '上海', '上港', '希望', '延续', '参加', '亚冠', '比赛', '晋级', '淘汰赛', '阶段', '纪录']
