In [1]:
import numpy as np
import pandas as pd
import unicodedata
import sys

In [2]:
#读取数据集
url = 'G:\DownLoad\data\Amazon.csv'
df_amazon = pd.read_csv(url)

#创建文本
text_data = df_amazon['Reviews']

In [3]:
#去除文本两端的空格
strip_whitespace = [string.strip() for string in text_data]

#查看文本
strip_whitespace

["I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!",
 'nice phone, nice up grade from my pantach revue. Very clean set up and easy set up. never had an android phone but they are fantastic to say the least. perfect size for surfing and social media. great phone samsung',
 'Very pleased',
 'It works good but it goes slow sometimes but its a very good phone I love it',
 'Great phone to replace my lost phone. The only thing is the volume up button does not work, but I can still go into settings to adjust. Other than that, it does the job until I am eligible to upgrade my phone again.Thaanks!',
 'I already had a phone with problems... I know it stated it was used, but d

In [4]:
#删除句点
remove_periods = [string.replace(".", "") for string in strip_whitespace]
#查看文本
remove_periods

["I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one My Son liked his old one that finally fell apart after 25+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phoneI recommend this seller very highly & would but from them again!!",
 'nice phone, nice up grade from my pantach revue Very clean set up and easy set up never had an android phone but they are fantastic to say the least perfect size for surfing and social media great phone samsung',
 'Very pleased',
 'It works good but it goes slow sometimes but its a very good phone I love it',
 'Great phone to replace my lost phone The only thing is the volume up button does not work, but I can still go into settings to adjust Other than that, it does the job until I am eligible to upgrade my phone againThaanks!',
 'I already had a phone with problems I know it stated it was used, but dang, it did n

In [5]:
#创建一个标点字典
punctuation = dict.fromkeys(i for i in range(sys.maxunicode)
                           if unicodedata.category(chr(i)).startswith('P'))
#移除每个字符串中的标点
text_data1 = [string.translate(punctuation) for string in text_data]

#查看文本
text_data1

['I feel so LUCKY to have found this used phone to us  not used hard at all phone on line from someone who upgraded and sold this one My Son liked his old one that finally fell apart after 25+ years and didnt want an upgrade Thank you Seller we really appreciate it  your honesty re said used phoneI recommend this seller very highly  would but from them again',
 'nice phone nice up grade from my pantach revue Very clean set up and easy set up never had an android phone but they are fantastic to say the least perfect size for surfing and social media great phone samsung',
 'Very pleased',
 'It works good but it goes slow sometimes but its a very good phone I love it',
 'Great phone to replace my lost phone The only thing is the volume up button does not work but I can still go into settings to adjust Other than that it does the job until I am eligible to upgrade my phone againThaanks',
 'I already had a phone with problems I know it stated it was used but dang it did not state that it di

In [6]:
#提取词干
from nltk.stem.porter import PorterStemmer

#创建词干转化器
porter = PorterStemmer()

#应用词干
text_word = [porter.stem(word) for word in text_data1]

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

#创建一个词袋特征矩阵
count = CountVectorizer()
bag_of_words = count.fit_transform(text_word)

#查看特征名
bag_of_words
#上述代码的输出是一个稀疏矩阵

<349x2425 sparse matrix of type '<class 'numpy.int64'>'
	with 10436 stored elements in Compressed Sparse Row format>

In [8]:
#可以使用 toarray 查看每个观察值的词频统计矩阵 
print('toarray:')
bag_of_words.toarray()


toarray:


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
#使用 get_feature_names 方法能查看每个特征所对应的单词 
print('get)feature_names')
count.get_feature_names()

get)feature_names


['10',
 '100',
 '1080p',
 '1080pthis',
 '110',
 '112017',
 '115',
 '12',
 '1400',
 '1425',
 '15',
 '16',
 '17002100',
 '18',
 '1900',
 '1999',
 '1gb',
 '20',
 '2001',
 '200mb',
 '200pm',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '20month',
 '2100',
 '2300mah',
 '24',
 '24hour',
 '25',
 '25cm',
 '288',
 '2999',
 '2g',
 '2gand',
 '2gb',
 '2pm',
 '30',
 '3000',
 '302',
 '303',
 '30ish',
 '32mp',
 '35g',
 '35mm',
 '3999',
 '3d',
 '3g',
 '3inch',
 '3weeks',
 '40',
 '42',
 '422',
 '44',
 '442',
 '450',
 '4day',
 '4g',
 '4s',
 '50',
 '512mb',
 '55',
 '5th',
 '60',
 '700',
 '70s',
 '710',
 '78',
 '79',
 '82',
 '84',
 '84yearold',
 '850',
 '90',
 '91',
 '911another',
 '92',
 '93',
 '95',
 'aarp',
 'ability',
 'able',
 'ablebto',
 'about',
 'above',
 'absolutely',
 'abysmal',
 'accent',
 'accept',
 'accepting',
 'accepts',
 'access',
 'accessing',
 'accessories',
 'accident',
 'accidentally',
 'accommodate',
 'accomodate',
 'accomplishing',
 'account',
 'accounta',
 'accurate',
 'activate',


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

#创建一个TF-IDF特征矩阵
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_word)

#查看TF-IDF特征矩阵
feature_matrix

<349x2425 sparse matrix of type '<class 'numpy.float64'>'
	with 10436 stored elements in Compressed Sparse Row format>

In [11]:
#查看TF-IDF特征矩阵的稠密矩阵形式
feature_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
#查看特征的名字
tfidf.vocabulary_

{'feel': 793,
 'so': 1960,
 'lucky': 1255,
 'to': 2183,
 'have': 961,
 'found': 841,
 'this': 2159,
 'used': 2276,
 'phone': 1554,
 'us': 2270,
 'not': 1427,
 'hard': 952,
 'at': 228,
 'all': 152,
 'on': 1463,
 'line': 1210,
 'from': 854,
 'someone': 1974,
 'who': 2364,
 'upgraded': 2263,
 'and': 171,
 'sold': 1965,
 'one': 1466,
 'my': 1383,
 'son': 1982,
 'liked': 1204,
 'his': 989,
 'old': 1460,
 'that': 2137,
 'finally': 806,
 'fell': 795,
 'apart': 194,
 'after': 132,
 '25': 31,
 'years': 2412,
 'didnt': 609,
 'want': 2322,
 'an': 170,
 'upgrade': 2262,
 'thank': 2135,
 'you': 2418,
 'seller': 1863,
 'we': 2339,
 'really': 1720,
 'appreciate': 203,
 'it': 1098,
 'your': 2420,
 'honesty': 1002,
 're': 1709,
 'said': 1823,
 'phonei': 1556,
 'recommend': 1739,
 'very': 2301,
 'highly': 986,
 'would': 2402,
 'but': 355,
 'them': 2142,
 'again': 133,
 'nice': 1412,
 'up': 2258,
 'grade': 916,
 'pantach': 1514,
 'revue': 1798,
 'clean': 453,
 'set': 1879,
 'easy': 676,
 'never': 1407,
 