In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import CountVectorizer

## 1. 数据加载

In [2]:
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [31]:
train.shape

(25000, 3)

In [34]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [35]:
train.describe()

Unnamed: 0,sentiment
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [33]:
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [5]:
train['review'][0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [6]:
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

## 2. 先拿一个文本进行处理测试

In [7]:
example = BeautifulSoup(train['review'][0]) # 去掉HTML特殊字符
example.get_text()

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 2

In [8]:
# 去掉标点符号和数字
letters_only = re.sub('[^a-zA-Z]', ' ', example.get_text()) # 对于不是字母的字符用空格代替，如果不用空格将会整个文本连成一串
letters_only

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    m

In [9]:
# 把评论换成小写
lower_case = letters_only.lower()

In [10]:
# 分成单独的单词
words = lower_case.split()

In [12]:
# 删除停用词
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [13]:
words = [w for w in words if not w in stopwords.words('english')]
len(words)

219

## 3. 测试结束，编写遍历处理函数

In [14]:
# 创建一个文本清理函数对整个DataFrame操作
# data是字符串类型
def review_to_words(raw_review):
    # 1.删除HTML
    review_HTML = BeautifulSoup(raw_review).get_text()
    # 2.删除非字母字符
    review_letters = re.sub("[^a-zA-Z]", ' ', review_HTML)
    # 3.lower(),分词
    # 因为不能对list做lower()和split(),所以stopwords=[]要放在最后
    review_split = review_letters.lower().split()
    
    # 4.stopwords
    # 将stopwordslist转换成set。因为我们将会调用这个函数成千上万次，所以它需要更快，而且在Python中搜索集合比搜索列表要快得多。
    stops = set(stopwords.words('english'))
    review_words = [w for w in review_split if not w in stops]
    
    # 把这些分完的词重新组合成一段
    return (' '.join(review_words)) #  join() 方法用于将序列(此处的序列为list)中的元素以指定的字符(空格)连接生成一个新的字符串。
    


## 4. 对一条数据测试函数

In [15]:
review_to_words(train['review'][0])

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate workin

## 5. 遍历处理所有数据

In [16]:
# 循环处理训练集所有数据
# 首先确定要处理的数据的数目
num_reviews = train['review'].size
print('Cleaning and parsing the training set movie reviews...\n')
# 初始化列表用于存储处理后的数据
clean_train_reviews = []

# 遍历
for i in range(num_reviews):
    
    # 等待一段冗长的代码运行是很烦人的。
    # 编写代码以便提供状态更新是很有帮助的。要让Python在每处理1000个评审之后打印一个状态更新，请尝试在上面的代码中添加一两行代码:
    if ((i+1) % 5000 == 0):
        
#         print('Review %d of %d\n ' % (i+1, num_reviews))
        print("已经处理{}条数据！\n".format(i+1))
        if((i+1) == 25000):
            print('处理结束！')
    
        
    clean_train_reviews.append(review_to_words(train['review'][i]))

Cleaning and parsing the training set movie reviews...

已经处理5000条数据！

已经处理10000条数据！

已经处理15000条数据！

已经处理20000条数据！

已经处理25000条数据！

处理结束！


## 6. 使用Bag of Words (Using scikit-learn)创建词向量

In [17]:
# 以本次数据集来说，一共有25000句话，共有8000(最大特征，自己创建)个词，故vectorizer.shape=(25000, 8000) 
vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   
                             max_features = 8000) 
print(type(vectorizer))

<class 'sklearn.feature_extraction.text.CountVectorizer'>


In [18]:
train_data_features = vectorizer.fit_transform(clean_train_reviews)

In [19]:
train_data_features

<25000x8000 sparse matrix of type '<class 'numpy.int64'>'
	with 2119825 stored elements in Compressed Sparse Row format>

In [20]:
train_data_features = train_data_features.toarray()

In [21]:
train_data_features.shape

(25000, 8000)

In [22]:
sum(train_data_features[0])

190

In [23]:
# vocab为所有词汇组成的列表
vocab = vectorizer.get_feature_names()
type(vocab)
vocab[:10]

list

['aaron',
 'abandon',
 'abandoned',
 'abc',
 'abilities',
 'ability',
 'able',
 'abomination',
 'abortion',
 'abound']

In [24]:
import numpy as np

In [25]:
dist = np.sum(train_data_features, axis=0)
type(dist)
dist.shape
# zip() 函数用于将可迭代的对象作为参数，将对象中对应的元素打包成一个个元组，然后返回由这些元组组成的列表。
for tag, count in zip(vocab, dist):
    print(count, tag)

numpy.ndarray

(8000,)

48 aaron
51 abandon
187 abandoned
125 abc
108 abilities
454 ability
1259 able
42 abomination
50 abortion
51 abound
85 abraham
73 abrupt
60 abruptly
116 absence
83 absent
352 absolute
1485 absolutely
70 absorbed
57 absorbing
306 absurd
76 absurdity
44 abundance
192 abuse
77 abused
91 abusive
98 abysmal
297 academy
485 accent
203 accents
300 accept
130 acceptable
71 acceptance
144 accepted
57 accepting
74 accepts
92 access
60 accessible
318 accident
46 accidental
200 accidentally
41 acclaim
75 acclaimed
88 accompanied
45 accompanying
77 accomplish
124 accomplished
296 according
186 account
57 accounts
81 accuracy
284 accurate
65 accurately
123 accused
69 ace
179 achieve
139 achieved
124 achievement
45 achievements
54 achieves
90 acid
47 acknowledge
54 acquired
971 across
1251 act
658 acted
6490 acting
3354 action
311 actions
75 active
83 activities
63 activity
2389 actor
4486 actors
1219 actress
369 actresses
394 acts
793 actual
4237 actually
148 ad
302 adam
98 adams
43 adapt
453 adaptat

## 随机森林分类

In [27]:
print ("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier


forest = RandomForestClassifier(n_estimators=100) 

forest = forest.fit(train_data_features, train["sentiment"])

Training the random forest...


In [28]:
# Read the test data
test = pd.read_csv("testData.tsv", 
                   header=0, 
                   delimiter="\t", 
                   quoting=3 )

# Verify that there are 25,000 rows and 2 columns
print (test.shape)

(25000, 2)


In [29]:
# Create an empty list and append the clean reviews one by one
num_reviews = len(test["review"])
clean_test_reviews = [] 

print ("Cleaning and parsing the test set movie reviews...\n")
for i in range(0, num_reviews):
    if((i+1) % 5000==0):
        print("Review %d of %d\n" % (i+1, num_reviews))
    clean_review = review_to_words( test["review"][i] )
    clean_test_reviews.append( clean_review )

Cleaning and parsing the test set movie reviews...

Review 5000 of 25000

Review 10000 of 25000

Review 15000 of 25000

Review 20000 of 25000

Review 25000 of 25000



In [30]:
# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Use pandas to write the comma-separated output file
output.to_csv("Bag_of_Words_model.csv", index=False, quoting=3)