In [3]:
import dataloader.path
import os
import re

from bs4 import BeautifulSoup
import pandas as pd

import nltk
from nltk.corpus import stopwords
nltk.data.path.append(dataloader.path.NLTK_DATA)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV


train = pd.read_csv(os.path.join(dataloader.path.IMDB, 'labeledTrainData.tsv'), delimiter='\t')
test = pd.read_csv(os.path.join(dataloader.path.IMDB, 'testData.tsv'), delimiter='\t')

In [4]:
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [6]:
def review_to_text(review, remove_stopwords):
    
    # 去除html标签
    raw_text = BeautifulSoup(review, 'html').get_text()
    
    # 去除非字母字符
    letters = re.sub('[^a-zA-z]', ' ', raw_text)
    words = letters.lower().split()
    
    # 去除停用词
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words]
    
    # 返回处理完的列表
    return words

In [9]:
# 数据处理
x_train = list()
for review in train['review']:
    x_train.append(' '.join(review_to_text(review, True)))

x_test = list()
for review in test['review']:
    x_test.append(' '.join(review_to_text(review, True)))
    
y_train = train['sentiment']

# 搭建两组不同的贝叶斯分类器
pip_count = Pipeline([('count_vec', CountVectorizer(analyzer='word')), ('mnb', MultinomialNB())])
pip_tfidf = Pipeline([('tfidf_vec', TfidfVectorizer(analyzer='word')), ('mnb', MultinomialNB())])

# 配置超参数选择
params_count = {'count_vec__binary': [True, False], 
                'count_vec__ngram_range': [(1, 1), (1, 2)],
                'mnb__alpha': [0.1, 0.1, 10.0]}

params_tfidf = {'tfidf_vec__binary': [True, False], 
                'tfidf_vec__ngram_range': [(1, 1), (1, 2)],
                'mnb__alpha': [0.1, 0.1, 10.0]}

# 超参数搜索
gs_count = GridSearchCV(pip_count, params_count, cv=4, n_jobs=-1, verbose=1)
gs_count.fit(x_train, y_train)

print(gs_count.best_params_)
print(gs_count.best_score_)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html5lib")

  markup_type=markup_type))


Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  5.4min finished


{'count_vec__ngram_range': (1, 2), 'mnb__alpha': 10.0, 'count_vec__binary': True}
0.87796


In [12]:
count_y_predict=gs_count.predict(x_test)

submission_count=pd.DataFrame({'id': test['id'], 'sentiment': count_y_predict})

submission_count.to_csv('/Users/nemos/dataset/kaggle/IMDB/sub/sub_count.csv', index=False)