In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
from bs4 import BeautifulSoup
from sklearn import metrics
from sklearn.ensemble import VotingClassifier

In [5]:
#### train data
# Tab-separated file with a header row; quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the review text are read as ordinary characters.
train = pd.read_csv('train.csv', header=0, delimiter="\t", quoting=3)
id_train = train['id']
sentiment_train = train['sentiment']
reviews = train['review']

# Light-weight text cleaning; review_train collects one cleaned string per review.
review_train = []
for review in reviews:
  # Strip the HTML markup and keep only the text content
  review_text = BeautifulSoup(review, 'lxml').get_text()
  # Replace every character outside the whitelist (letters, digits, ( ) , ! ? ' `) with a space
  review_text = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", review_text)
  # Split contraction suffixes off the preceding word ("it's" -> "it 's")
  review_text = re.sub(r"\'s", " 's", review_text)
  review_text = re.sub(r"\'ve", " 've", review_text)
  review_text = re.sub(r"n\'t", " n't", review_text)
  review_text = re.sub(r"\'re", " 're", review_text)
  review_text = re.sub(r"\'d", " 'd", review_text)
  review_text = re.sub(r"\'ll", " 'll", review_text)
  # Surround the kept punctuation with spaces so each mark is its own token.
  # Only the *pattern* needs regex escaping: the previous replacement strings
  # (" \( ", " \) ", " \? ") wrote a literal backslash into the cleaned text.
  review_text = re.sub(r",", " , ", review_text)
  review_text = re.sub(r"!", " ! ", review_text)
  review_text = re.sub(r"\(", " ( ", review_text)
  review_text = re.sub(r"\)", " ) ", review_text)
  review_text = re.sub(r"\?", " ? ", review_text)
  review_text = re.sub(r"\s{2,}", " ", review_text)
  # Lower-case, split on whitespace, and re-join with single spaces
  words = review_text.lower().split()
  review_train.append(" ".join(words))

In [6]:
#### test data
# Tab-separated file with a header row; quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the review text are read as ordinary characters.
test = pd.read_csv('test.csv', header=0, delimiter="\t", quoting=3)
id_test = test['id']
sentiment_test = test['sentiment']
reviews = test['review']

# Light-weight text cleaning; review_test collects one cleaned string per review.
review_test = []
for review in reviews:
  # Strip the HTML markup and keep only the text content
  review_text = BeautifulSoup(review, 'lxml').get_text()
  # Replace every character outside the whitelist (letters, digits, ( ) , ! ? ' `) with a space
  review_text = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", review_text)
  # Split contraction suffixes off the preceding word ("it's" -> "it 's")
  review_text = re.sub(r"\'s", " 's", review_text)
  review_text = re.sub(r"\'ve", " 've", review_text)
  review_text = re.sub(r"n\'t", " n't", review_text)
  review_text = re.sub(r"\'re", " 're", review_text)
  review_text = re.sub(r"\'d", " 'd", review_text)
  review_text = re.sub(r"\'ll", " 'll", review_text)
  # Surround the kept punctuation with spaces so each mark is its own token.
  # Only the *pattern* needs regex escaping: the previous replacement strings
  # (" \( ", " \) ", " \? ") wrote a literal backslash into the cleaned text.
  review_text = re.sub(r",", " , ", review_text)
  review_text = re.sub(r"!", " ! ", review_text)
  review_text = re.sub(r"\(", " ( ", review_text)
  review_text = re.sub(r"\)", " ) ", review_text)
  review_text = re.sub(r"\?", " ? ", review_text)
  review_text = re.sub(r"\s{2,}", " ", review_text)
  # Lower-case, split on whitespace, and re-join with single spaces
  words = review_text.lower().split()
  review_test.append(" ".join(words))

In [7]:
# Reference: http://blog.csdn.net/longxinchen_ml/article/details/50629613
# The three switches below are real booleans: recent scikit-learn releases
# validate constructor arguments and no longer accept the ints 1/0 for them.
tfidf = TFIDF(min_df=2,              # drop terms that occur in fewer than 2 documents
           max_df=0.5,               # drop terms that occur in more than half of the documents
           max_features=None,
           strip_accents='unicode',
           analyzer='word',
           token_pattern=r'\w{1,}',  # a token is any run of one or more word characters
           ngram_range=(1, 3),       # unigrams, bigrams and trigrams
           use_idf=True,
           smooth_idf=True,
           sublinear_tf=True)        # use 1 + log(tf) instead of the raw term frequency
           # stop_words='english' is left commented out, so English stop words are kept

# Vectorise the train and test reviews together so both share one vocabulary.
# NOTE(review): the IDF statistics therefore also see the test reviews and the
# hold-out rows used for validation below, so the hold-out AUCs may be slightly
# optimistic.
corpus_all = review_train + review_test
len_train = len(review_train)

# fit_transform is the single-call equivalent of fit() followed by transform()
data_all = tfidf.fit_transform(corpus_all)

# Split the stacked matrix back into its train and test rows
train_x = data_all[:len_train]
test_x = data_all[len_train:]

In [74]:
from sklearn.linear_model import LogisticRegression as LR, Ridge

## LR train
# Fit on the first 16000 training rows; the remaining rows act as a hold-out set.
lr = LR().fit(train_x[:16000], sentiment_train[:16000])
# Column 1 of predict_proba is the probability of the second entry of lr.classes_
predictions1 = lr.predict_proba(train_x[16000:])[:, 1]
# Hold-out ROC AUC of the logistic-regression scores
print(metrics.roc_auc_score(sentiment_train[16000:], predictions1))

0.961242436434


In [78]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import Ridge

## LR train (TF-IDF values multiplied by 5 during fitting)
# `* 5` on the sparse matrix scales every feature value; it does not repeat rows.
# The labels are passed unchanged: the previous `sentiment_train[:16000]*5` only
# renamed the classes to multiples of 5, which leaves the fitted model and
# column 1 of predict_proba the same.
lr = LR()
lr.fit(train_x[:16000] * 5, sentiment_train[:16000])
# NOTE(review): the hold-out features are scored at their original scale, not * 5.
# The ranking (and so the AUC) should be unaffected, but the probability values
# are on a different scale than the one the model was fitted on - confirm intent.
predictions1 = lr.predict_proba(train_x[16000:])[:, 1]
print(metrics.roc_auc_score(sentiment_train[16000:], predictions1))

0.970984962009


In [81]:
## Rd train
# Ridge regression on the labels; its raw predictions are used directly as
# ranking scores for the ROC AUC on the hold-out rows (16000 onwards).
Rd = Ridge(alpha=0.3).fit(train_x[:16000], sentiment_train[:16000])
predictions2 = Rd.predict(train_x[16000:])
print(metrics.roc_auc_score(sentiment_train[16000:], predictions2))

0.972805293765


In [134]:
## Rd test
# Refit Ridge on every training row, then score the test matrix.
Rd = Ridge(alpha=0.3).fit(train_x, sentiment_train)
t_predictions2 = Rd.predict(test_x)

In [108]:
## NB
from sklearn.naive_bayes import MultinomialNB as MNB
NB = MNB()
# Fit on the first 16000 rows with the TF-IDF values multiplied by 10. `* 10`
# scales the feature values; it does not repeat rows. The labels are passed
# unchanged: the previous `sentiment_train[:16000]*10` only renamed the classes.
# (A stray `MNB(alpha=1, class_prior=None, fit_prior=True)` statement, which
# built an estimator and discarded it, has been removed.)
NB.fit(train_x[:16000] * 10, sentiment_train[:16000])
# NOTE(review): the hold-out features are scored at their original scale, not * 10.
predictions3 = NB.predict_proba(train_x[16000:])[:, 1]
print("NB", metrics.roc_auc_score(sentiment_train[16000:], predictions3))

NB 0.961958316903


In [135]:
## NB test
from sklearn.naive_bayes import MultinomialNB as MNB
NB = MNB()
# Refit on every training row with the TF-IDF values multiplied by 10. `* 10`
# scales the feature values; it does not repeat rows. The labels are passed
# unchanged: the previous `sentiment_train*10` only renamed the classes.
# (A stray `MNB(alpha=1, class_prior=None, fit_prior=True)` statement, which
# built an estimator and discarded it, has been removed.)
NB.fit(train_x * 10, sentiment_train)
# NOTE(review): the test features are scored at their original scale, not * 10.
t_predictions3 = NB.predict_proba(test_x)[:, 1]

In [137]:
## KNN
from sklearn import neighbors

# k-nearest-neighbours classifier with the library's default settings,
# fitted on the first 16000 rows and scored on the remaining hold-out rows.
knn = neighbors.KNeighborsClassifier().fit(train_x[:16000], sentiment_train[:16000])
predictions4 = knn.predict_proba(train_x[16000:])[:, 1]
print("knn", metrics.roc_auc_score(sentiment_train[16000:], predictions4))

knn 0.894703809769


In [138]:
## KNN test
from sklearn import neighbors

# Refit the default k-nearest-neighbours classifier on every training row,
# then take the second predict_proba column for the test rows.
knn = neighbors.KNeighborsClassifier().fit(train_x, sentiment_train)
t_predictions4 = knn.predict_proba(test_x)[:, 1]

In [90]:
# Weighted blend of the hold-out scores: LR 0.1, Ridge 2, NB 0.1, KNN 0.4
# (terms are added left to right, in the listed order).
predictions = sum((
    0.1 * predictions1,
    2 * predictions2,
    0.1 * predictions3,
    0.4 * predictions4,
))
print("ensemble", metrics.roc_auc_score(sentiment_train[16000:], predictions))

ensemble 0.975294997513


In [91]:
# Weighted blend of the hold-out scores without LR: Ridge 2, NB 0.1, KNN 0.4
# (terms are added left to right, in the listed order).
predictions = sum((
    2 * predictions2,
    0.1 * predictions3,
    0.4 * predictions4,
))
print("ensemble", metrics.roc_auc_score(sentiment_train[16000:], predictions))

ensemble 0.97531325084


In [92]:
# Weighted blend of the hold-out scores: Ridge 2.3, NB 0.1, KNN 0.4
# (terms are added left to right, in the listed order).
predictions = sum((
    2.3 * predictions2,
    0.1 * predictions3,
    0.4 * predictions4,
))
print("ensemble", metrics.roc_auc_score(sentiment_train[16000:], predictions))

ensemble 0.975317001524


In [141]:
# Weighted blend of the hold-out scores: Ridge 2.3, NB 0.3, KNN 0.4.
# These are the same weights the submission cell applies to the test scores.
predictions = sum((
    2.3 * predictions2,
    0.3 * predictions3,
    0.4 * predictions4,
))
print("ensemble", metrics.roc_auc_score(sentiment_train[16000:], predictions))

ensemble 0.975366260501


In [129]:
# Weighted blend of the hold-out scores with LR in place of Ridge:
# LR 5, NB 0.2, KNN 0.4 (terms are added left to right, in the listed order).
predictions = sum((
    5 * predictions1,
    0.2 * predictions3,
    0.4 * predictions4,
))
print("ensemble", metrics.roc_auc_score(sentiment_train[16000:], predictions))

ensemble 0.973910495188


In [142]:
## write result
# Blend the test-set scores (Ridge 2.3, NB 0.3, KNN 0.4) and divide the
# weighted sum by 3 before writing it out.
predictions = sum((
    2.3 * t_predictions2,
    0.3 * t_predictions3,
    0.4 * t_predictions4,
)) / 3
# Two-column submission table: review id and blended score
data = dict(id=id_test, sentiment=predictions)
data_df = pd.DataFrame(data)
data_df.to_csv('lc_0717.csv', index=False)