In [7]:
import pandas as pd
train_df = pd.read_csv('../data/train.csv',encoding='utf-8')
test_df = pd.read_csv('../data/20190527_test.csv',encoding='utf-8')

print(train_df[train_df.isnull().values==True]) # find anomaly rows and fix by hand

print(train_df.head())
print(test_df.head())

print(train_df.shape)
print(train_df['label'][:10])

Empty DataFrame
Columns: [ID, review, label]
Index: []
   ID                                             review     label
0   1                       Jo bhi ap se tou behtar hoon  Negative
1   2          ya Allah meri sister Affia ki madad farma  Positive
2   3  Yeh khud chahta a is umar main shadi krna.  ha...  Negative
3   4        Tc ? Apky mun xe exe alfax achy nae lgty 😒💃  Negative
4   5                                               Good  Positive
   ID                                             review
0   1                         masha allah ache cheez hai
1   2  Wazir e Mumlikat Saira Afzal K Walid Ko Shikas...
2   3                          SelfieKing Ban Gia Dulha 
3   4  Buhat he ache quality ke product hay.... i lov...
4   5  Hahahah :p naam letaa tu ziada ddoubt hootaa m...
(6328, 3)
0    Negative
1    Positive
2    Negative
3    Negative
4    Positive
5    Negative
6    Negative
7    Positive
8    Positive
9    Negative
Name: label, dtype: object


In [9]:
def encode_label(text):
    if(text=="Positive"):
        return 1
    else:
        return 0

In [11]:
train_df['encoded_label'] = train_df.label.apply(encode_label)
train_df['encoded_label'][:10]

0    0
1    1
2    0
3    0
4    1
5    0
6    0
7    1
8    1
9    0
Name: encoded_label, dtype: int64

In [12]:
x_train = train_df[['ID','review']]
y_train = train_df.encoded_label
x_test = test_df[['ID','review']]

In [16]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
def review_to_wordlist(review):
    '''
    把IMDB的评论转成词序列
    参考：http://blog.csdn.net/longxinchen_ml/article/details/50629613
    '''
    # 去掉HTML标签，拿到内容
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # 用正则表达式取出符合规范的部分
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    # 小写化所有的词，并转成词list
    words = review_text.lower().split()
    # 
    new_review = " ".join(words)
    return new_review

In [17]:
a="Tu aa to sae dekh kia kia sa??? pilati😂😎😋sss:)ss.…  "
b=review_to_wordlist(a)
print(b)

tu aa to sae dekh kia kia sa pilati sss ss


In [18]:
x_train['spaced_review'] = x_train.review.apply(review_to_wordlist)
x_test['spaced_review'] = x_test.review.apply(review_to_wordlist)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
x_train['spaced_review'][:5]

0                         jo bhi ap se tou behtar hoon
1            ya allah meri sister affia ki madad farma
2    yeh khud chahta a is umar main shadi krna had ogi
3               tc apky mun xe exe alfax achy nae lgty
4                                                 good
Name: spaced_review, dtype: object

In [20]:
x_test['spaced_review'][:5]

0                           masha allah ache cheez hai
1    wazir e mumlikat saira afzal k walid ko shikas...
2                             selfieking ban gia dulha
3    buhat he ache quality ke product hay i love da...
4    hahahah p naam letaa tu ziada ddoubt hootaa ma...
Name: spaced_review, dtype: object

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=2, # 最小支持度为2
           max_features=None,
           strip_accents='unicode',
           analyzer='word',
           token_pattern=r'\w{1,}',
           ngram_range=(1, 3),  # 二元文法模型
           use_idf=1,
           smooth_idf=1,
           sublinear_tf=1,
           stop_words = 'english') # 去掉英文停用词

In [22]:
data_all = x_train['spaced_review'].tolist() + x_test['spaced_review'].tolist()
print(len(data_all))

In [24]:
vectorizer.fit(data_all)
data_all = vectorizer.transform(data_all)
len_train = len(x_train['spaced_review'])
print(len_train)
# 恢复成训练集和测试集部分
train_x = data_all[:len_train]
test_x = data_all[len_train:]
print ('TF-IDF处理结束.')

6328
TF-IDF处理结束.


In [26]:
from sklearn.naive_bayes import MultinomialNB as MNB

model_NB = MNB()
model_NB.fit(train_x, train_df.encoded_label)
MNB(alpha=1.0, class_prior=None, fit_prior=True)

from sklearn.model_selection import cross_val_score
import numpy as np

print ("多项式贝叶斯分类器10折交叉验证得分: ", np.mean(cross_val_score(model_NB, train_x, train_df.encoded_label, cv=10, scoring='roc_auc')))

多项式贝叶斯分类器10折交叉验证得分:  0.8631329048865058


In [27]:
test_predicted = np.array(model_NB.predict_proba(test_x))

In [28]:
print(test_predicted[:10])

[[0.13321244 0.86678756]
 [0.31632954 0.68367046]
 [0.65443442 0.34556558]
 [0.10934261 0.89065739]
 [0.62783136 0.37216864]
 [0.31400403 0.68599597]
 [0.21524792 0.78475208]
 [0.45230454 0.54769546]
 [0.33818958 0.66181042]
 [0.52465364 0.47534636]]


In [30]:
df = pd.DataFrame(zip(x_test.ID,test_predicted[:,1]), columns=["ID","Pred"])
df.to_csv('result.csv', index=False, float_format='%.6f')