https://www.jianshu.com/p/29aa3ad63f9d
1. 特征向量化；
2. 朴素贝叶斯分类。

# 1.加载数据

In [2]:
import pandas as pd

In [3]:
train_df = pd.read_csv('../data/train.csv',encoding='utf-8')
test_df = pd.read_csv('../data/20190520_test.csv',encoding='utf-8')

In [4]:
print(train_df[train_df.isnull().values==True]) # find anomaly rows and fix by hand
# print(test_df.isnull().any())

Empty DataFrame
Columns: [ID, review, label]
Index: []


In [5]:
train_df.head()

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,Negative
1,2,ya Allah meri sister Affia ki madad farma,Positive
2,3,Yeh khud chahta a is umar main shadi krna. ha...,Negative
3,4,Tc ? Apky mun xe exe alfax achy nae lgty 😒💃,Negative
4,5,Good,Positive


In [6]:
test_df.head()

Unnamed: 0,ID,review
0,1,Phr tissuw se saaf
1,2,Jail Road Per Firing Se 1 Shakhs Janbahaq
2,3,mehfil loot li aunty ne
3,4,Rehnay do butt sahb nay galiya boht deni hain
4,5,Zabardast


In [7]:
train_df.shape

(6328, 3)

In [8]:
train_df['label'][:20]

0     Negative
1     Positive
2     Negative
3     Negative
4     Positive
5     Negative
6     Negative
7     Positive
8     Positive
9     Negative
10    Negative
11    Negative
12    Positive
13    Positive
14    Negative
15    Positive
16    Negative
17    Negative
18    Positive
19    Negative
Name: label, dtype: object

In [9]:
def encode_label(text):
    if(text=="Positive"):
        return 1
    else:
        return 0

In [10]:
train_df['encoded_label'] = train_df.label.apply(encode_label)

In [11]:
train_df['encoded_label'][:20]

0     0
1     1
2     0
3     0
4     1
5     0
6     0
7     1
8     1
9     0
10    0
11    0
12    1
13    1
14    0
15    1
16    0
17    0
18    1
19    0
Name: encoded_label, dtype: int64

In [12]:
x_train = train_df[['ID','review']]
y_train = train_df.encoded_label

In [13]:
x_test = test_df[['ID','review']]

# 2.分词 
**这里主要针对表情符号加空格隔开**

In [14]:
# https://gist.github.com/brendano/25521552453909400e2310b04f1b2ac9
JUNK_RE = (
    u'[' +
    u'\U00010000-\U0001ffff' +
    u'\U00030000-\U0010ffff' +
    u'\U0000e000-\U0000efff' +
    u'\U00002500-\U00002bff' +
    u'\U0000200B-\U0000200D' +
    u'\U0000fe0e-\U0000fe0f' +
    u'\u2026' +
    u'\u201c' +
    u'\u201d' +
    u']+')

In [15]:
import string
punc = string.punctuation
punc = punc.replace("-", "") # don't remove hyphens
print(punc)

!"#$%&'()*+,./:;<=>?@[\]^_`{|}~


In [16]:
import re
# 在标点符号、表情和正文文本之间加空格隔开，并且聚集在一起的标点符号不会隔开，比如:)
def add_space_to_punc_and_emoji(text):
  text = text.lower()
  text = re.sub( r'([a-zA-Z])(['+punc+'])', r'\1 \2',text)
  text = re.sub(r'(?='+JUNK_RE+r')', r" ", text)
  text = text[::-1]
  text = re.sub(r'([a-zA-Z])(['+punc+'])', r'\1 \2',text)
  text = re.sub(r'(?='+JUNK_RE+r')', r" ", text)
  text = text[::-1]
  return text

In [17]:
a="Tu aa to sae dekh kia kia sa??? pilati😂😎😋sss:)ss.…  "
b=add_space_to_punc_and_emoji(a)
print(b)

tu aa to sae dekh kia kia sa ??? pilati 😂  😎  😋 sss :) ss . …   


In [18]:
x_train['spaced_review'] = x_train.review.apply(add_space_to_punc_and_emoji)
x_test['spaced_review'] = x_test.review.apply(add_space_to_punc_and_emoji)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
x_train['spaced_review'][:5]

0                         jo bhi ap se tou behtar hoon
1            ya allah meri sister affia ki madad farma
2    yeh khud chahta a is umar main shadi krna .  h...
3      tc ? apky mun xe exe alfax achy nae lgty  😒  💃 
4                                                 good
Name: spaced_review, dtype: object

In [20]:
x_test['spaced_review'][:5]

0                                phr tissuw se saaf
1        jail road per firing se 1 shakhs janbahaq 
2                           mehfil loot li aunty ne
3    rehnay do butt sahb nay galiya boht deni hain 
4                                        zabardast 
Name: spaced_review, dtype: object

# 3. 特征向量化 （词频矩阵）

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
max_df = 0.6 # 在超过这一比例的文档中出现的关键词（过于平凡），去除掉。 
# max_df这一项设置小一点也不影响，因为很小概率一个词在超过甚至一般的句子中出现。

min_df = 4 # 在低于这一数量的文档中出现的关键词（过于独特），去除掉。
# min_df这一项设置大一些对最终留下的词影响挺大的。说明现在这6000多句小样本每个词的重复出现率并不高

In [23]:
JUNK_RE = (
    u'[' +
    u'\U00010000-\U0001ffff' +
    u'\U00030000-\U0010ffff' +
    u'\U0000e000-\U0000efff' +
    u'\U00002500-\U00002bff' +
    u'\U0000200B-\U0000200D' +
    u'\U0000fe0e-\U0000fe0f' +
    u']+')
punc = string.punctuation
punc = punc.replace("-", "") # don't remove hyphens

In [24]:
vect = CountVectorizer(max_df = max_df, 
                       min_df = min_df, 
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b')

In [25]:
#vect = CountVectorizer(max_df = max_df, 
#                       min_df = min_df, 
#                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b|'+JUNK_RE)
# 考虑加入表情作为特征反而准确率还不如不加

(?u): UNICODE_CHARACTER_CLASS
\\b word boundary
\\w word


In [26]:
# vect = CountVectorizer(strip_accents=None,stop_words=None,max_df = max_df, min_df = min_df )

In [27]:
term_matrix = pd.DataFrame(vect.fit_transform(x_train.spaced_review).toarray(), columns=vect.get_feature_names())

In [28]:
x_train.spaced_review[3]

'tc ? apky mun xe exe alfax achy nae lgty  😒  💃 '

In [29]:
a=term_matrix.iloc[[3]]
for i in range(a.shape[1]):
    if(a.iloc[0][i]!=0):
        print(term_matrix.columns[i])

achy
apky
mun
nae


In [30]:
vect.get_feature_names()[-30:]

['zarurat',
 'zarye',
 'zati',
 'zaya',
 'zayada',
 'zealand',
 'zehan',
 'zehar',
 'zehni',
 'zehr',
 'zia',
 'ziada',
 'zikar',
 'zimedari',
 'zinda',
 'zindabad',
 'zindage',
 'zindagi',
 'zindah',
 'zindge',
 'zindgi',
 'ziyada',
 'ziyadah',
 'zor',
 'zra',
 'zuban',
 'zulfiqar',
 'zulm',
 'zyada',
 'zyda']

In [31]:
len(vect.get_feature_names())

3268

# 4. 贝叶斯分类

In [34]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [35]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(vect, nb)

In [36]:
pipe.steps

[('countvectorizer',
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                  dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                  lowercase=True, max_df=0.6, max_features=None, min_df=4,
                  ngram_range=(1, 1), preprocessor=None, stop_words=None,
                  strip_accents=None, token_pattern='(?u)\\b[^\\d\\W]\\w+\\b',
                  tokenizer=None, vocabulary=None)),
 ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]

In [37]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, x_train.spaced_review, y_train, cv=5, scoring='accuracy').mean()

0.7534792594956325

In [38]:
pipe.fit(x_train.spaced_review, y_train)
y_pred = pipe.predict(x_test.spaced_review)
y_pred_proba = pipe.predict_proba(x_test.spaced_review)

In [39]:
y_pred[350:400]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1], dtype=int64)

In [40]:
y_pred_proba[:10]

array([[6.23809343e-01, 3.76190657e-01],
       [9.99980236e-01, 1.97639265e-05],
       [7.64355000e-01, 2.35645000e-01],
       [8.82803768e-01, 1.17196232e-01],
       [1.75912964e-01, 8.24087036e-01],
       [9.77465059e-01, 2.25349412e-02],
       [9.40632444e-01, 5.93675565e-02],
       [6.81331150e-01, 3.18668850e-01],
       [8.68808384e-01, 1.31191616e-01],
       [9.75741898e-01, 2.42581019e-02]])

In [41]:
df = pd.DataFrame(zip(x_test.ID,y_pred_proba[:,1]), columns=["ID","Pred"])
df.to_csv('result.csv', index=False, float_format='%.6f')

# 5.随机森林
https://blog.csdn.net/u010665216/article/details/78741159

In [42]:
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

Training the random forest...


In [43]:
forest = RandomForestClassifier(n_estimators = 100) 
pipe = make_pipeline(vect, forest)
pipe.steps

[('countvectorizer',
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                  dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                  lowercase=True, max_df=0.6, max_features=None, min_df=4,
                  ngram_range=(1, 1), preprocessor=None, stop_words=None,
                  strip_accents=None, token_pattern='(?u)\\b[^\\d\\W]\\w+\\b',
                  tokenizer=None, vocabulary=None)),
 ('randomforestclassifier',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                         max_depth=None, max_features='auto', max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=None, oob_score=False, random_state=None,
                         verbose=0, warm_start=False))]

In [44]:
cross_val_score(pipe, x_train.spaced_review, y_train, cv=5, scoring='accuracy').mean()

0.7269242351130754

In [45]:
pipe.fit(x_train.spaced_review, y_train)
y_pred = pipe.predict(x_test.spaced_review)
y_pred_proba = pipe.predict_proba(x_test.spaced_review)

In [46]:
y_pred_proba[:10]

array([[0.86744186, 0.13255814],
       [0.98      , 0.02      ],
       [0.68707394, 0.31292606],
       [0.42116667, 0.57883333],
       [0.00744186, 0.99255814],
       [0.6       , 0.4       ],
       [0.46      , 0.54      ],
       [0.66091125, 0.33908875],
       [0.57      , 0.43      ],
       [0.78      , 0.22      ]])

# 6. TfidfVectorizer
vectorizer = CountVectorizer() #构建一个计算词频（TF）的玩意儿，当然这里面不足是可以做这些

transformer = TfidfTransformer() #构建一个计算TF-IDF的玩意儿

tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

#vectorizer.fit_transform(corpus)将文本corpus输入，得到词频矩阵

#将这个矩阵作为输入，用transformer.fit_transform(词频矩阵)得到TF-IDF权重矩阵

https://www.jianshu.com/p/c7e2771eccaa

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [48]:
corpus = [
...     'This is the first document.',
...     'This document is the second document.',
...     'And this is the third one.',
...     'Is this the first document?',
... ]
# X = vectorizer.fit_transform(corpus)
tfidf_model = TfidfVectorizer().fit(corpus)
X = tfidf_model.transform(corpus)

In [49]:
print(tfidf_model.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [50]:
print(X.shape)

(4, 9)


In [51]:
X_dense = X.todense()
print(X_dense)   

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [52]:
print(tfidf_model.vocabulary_)                      # 词语与列的对应关系

{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}


## 正式用在这里

In [76]:
tf_vect = TfidfVectorizer(min_df=2, max_features = None,
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b')

In [77]:
tf_term_matrix = pd.DataFrame(tf_vect.fit_transform(x_train.spaced_review).toarray(), columns=tf_vect.get_feature_names())

In [78]:
tf_term_matrix.head()

Unnamed: 0,a1,aa,aaaa,aaap,aaaya,aabad,aadmi,aae,aaega,aafia,...,zorr,zra,zror,zrort,zuban,zulam,zulfiqar,zulm,zyada,zyda
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
a=tf_term_matrix.iloc[[3]]
for i in range(a.shape[1]):
    if(a.iloc[0][i]!=0):
        print(tf_term_matrix.columns[i],a.iloc[0][i])

achy 0.31489548800592204
apky 0.346164723487749
exe 0.39041186426831476
lgty 0.37743395896957604
mun 0.346164723487749
nae 0.2672548704308224
tc 0.39041186426831476
xe 0.37743395896957604


In [80]:
len(tf_vect.get_feature_names())

6823

In [81]:
nb = MultinomialNB()
pipe = make_pipeline(tf_vect, nb)
pipe.steps

[('tfidfvectorizer',
  TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                  dtype=<class 'numpy.float64'>, encoding='utf-8',
                  input='content', lowercase=True, max_df=1.0, max_features=None,
                  min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                  smooth_idf=True, stop_words=None, strip_accents=None,
                  sublinear_tf=False, token_pattern='(?u)\\b[^\\d\\W]\\w+\\b',
                  tokenizer=None, use_idf=True, vocabulary=None)),
 ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]

In [82]:
cross_val_score(pipe, x_train.spaced_review, y_train, cv=5, scoring='roc_auc').mean()

0.8511839628293603

In [83]:
pipe.fit(x_train.spaced_review, y_train)
y_pred = pipe.predict(x_test.spaced_review)
y_pred_proba = pipe.predict_proba(x_test.spaced_review)

In [84]:
y_pred_proba[:10]

array([[0.6225683 , 0.3774317 ],
       [0.96914319, 0.03085681],
       [0.5080556 , 0.4919444 ],
       [0.58508193, 0.41491807],
       [0.12669064, 0.87330936],
       [0.83735578, 0.16264422],
       [0.43919091, 0.56080909],
       [0.54471072, 0.45528928],
       [0.45319455, 0.54680545],
       [0.66184704, 0.33815296]])

In [85]:
df = pd.DataFrame(zip(x_test.ID,y_pred_proba[:,1]), columns=["ID","Pred"])
df.to_csv('result.csv', index=False, float_format='%.6f')