## 原理
### 文本表示
n-gram + BoW
### 分类器
NBSVM 是 Sida Wang 和 Chris Manning 在其论文 [Baselines and Bigrams: Simple, Good Sentiment and Topic Classification](https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf) 中提出的。由于在实践中，SVM 和逻辑回归的效果十分接近，本文直接使用逻辑回归代替 SVM。
If you're not familiar with naive bayes and bag of words matrices, I've made a preview available of one of fast.ai's upcoming *Practical Machine Learning* course videos, which introduces this topic. Here is a link to the section of the video which discusses this: [Naive Bayes video](https://youtu.be/37sFIak42Sc?t=3745).

In [None]:
#-*- coding:utf-8 -*-
from __future__ import print_function
import logging
from optparse import OptionParser
import sys
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.utils.extmath import density
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn import datasets
from sklearn import metrics
import os
import jieba
from sklearn.preprocessing import scale
import codecs
import pandas as pd
from sklearn.externals import joblib
from scipy.sparse import csr_matrix, hstack
import scipy
import pickle
import json
from matplotlib import pyplot
import matplotlib as mpl
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
import sys
import os
import pandas as pd
import argparse
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import six
from abc import ABCMeta
from scipy import sparse
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import normalize, binarize, LabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction import text
from classifiers import NBSVM

In [None]:
# Locations of the input data, the persisted model artifacts and the
# stop-word list, all relative to the notebook's working directory.
data = '../data/'
pkls = '../models/'
stopwords_path = '../utils/stopword_2792.txt'

# Create the artifact directory up front. exist_ok=True replaces the
# check-then-create sequence, so re-running the cell (or a concurrent
# creation of the directory) is harmless.
os.makedirs(pkls, exist_ok=True)

# 1 加载数据

In [None]:
def _load_jsonl(path):
    """Read a JSON-lines file and return its texts and labels.

    Every non-blank line of *path* is parsed as a JSON object that must
    provide the keys ``text`` and ``label``.

    Args:
        path: location of the JSON-lines file.

    Returns:
        A ``(contents, labels)`` pair of lists, in file order.

    Raises:
        KeyError: if a record lacks ``text`` or ``label``.
        ValueError: if a non-blank line is not valid JSON.
    """
    contents = []
    labels = []
    # Explicit encoding so the result does not depend on the platform's
    # default codec. Assumes the files are UTF-8 -- TODO confirm.
    with open(path, 'r', encoding='utf-8') as fh:
        for line in fh:
            line = line.strip()
            if not line:
                # Tolerate blank (e.g. trailing) lines instead of crashing
                # inside json.loads.
                continue
            samp = json.loads(line)
            labels.append(samp['label'])
            contents.append(samp['text'])
    return contents, labels


# Training split.
tra_contents, tra_labels = _load_jsonl(f'{data}train.json')
train_set = {'content': tra_contents,
             'label': tra_labels}
train_df = pd.DataFrame(train_set)
print('Trainset Loaded')

# Development (validation) split.
val_contents, val_labels = _load_jsonl(f'{data}devel.json')
val_set = {'content': val_contents,
           'label': val_labels}
val_df = pd.DataFrame(val_set)
print('Val-set Loaded')


### 向量化

In [None]:
%%time
# Stop words: scikit-learn's English list plus the project list (one entry
# per line). Explicit encoding so the result does not depend on the
# platform default; assumes the file is UTF-8 -- TODO confirm.
with open(stopwords_path, 'r', encoding='utf-8') as stw:
    my_stop_words = text.ENGLISH_STOP_WORDS.union(x.strip() for x in stw)

x_train = train_df['content']
y_train = train_df['label']
x_val = val_df['content']
y_val = val_df['label']

# NOTE(review): both vectorizers are fitted on train + dev text, so the dev
# vocabulary and IDF weights leak into the features and dev-set scores may
# be optimistic. Kept as in the original; confirm this is intended.
x_dataset = pd.concat([x_train, x_val], axis=0, ignore_index=True)

# Word-level TF-IDF over unigrams and bigrams.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='word',
    token_pattern=r'\w{1,}',
    # scikit-learn documents stop_words as 'english', a list or None, so a
    # list is passed rather than the frozenset itself.
    stop_words=sorted(my_stop_words),
    ngram_range=(1, 2))
word_vectorizer.fit(x_dataset)
joblib.dump(word_vectorizer, f'{pkls}nbsvm-vocab-wd.pkl')

# Character-level TF-IDF over 1- to 3-grams. No stop_words here: per the
# scikit-learn documentation the option only applies when analyzer == 'word'.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='char',
    ngram_range=(1, 3))
char_vectorizer.fit(x_dataset)
joblib.dump(char_vectorizer, f'{pkls}nbsvm-vocab-ch.pkl')


def _stack_features(docs):
    """Return the char + word TF-IDF features of *docs* as one CSR matrix.

    Character columns come first, then word columns; the order must stay
    the same wherever these vectorizers are reused.
    """
    return hstack(
        [char_vectorizer.transform(docs), word_vectorizer.transform(docs)],
        format='csr')


x_train = _stack_features(x_train)
print('training data vectorized')

x_val = _stack_features(x_val)
print('develope data vectorized')

# 2 训练模型

In [None]:
# Build the NBSVM classifier and a chi-squared selector that keeps the
# 4000 best-scoring feature columns.
print('loading...')
clf = NBSVM()
ch2 = SelectKBest(score_func=chi2, k=4000)

# Fit the selector on the training matrix, then reduce that matrix to the
# selected columns.
ch2.fit(x_train, tra_labels)
x_train = ch2.transform(x_train)

print('fitting...')
clf.fit(x_train, tra_labels)


In [None]:
# Persist the fitted classifier in the model directory.
joblib.dump(value=clf, filename=f'{pkls}nbsvm_31.pkl')

In [None]:
# Persist the fitted feature selector next to the classifier so the same
# columns can be selected at prediction time.
joblib.dump(value=ch2, filename=f'{pkls}nbsvm-feature_selector.pkl')

### 评价方式

In [None]:
def evaluation_result(actual, pred):
    """Print macro F1, accuracy and a per-class report for a set of predictions.

    Args:
        actual: ground-truth labels.
        pred: predicted labels, aligned with *actual*.

    Returns:
        None. All results are written to stdout.
    """
    print('predict info:')
    # Macro F1 is averaged only over the labels that occur in `pred`;
    # classes that were never predicted do not enter the average.
    macro_f1 = metrics.f1_score(
        actual, pred, average='macro', labels=np.unique(pred))
    print('f1-score:{0:.3f}'.format(macro_f1))
    print("accuracy:   %0.3f" % metrics.accuracy_score(actual, pred))
    # No target_names: the report rows follow the sorted labels found in
    # actual/pred, so names taken from an unordered set of training labels
    # could be attached to the wrong rows (or mismatch in count). By default
    # each row is titled with its own label.
    print(metrics.classification_report(actual, pred))

### 在开发集上测试

In [None]:
print('testing...')

# Reduce the dev matrix to the columns chosen by the fitted selector, then
# predict and report.
x_val = ch2.transform(x_val)
pred = clf.predict(x_val)
evaluation_result(actual=y_val, pred=pred)


# 3 在线测试

In [None]:
%%time

from sklearn.externals import joblib
import pickle
import numpy as np
import jieba
from scipy.sparse import csr_matrix, hstack
from classifiers import NBSVM

# Directory holding the persisted vectorizers, feature selector and model.
pkls = '../models/'


def onlineTest(raw_query):
    """Predict the label of a single raw query with the persisted pipeline.

    The char and word vectorizers, the feature selector and the classifier
    are loaded from ``pkls`` on every call.

    Args:
        raw_query: the query text to classify.

    Returns:
        The first (only) element of the classifier's prediction, converted
        to a plain Python value via ``tolist()``.
    """
    # NOTE(review): joining the jieba segments with '' appears to rebuild the
    # unsegmented query, so segmentation would not affect the features. If
    # the training text is space-separated tokens this should probably be
    # ' '.join -- confirm against the training data before changing.
    docs = [''.join(jieba.cut(raw_query))]

    char_vectorizer_ = joblib.load(f'{pkls}nbsvm-vocab-ch.pkl')
    word_vectorizer_ = joblib.load(f'{pkls}nbsvm-vocab-wd.pkl')
    ch2_ = joblib.load(f'{pkls}nbsvm-feature_selector.pkl')
    # The classifier comes straight from disk; no fresh NBSVM() instance is
    # needed beforehand.
    clf_ = joblib.load(f'{pkls}nbsvm_31.pkl')

    # Column order (char features, then word features) must match the order
    # used when the feature selector was fitted.
    test_vec = hstack(
        [char_vectorizer_.transform(docs), word_vectorizer_.transform(docs)],
        format='csr')
    test_vec = ch2_.transform(test_vec)
    pred = clf_.predict(test_vec)
    return pred.tolist()[0]

In [None]:
# Smoke-test the persisted pipeline on one sample query.
onlineTest(raw_query='我想听一首王源的做我自己')