In [1]:
# Copyright (c) 2018. All rights reserved.
# Created by W.Y.Shen

import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC,SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

## 0x00 加载数据

In [16]:
# Directory that holds the CSV files; the training subset is read as tab-separated.
data = './dataset/'
dataset = pd.read_csv(data + 'train-subset30.csv', sep='\t')
# Preview the first rows.
dataset.head()

Unnamed: 0,text,app,bus,calc,cinemas,contacts,cookbook,datetime,email,epg,...,riddle,schedule,stock,telephone,train,translation,tvchannel,video,weather,website
0,打开 熊猫 看 书,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,opera 浏览器,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,钛备份,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,打开 sky 电话,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,帮 我 找 菜单,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 0x01 向量化

In [3]:
# TF-IDF features over uni-grams and bi-grams of the `text` column.
# NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of
# two or more word characters, so single-character words in the text are not
# used as features -- confirm this is intended.
tv = TfidfVectorizer(ngram_range=(1, 2))
X = tv.fit_transform(dataset['text'])
# X.shape is (rows, columns) of the fitted document-term matrix.
print("n_samples: %d, n_features(token): %d" % X.shape)

n_samples: 7226, n_features(token): 12992


In [4]:
# Every column except the raw text is treated as a 0/1 label indicator.
label_cols = [col for col in dataset.columns if col != 'text']

# Encode each row as the position (within label_cols) of its first column that
# equals 1. This is done on the whole matrix at once instead of iterating rows.
label_hits = dataset[label_cols].values == 1
if not label_hits.any(axis=1).all():
    # argmax would silently return 0 for a row without any label, so fail loudly.
    raise ValueError('every row must have at least one label column equal to 1')
y = list(label_hits.argmax(axis=1))

## 0x02 切分数据集

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=118,
)

## 0x03 评价指标

In [6]:
def evaluation_result(actual, pred):
    """Print the macro-averaged F1 score and the accuracy of `pred` against `actual`.

    The F1 average is taken only over the labels that occur in `pred`
    (``labels=np.unique(pred)``). Nothing is returned; both numbers are
    printed with three decimals.
    """
    predicted_labels = np.unique(pred)
    macro_f1 = metrics.f1_score(actual, pred, average='macro', labels=predicted_labels)
    print('f1-score:{0:.3f}'.format(macro_f1))
    accuracy = metrics.accuracy_score(actual, pred)
    print("accuracy:%0.3f" % accuracy)

## 0x04 分类器
- SVM(liblinear)
- LR
- DT
- SVC(libsvm)
- NB
- KNN

### (1) 线性SVM

In [7]:
%%time
#training
clf_1 = LinearSVC()
clf_1.fit(x_train, y_train)

CPU times: user 294 ms, sys: 418 µs, total: 294 ms
Wall time: 294 ms


In [8]:
# evaluating: score the linear SVM on the validation split
pred_1 = clf_1.predict(x_val)
evaluation_result(actual=y_val, pred=pred_1)

f1-score:0.891
accuracy:0.890


### (2) 逻辑回归

In [None]:
%%time
#training
clf_2=LogisticRegression(C=4, dual=True)
clf_2.fit(x_train, y_train)

In [None]:
# evaluating: score logistic regression on the validation split
pred_2 = clf_2.predict(x_val)
evaluation_result(actual=y_val, pred=pred_2)

### (3) 决策树

In [None]:
%%time
#training
clf_3=DecisionTreeClassifier(random_state=118)
clf_3.fit(x_train, y_train)

In [None]:
# evaluating: score the decision tree on the validation split
pred_3 = clf_3.predict(x_val)
evaluation_result(actual=y_val, pred=pred_3)

### (4) 线性核SVC

In [None]:
%%time
#training
clf_4=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
clf_4.fit(x_train, y_train)

In [None]:
# evaluating: score the linear-kernel SVC on the validation split
pred_4 = clf_4.predict(x_val)
evaluation_result(actual=y_val, pred=pred_4)

### (5) 朴素贝叶斯

In [None]:
%%time
clf_5=MultinomialNB()
clf_5.fit(x_train, y_train)

In [None]:
# evaluating: score naive Bayes on the validation split
pred_5 = clf_5.predict(x_val)
evaluation_result(actual=y_val, pred=pred_5)

### (6) 最近邻

In [None]:
%%time
clf_6 = KNeighborsClassifier(n_neighbors=3)
clf_6.fit(x_train, y_train)

In [None]:
# evaluating: score k-nearest neighbours on the validation split
pred_6 = clf_6.predict(x_val)
evaluation_result(actual=y_val, pred=pred_6)

## 0x05 集成学习
- Soft Voting
- Bagging
- RF
- Adaboost
- GB

### (1) Soft Voting

In [None]:
%%time
clf_7=VotingClassifier(estimators=[
    ('lr_clf',LogisticRegression()),
    ('svm_clf',SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)),
    ('dt_clf',DecisionTreeClassifier(random_state=118)),
    ('nb_clf',MultinomialNB()),
    ('knn_clf',KNeighborsClassifier(n_neighbors=3))],
     voting='soft')
clf_7.fit(x_train, y_train)

In [None]:
# evaluating: score the soft-voting ensemble on the validation split
pred_7 = clf_7.predict(x_val)
evaluation_result(actual=y_val, pred=pred_7)

### (2) Bagging

In [None]:
%%time
clf_8=BaggingClassifier(DecisionTreeClassifier(),
                       n_estimators=5000,#400　trees
                       max_samples=800,# #sample per tree
                       bootstrap=True,
                       n_jobs=-1,
                       oob_score=True,
                       max_features=10000,bootstrap_features=True)#随机采样特征
clf_8.fit(X, y)

In [None]:
# evaluating: score the bagging ensemble on the validation split
pred_8 = clf_8.predict(x_val)
evaluation_result(actual=y_val, pred=pred_8)
# Out-of-bag score recorded during fitting (shown as the cell output).
clf_8.oob_score_

### (3) 随机森林 / Extra-Trees

In [None]:
%%time
clf_9=RandomForestClassifier(n_estimators=500,random_state=118,oob_score=True,n_jobs=-1)
clf_9.fit(X,y)

In [None]:
# evaluating: score the random forest on the validation split
pred_9 = clf_9.predict(x_val)
evaluation_result(actual=y_val, pred=pred_9)
# Out-of-bag score recorded during fitting (shown as the cell output).
clf_9.oob_score_

In [None]:
%%time
clf_10=ExtraTreesClassifier=ExtraTreesClassifier(n_estimators=500,bootstrap=True,oob_score=True)
clf_10.fit(X,y)

In [None]:
# evaluating: score the Extra-Trees ensemble on the validation split
pred_10 = clf_10.predict(x_val)
evaluation_result(actual=y_val, pred=pred_10)
# Out-of-bag score recorded during fitting (shown as the cell output).
clf_10.oob_score_

### (4) Adaboost

In [None]:
%%time
clf_11=AdaBoostClassifier(DecisionTreeClassifier(max_depth=10),n_estimators=600)
clf_11.fit(x_train,y_train)

In [None]:
# evaluating: score AdaBoost on the validation split
pred_11 = clf_11.predict(x_val)
evaluation_result(actual=y_val, pred=pred_11)

### (5) Gradient Boosting

In [None]:
%%time
clf_12=GradientBoostingClassifier(max_depth=6,n_estimators=60)
clf_12.fit(x_train,y_train)

In [None]:
# evaluating: score gradient boosting on the validation split
pred_12 = clf_12.predict(x_val)
evaluation_result(actual=y_val, pred=pred_12)

## 0x06 在线测试

In [9]:
import jieba
def onlineTest(raw_query,classifier):
    """Classify one raw query string and print the predicted label name.

    The query is segmented with ``jieba.cut``, the tokens are joined with
    single spaces, the result is vectorized with the fitted ``tv`` and passed
    to ``classifier.predict``. The first prediction is used as an index into
    ``label_cols`` and that column name is printed. Nothing is returned.
    """
    segmented = ' '.join(jieba.cut(raw_query))
    features = tv.transform([segmented])
    predicted = classifier.predict(features)
    print(label_cols[predicted[0]])

In [10]:
# Try the helper on an unsegmented query about flight tickets, using the linear SVM (clf_1).
onlineTest('帮我查一下从北京去上海的机票把',clf_1)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.627 seconds.
Prefix dict has been built succesfully.


flight
