In [2]:
import pandas as pd
import jieba
import codecs
import numpy as np
from gensim import models
# 保存/导入model
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Load the three pre-trained gensim Word2Vec models stored under ./model.
# The names on the left are bound in the same order as the paths are listed.
cut_all, model, model2 = (
    models.Word2Vec.load(model_path)
    for model_path in (
        './model/cut_all.model',
        './model/finance.model',
        './model/finance2.model',
    )
)

In [2]:
# Read the stop-word list, one entry per line.
# - The context manager closes the file (the previous one-liner never did).
# - Text mode normalises '\r\n' to '\n', so entries from a Windows-encoded file
#   no longer carry a trailing '\r'.
# - Empty entries (e.g. the one produced by the file's final newline) are dropped.
with open('./input/stop_words.txt', encoding='utf-8') as stop_words_file:
    stop_words = [
        entry
        for entry in (line.rstrip('\n') for line in stop_words_file)
        if entry
    ]

In [4]:
def tfidf_processing(sentence, label, stop_words=None, test_size=0.1):
    """Split the corpus and turn both splits into TF-IDF matrices.

    Args:
        sentence: sequence of raw text strings.
        label: sequence of labels aligned with ``sentence``.
        stop_words: optional iterable of tokens to exclude from the vocabulary.
            ``None`` or an empty iterable (the default) disables filtering,
            which matches the previous behaviour, where this argument was
            accepted but never used.
        test_size: fraction of the samples held out for testing.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``X_*`` values are the
        matrices produced by the fitted ``TfidfVectorizer``.

    Side effects:
        Writes the raw test sentences to ``./X_test.csv`` and the fitted
        vectorizer to ``./model/tfidf_model.pkl``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        sentence, label, test_size=test_size, random_state=33)

    # Keep a copy of the untouched test sentences for later inspection.
    pd.Series(X_test).to_csv('./X_test.csv', index=False)

    # jieba segments the text; joining with spaces lets the vectorizer's
    # regex tokenizer see one token per segmented word.
    X_train = [' '.join(jieba.cut(text)) for text in X_train]
    X_test = [' '.join(jieba.cut(text)) for text in X_test]

    # The token pattern also keeps single-character tokens.
    tfidf = TfidfVectorizer(
        token_pattern=r"(?u)\b\w+\b",
        stop_words=list(stop_words) if stop_words else None,
    )
    # Fit on the training split only, then reuse that vocabulary for the test split.
    X_train = tfidf.fit_transform(X_train)
    X_test = tfidf.transform(X_test)
    joblib.dump(tfidf, './model/tfidf_model.pkl', compress=3)
    return X_train, X_test, y_train, y_test

In [9]:
def w2v_processing(sentence, label, test_size=0.1):
    """Split the corpus and embed every sentence as an averaged word vector.

    The split uses a fixed ``random_state`` of 33. Each sentence is converted
    with ``str`` and embedded by ``get_vec`` using the module-level ``model``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` with the ``X_*`` values as
        numpy arrays holding one vector per sentence.
    """
    train_texts, test_texts, y_train, y_test = train_test_split(
        sentence, label, test_size=test_size, random_state=33)

    def embed(texts):
        # One averaged vector per sentence, stacked into a single array.
        return np.array([get_vec(str(text), model) for text in texts])

    X_train = embed(train_texts)
    X_test = embed(test_texts)
    print('w2v done...')
    return X_train, X_test, y_train, y_test

In [10]:
def get_vec(sentence, model):
    """Average the word vectors of the jieba tokens of ``sentence``.

    Tokens missing from ``model.wv`` are skipped. When no token is found the
    all-zero vector of length ``model.vector_size`` is returned.
    """
    total = np.zeros(model.vector_size)
    hits = 0
    for token in jieba.cut(sentence):
        try:
            embedding = model.wv[token]
        except KeyError:
            # Out-of-vocabulary token: contributes nothing to the average.
            continue
        total += embedding
        hits += 1
    if hits:
        return total / hits
    return total

def sentence_vector(s, model):
    """Return the mean word vector of the jieba tokens of ``s``.

    Args:
        s: text to embed.
        model: a Word2Vec model (vectors are read from its ``wv`` attribute)
            or any object that can be indexed by word directly.

    Returns:
        A numpy vector. Words missing from the vocabulary are skipped; if no
        word is found (including empty input) the zero vector is returned
        instead of raising ``KeyError`` or dividing by zero.
    """
    # Prefer the ``wv`` lookup table when present; fall back to indexing the
    # object itself so objects without ``wv`` keep working.
    vectors = getattr(model, 'wv', model)
    # Size the accumulator from the model; 100 is the previous hard-coded value.
    v = np.zeros(getattr(vectors, 'vector_size', 100))
    found = 0
    for word in jieba.lcut(s):
        try:
            v += vectors[word]
        except KeyError:
            # Out-of-vocabulary word: leave it out of the average.
            continue
        found += 1
    if found:
        v /= found
    return v

In [11]:
def random_forest(X_train, X_test, y_train, y_test):
    """Fit a random forest, persist it, and predict the test split.

    Prints the train/test sample counts, writes the fitted classifier to
    ``./model/random_forest.pkl`` and returns ``(classifier, predict)`` where
    ``predict`` is a DataFrame with the columns ``y_test`` and ``y_predict``.
    """
    print("训练样本 = %d" % len(y_train))
    print("测试样本 = %d" % len(y_test))

    forest = RandomForestClassifier(n_jobs=6, max_depth=80, n_estimators=2000)
    fitted = forest.fit(X_train, y_train)
    joblib.dump(fitted, './model/random_forest.pkl', compress=3)

    test_predictions = forest.predict(X_test)
    predict = pd.DataFrame({'y_test': y_test, 'y_predict': test_predictions})
    return fitted, predict

In [None]:
def svm(X_train, X_test, y_train, y_test):
    """Fit a support-vector classifier and predict the test split.

    The previous body stopped at a dangling ``clf =`` and could not be parsed.
    It is completed here to follow the same contract as ``random_forest``.

    Returns:
        ``(classifier, predict)`` where ``predict`` is a DataFrame with the
        columns ``y_test`` and ``y_predict``.
    """
    print("训练样本 = %d" % len(y_train))
    print("测试样本 = %d" % len(y_test))

    # NOTE(review): hyper-parameters are left at the SVC defaults and, unlike
    # random_forest, the fitted model is not written to disk — confirm intent.
    clf = SVC()
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)

    predict = pd.DataFrame({'y_test': y_test, 'y_predict': y_predict})
    return clf, predict

In [12]:
def metrics_(predict):
    """Print precision and recall for a prediction table.

    Args:
        predict: mapping/DataFrame with the columns ``y_test`` (true labels)
            and ``y_predict`` (predicted labels).

    With at most two distinct labels the scores use ``average='binary'`` as
    before. With more labels, ``average='binary'`` is rejected by scikit-learn,
    so macro averaging is used instead.
    """
    y_true = predict['y_test']
    y_pred = predict['y_predict']

    n_classes = len(set(y_true) | set(y_pred))
    average = 'binary' if n_classes <= 2 else 'macro'

    pre = precision_score(y_true, y_pred, average=average)
    recall = recall_score(y_true, y_pred, average=average)
    print('-' * 50)
    # NOTE(review): the first label reads "accuracy" but the value is precision.
    print('准确率：' + str(pre) + '\n召回率：' + str(recall))

# 数据预处理

In [19]:
# Load financezhidao_filter.csv; missing cells are replaced with empty strings.
# NOTE(review): absolute, machine-specific path — move it to a configurable data directory.
df = pd.read_csv(r'G:\BaiduNetdiskDownload/financezhidao_filter.csv').fillna('')

In [28]:
# Show five random rows as a quick look at the loaded frame.
df.sample(5)

Unnamed: 0,title,question,reply,is_best
24238,2018年我是湖*人在广*有住房公积金可以到重*买房吗?,2018年我是湖*人在广*有住房公积金可以到重*买房吗?,各地公积金政策有所不同，建议通过官网查询或者咨询当地公积金管理中心，官方电话是12#29.,1
72161,信用度不好怎么才能贷款,,贷多钱~~？,0
92528,我在网上申请小额贷款为什么总是被拒绝,,想贷多少,0
73572,"我有固定工作,有工资卡,身份证,如何快速借款500元?",,qq微信淘宝都可以,0
603939,网上小额贷款可靠吗？,,正规的会越来越多，必要的时候你多问问身边的人。,0


In [13]:
# Load the corpus (missing cells become empty strings) and keep only the rows
# whose topic occurs more than 5 times.
df = pd.read_csv('./input/corpus.csv').fillna('')
# Select the topics by their count. The previous version took the keys of the
# boolean mask itself, which lists every topic and therefore filtered nothing.
l = df.topic.value_counts().loc[lambda counts: counts > 5].index.tolist()
df = df[df['topic'].isin(l)]

In [107]:
# Inspect the rows whose topic is '电影' (movies).
df[df['topic'] == '电影']

Unnamed: 0,answer_id,answerer_tags,content,desc,qid,star,title,topic
5,39769445,,美漫里除了有滥情或者花少属性的个别人物之外，大都cp是比较稳定的，很少性生活混乱的情况。因为...,,28163188,3,为什么美漫里的超级英雄大都性生活混乱？,电影
190,34382979,,我记得有人写过一个，回去发链接。事先说下，我只是转载的，喜欢的同学可以点最后的链接，有更完g...,题目中的“有可能”，是一种不考虑市场时代国情等现实条件，仅考虑逻辑上的抽象的可能性，比如假设...,26850595,844,《葫芦娃》有可能拍成《复仇者联盟》一样宏大好看的全年龄向真人电影吗？,电影
208,95913044,热爱这个丑陋世界的人才是真善美,肖恩宾 Sean Bean，绰号人形自走剧透机，一直在死的男人，因为他演的大部分角色都是配角...,正如前几天大火的问题，又有哪些人物一出场就有必死便当flag或者吾有上将潘凤的赶脚？,42822450,63,有哪些人物一出场你就感觉「不妙」「不好的预感」「flag」？,电影
225,68841371,前端程序员 Web开发者 吉他爱好者 个人微信号:posebear1990 微信公众号: i...,卢正雨 补充一点： 说实话，周星驰的牛逼之处在于，周星驰之后是绝对是再无周星驰的。而且，我说...,从小就很喜欢星爷，他的无厘头在我看来无人能及（个人观点，不喜勿喷）,36732589,7,周星驰的的无厘头的搞笑风格，会有人传承下去吗？,电影
237,73250275,膜力宝贝,糟点太多了，抓到邦德一枪蹦了，什么BUG都没有了。,幽灵党中感觉bug好多比如打灰机啦还有哪些bug或不符合逻辑之处呢？,37717683,10,电影《007幽灵党》中有哪些bug？,电影
365,92706006,地理，经济，麻类植物爱好者，研究人员,有一个关于大麻的彩蛋很有意思。yax在电影中是一只崇尚自然主义的动物，而扮演者不是别人，正是...,,41035200,4,《疯狂动物城》（Zootopia）中有哪些有趣的细节？,电影
412,56364331,为lgbt平权，男女平权而努力,有个简单评判办法关掉声音，没有了配音和背景乐，演员的口型好看不？眼精有神不？他她是用什么来演...,作为一个普普通通的观众，我有的时候真的分不出来某个演员到底有没有演技。<br>比如大家说某演...,32328952,4,什么是演技？普通观众如何甄别演技优劣？,电影
484,115698780,一个不知道怎么写自我介绍的共产主义接班人,以下内容涉及毫无看点的剧透，且仅代表个人观点 看完夏有乔木雅望天堂了我快要吐血身亡了感觉身体...,,49263059,19,如何评价电影夏有乔木雅望天堂？,电影
599,135981902,典型水瓶座,应该是动之以情，晓之以理。泷属于暴力强迫型，男人之间碰撞，当然是硬碰硬不欢而散。而作为柔软的...,为什么三葉本人可以说服她的父亲，泷却无法说服他的岳父？<br><br>回答不限于原电影故事逻...,51266154,3,《你的名字。》中三葉最后怎么劝说她爸爸的？,电影
623,126666160,东叶寺 鞭巨,从不敢奢求能完全理解一部电影，严肃脸 因为一旦提到理解，就是一件多维度多因素的事情。从内在的...,,21620122,37,怎样才叫理解一部电影？,电影


In [7]:
# Draw a random working subset of at most 50,000 rows.
# - random_state makes the subset reproducible across runs (33 is the seed the
#   train/test splits in this notebook already use).
# - min(...) avoids the error sample() raises when the frame has fewer rows.
df_ = df.sample(n=min(50000, len(df)), random_state=33)

In [8]:
# Build a two-class data set from the sampled frame: every title is labelled 1
# and every reply is labelled 0, with the titles listed first.
sentence = df_.title.tolist() + df_.reply.tolist()
label = [1] * df_.shape[0] + [0] * df_.shape[0]

In [13]:
# Load cut_all.csv. This rebinds `df`, replacing whichever frame was loaded before.
df = pd.read_csv('./cut_all.csv')

In [15]:
# Sample 10,000 reviews for each of the labels 0-3 and concatenate them in
# label order.
sentence = [
    review
    for class_id in (0, 1, 2, 3)
    for review in df[df['label'] == class_id].sample(10000).review.tolist()
]

In [17]:
# Labels aligned with the sampled reviews: 10,000 entries for each of the
# classes 0, 1, 2 and 3, in that order.
label = [class_id for class_id in range(4) for _ in range(10000)]

# 训练模型

In [9]:
# X_train, X_test, y_train, y_test = tfidf_processing(sentence, label)

In [19]:
%%time
# Embed the sentences as averaged word vectors, split them into train/test,
# then fit the random forest and collect its test-set predictions.
X_train, X_test, y_train, y_test = w2v_processing(sentence, label)
nn, predict = random_forest(X_train, X_test, y_train, y_test)
# random_forest() already writes ./model/random_forest.pkl, so this dump stays disabled.
# joblib.dump(nn, './model/random_forest.pkl', compress=3)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Zn\AppData\Local\Temp\jieba.cache
Loading model cost 0.554 seconds.
Prefix dict has been built succesfully.


w2v done...
训练样本 = 36000
测试样本 = 4000
Wall time: 3min 1s


In [24]:
# Display the table of true (y_test) and predicted (y_predict) labels.
predict

Unnamed: 0,y_predict,y_test
0,3,0
1,2,3
2,0,2
3,0,1
4,0,2
5,2,3
6,2,2
7,1,0
8,1,0
9,2,3


In [4]:
# Reload the persisted TF-IDF vectorizer and random-forest classifier from disk.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files this notebook wrote.
tfidf_model = joblib.load('./model/tfidf_model.pkl')
random_forest_model = joblib.load('./model/random_forest.pkl')

# 测试模型

In [34]:
# Classify one hand-written sentence with the in-memory forest `nn` and show
# both the class probabilities and the predicted class.
# NOTE(review): the sentence is embedded with `cut_all`, whereas w2v_processing
# embeds the training data with `model` — confirm both use the same vector space.
tf = [np.array(get_vec('以此纪念九一八！宜黄县政府网站。', cut_all))]
nn.predict_proba(tf), nn.predict(tf)

(array([[0.24724805, 0.2681345 , 0.24890268, 0.23571477]]), array([1]))

In [35]:
# Score a toy input with the classifier reloaded from disk; the input is
# embedded with sentence_vector using the global `model`.
random_forest_model.predict_proba(np.array([sentence_vector('a ', model)]))

  app.launch_new_instance()


array([[0.375, 0.625]])

In [6]:
# Inspect the vocabulary of `model`.
# NOTE(review): this prints the whole vocab mapping; `.wv.vocab` is presumably
# gone in gensim 4 (replaced by `key_to_index`) — confirm against the installed version.
model.wv.vocab

{'招行': <gensim.models.keyedvectors.Vocab at 0x1d1afba4940>,
 '积分换': <gensim.models.keyedvectors.Vocab at 0x1d1afba4ac8>,
 '国航': <gensim.models.keyedvectors.Vocab at 0x1d1afba4b38>,
 '里程': <gensim.models.keyedvectors.Vocab at 0x1d1afba4ba8>,
 '多久': <gensim.models.keyedvectors.Vocab at 0x1d1afba4c18>,
 '到': <gensim.models.keyedvectors.Vocab at 0x1d1afba4c50>,
 '不是': <gensim.models.keyedvectors.Vocab at 0x1d1afba4c88>,
 '有': <gensim.models.keyedvectors.Vocab at 0x1d1afba4d30>,
 '一年': <gensim.models.keyedvectors.Vocab at 0x1d1afba4e80>,
 '的': <gensim.models.keyedvectors.Vocab at 0x1d1afba4f98>,
 '分期': <gensim.models.keyedvectors.Vocab at 0x1d1afbbc160>,
 '吗': <gensim.models.keyedvectors.Vocab at 0x1d1afbbc278>,
 '？': <gensim.models.keyedvectors.Vocab at 0x1d1afbbc390>,
 '没有': <gensim.models.keyedvectors.Vocab at 0x1d1afbbc4a8>,
 '利息': <gensim.models.keyedvectors.Vocab at 0x1d1afbbc5c0>,
 '年底': <gensim.models.keyedvectors.Vocab at 0x1d1afbbc6d8>,
 '投资': <gensim.models.keyedvectors.Vocab at 

In [3]:
import pandas as pd
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid-search an SVC over kernel, C and gamma on the iris data set.
iris = datasets.load_iris()
# NOTE(review): every gamma value is also crossed with the 'linear' kernel,
# which presumably ignores gamma — confirm, and consider a list of per-kernel grids.
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 2, 4], 'gamma':[0.125, 0.25, 0.5 ,1, 2, 4]}
svr = svm.SVC()
clf = GridSearchCV(svr, parameters, n_jobs=-1)
clf.fit(iris.data, iris.target)

# Save the full cross-validation table. Writing by path lets pandas open the
# file itself; the previous text-mode handle was opened without newline='',
# which produces blank rows between records on Windows.
cv_result = pd.DataFrame.from_dict(clf.cv_results_)
cv_result.to_csv('cv_result.csv')

print('The parameters of the best model are: ')
print(clf.best_params_)

# NOTE(review): this report is computed on the same data the search was fitted
# on, so it is a training-set score, not a held-out estimate.
y_pred = clf.predict(iris.data)
print(classification_report(y_true=iris.target, y_pred=y_pred))



The parameters of the best model are: 
{'C': 2, 'gamma': 0.125, 'kernel': 'linear'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      0.94      0.97        50
           2       0.94      1.00      0.97        50

   micro avg       0.98      0.98      0.98       150
   macro avg       0.98      0.98      0.98       150
weighted avg       0.98      0.98      0.98       150





In [6]:
# Show the best hyper-parameter combination found by the grid search.
clf.best_params_

{'C': 2, 'gamma': 0.125, 'kernel': 'linear'}