In [None]:
import scipy
from sklearn import svm
from sklearn import metrics
import gensim
from gensim import models
from gensim.models import CoherenceModel
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,\
                                                TfidfVectorizer

def train_predict_evaluate_model(classifier, train_features, train_labels,
                                 test_features, test_labels):
    """
    Fit a model, predict on the test set and print the evaluation metrics.

    :param classifier: model object exposing ``fit`` and ``predict``
    :param train_features: features of the training set
    :param train_labels: labels of the training set
    :param test_features: features of the test set
    :param test_labels: labels of the test set
    :return: predictions produced for ``test_features``
    """
    classifier.fit(train_features, train_labels)
    predicted = classifier.predict(test_features)
    # get_metrics only prints the scores; nothing is returned from it.
    get_metrics(test_labels, predicted)
    return predicted

def get_metrics(true_labels, predicted_labels):
    """
    Print accuracy, precision, recall and F1 of a prediction, rounded to 5 decimals.

    Precision, recall and F1 are computed with ``average='weighted'``.

    :param true_labels: ground-truth labels
    :param predicted_labels: labels predicted by the model
    :return: None (the scores are only printed)
    """
    accuracy = metrics.accuracy_score(true_labels, predicted_labels)
    print("accuracy:", np.round(accuracy, 5))

    # The remaining three metrics share the same call signature, so they are
    # evaluated and printed one after another in a fixed order.
    weighted_scorers = (
        ("precision:", metrics.precision_score),
        ("recall:", metrics.recall_score),
        ("f1 score:", metrics.f1_score),
    )
    for caption, scorer in weighted_scorers:
        score = scorer(true_labels, predicted_labels, average='weighted')
        print(caption, np.round(score, 5))

def xishu2choumi(corpus, num_terms=None):
    """
    Convert a sparse corpus into a dense 2-D numpy array ("sparse to dense").

    :param corpus: iterable of documents, each an iterable of
        ``(column_index, value)`` pairs (e.g. a gensim BOW / TF-IDF corpus)
    :param num_terms: optional number of columns of the result. When omitted,
        the width is the largest column index seen plus one (0 if the corpus
        holds no entries at all).
    :return: dense array with exactly one row per document in ``corpus``,
        including documents that contain no entries
    :raises ValueError: raised by scipy if ``num_terms`` is smaller than the
        largest column index plus one
    """
    # Local import: on older SciPy releases a bare `import scipy` does not make
    # the `scipy.sparse` submodule available.
    from scipy import sparse

    data = []
    rows = []
    cols = []
    num_docs = 0
    for row_index, line in enumerate(corpus):
        # Count every document, even an empty one, so the row count of the
        # result always matches the corpus (and therefore the label column).
        num_docs = row_index + 1
        for elem in line:
            rows.append(row_index)
            cols.append(elem[0])
            data.append(elem[1])

    if num_terms is None:
        num_terms = max(cols) + 1 if cols else 0

    # An explicit shape stops scipy from inferring it from the largest indices,
    # which would silently drop trailing empty documents.
    sparse_matrix = sparse.csr_matrix((data, (rows, cols)),
                                      shape=(num_docs, num_terms))
    return sparse_matrix.toarray()

In [None]:
import numpy as np
import pandas as pd

text = pd.read_csv("./test_news_with_title_2.csv")

# Collect the labels of all rows that contain at least one missing value;
# the later str.split calls on the text columns would raise on NaN.
# A vectorised mask replaces the row-by-row .loc scan.
drop_na = text.index[text.isnull().any(axis=1)].tolist()

# Remove those rows and renumber the remaining ones from 0.
text = text.drop(drop_na).reset_index(drop=True)

text.head()

In [None]:
# Print the index and the full record of every row whose body text contains "香港".
for i, body in text["微博正文(有标题切词后)"].items():
    if "香港" in body:
        print(i)
        print(text.loc[i])

In [None]:
# Number of rows currently held in `text`.
text.shape[0]

# 1. BOW+SVM

In [None]:
def bow_(processed_docs):
    """
    Build a bag-of-words corpus from tokenised documents.

    A gensim dictionary is built from ``processed_docs`` and pruned with
    ``filter_extremes(no_below=15, no_above=0.8, keep_n=100000)`` before the
    documents are converted.

    :param processed_docs: list of documents, each a list of tokens
    :return: list with one ``doc2bow`` result per document
    """
    vocabulary = gensim.corpora.Dictionary(processed_docs)
    vocabulary.filter_extremes(no_below=15, no_above=0.8, keep_n=100000)
    return list(map(vocabulary.doc2bow, processed_docs))

# Split each document of the body column on single spaces.
text_splited = [doc.split(" ") for doc in text["微博正文(有标题切词后)"]]

# Bag-of-words corpus -> dense feature matrix.
bow_corpus = bow_(text_splited)
bow_matrix = xishu2choumi(bow_corpus)

# 80/20 train/test split with a fixed seed.
train_matrix, test_matrix, y_train, y_test = train_test_split(
    bow_matrix, text["分类"], test_size=0.2, random_state=2)

# Fit the SVM, predict on the test split and print the metrics through the
# shared helper.
clf = svm.SVC()
predictions = train_predict_evaluate_model(
    clf, train_matrix, y_train, test_matrix, y_test)

# 2. TFIDF+SVM

In [None]:
def tfidf_(processed_docs):
    """
    Build a TF-IDF corpus from tokenised documents.

    The dictionary is built and pruned with
    ``filter_extremes(no_below=15, no_above=0.8, keep_n=100000)``, the
    documents are converted to bag-of-words, and a gensim ``TfidfModel``
    fitted on that corpus is applied to it.

    :param processed_docs: list of documents, each a list of tokens
    :return: the bag-of-words corpus transformed by the TF-IDF model
    """
    vocabulary = gensim.corpora.Dictionary(processed_docs)
    vocabulary.filter_extremes(no_below=15, no_above=0.8, keep_n=100000)
    counts = [vocabulary.doc2bow(tokens) for tokens in processed_docs]
    # normalize was deliberately switched to False here.
    weighting = models.TfidfModel(counts, normalize=False)
    return weighting[counts]

# Split each document of the body column on single spaces.
text_splited = [doc.split(" ") for doc in text["微博正文(有标题切词后)"]]

# TF-IDF corpus -> dense feature matrix.
tfidf_corpus = tfidf_(text_splited)
tfidf_matrix = xishu2choumi(tfidf_corpus)

# 80/20 train/test split with a fixed seed.
train_matrix, test_matrix, y_train, y_test = train_test_split(
    tfidf_matrix, text["分类"], test_size=0.2, random_state=2)

# Fit the SVM, predict on the test split and print the metrics through the
# shared helper.
clf = svm.SVC()
predictions = train_predict_evaluate_model(
    clf, train_matrix, y_train, test_matrix, y_test)

# 3. W-TFIDF+SVM

In [None]:
def tw_lda_get_tfidf(text, gamma):
    """
    Build a title-weighted TF-IDF corpus.

    For every document, the TF-IDF weight of each term that occurs in both the
    body column and the title column is replaced by
    ``(1 - gamma) * body_weight + gamma * title_weight``. Terms found only in
    the body keep their body weight; terms found only in the title are ignored.
    No normalisation is applied to the resulting vectors.

    :param text: table with the columns "微博正文(有标题切词后)" (body including
        the title) and "标题(切词后)" (title), each holding space-separated tokens
    :param gamma: weight given to the title TF-IDF when blending
    :return: tuple ``(new_tfidf, dictionary)`` where ``new_tfidf`` is a list
        with one id-sorted ``[(term_id, weight), ...]`` list per document and
        ``dictionary`` is the filtered gensim dictionary built from the body
        column
    """
    # 1. The dictionary is built from body + title text, then extreme terms
    #    are filtered out. The tokenised body is reused in step 3.
    processed_docs_content = [doc.split(" ") for doc in text["微博正文(有标题切词后)"]]
    dictionary = gensim.corpora.Dictionary(processed_docs_content)
    dictionary.filter_extremes(no_below=15, no_above=0.8, keep_n=100000)

    # 2. TF-IDF of the titles (model fitted on the title corpus only).
    processed_docs_title = [doc.split(" ") for doc in text["标题(切词后)"]]
    bow_corpus_title = [dictionary.doc2bow(doc) for doc in processed_docs_title]
    tfidf_title = models.TfidfModel(bow_corpus_title, normalize=False)
    corpus_tfidf_title = tfidf_title[bow_corpus_title]

    # 3. TF-IDF of body + title.
    bow_corpus_content = [dictionary.doc2bow(doc) for doc in processed_docs_content]
    tfidf_content = models.TfidfModel(bow_corpus_content, normalize=False)
    corpus_tfidf_content = tfidf_content[bow_corpus_content]

    # 4. Blend title and body weights document by document. Both corpora are
    #    derived from the same table, so they are walked in parallel.
    new_tfidf = []
    for content_doc, title_doc in zip(corpus_tfidf_content, corpus_tfidf_title):
        weights = dict(content_doc)
        for term_id, title_weight in title_doc:
            if term_id in weights:
                weights[term_id] = ((1 - gamma) * weights[term_id]
                                    + gamma * title_weight)
        # Keep the entries ordered by term id.
        new_tfidf.append(sorted(weights.items(), key=lambda pair: pair[0]))

    # 5. Return the blended corpus together with the dictionary.
    return new_tfidf, dictionary

In [None]:
def wtfidf_svm(text, gamma):
    """
    Train an SVM on title-weighted TF-IDF features and print its metrics.

    Metrics for the training split are printed first, then a "++++++++++"
    separator, then metrics for the test split.

    :param text: table passed on to ``tw_lda_get_tfidf``; its "分类" column
        supplies the labels
    :param gamma: title weight passed on to ``tw_lda_get_tfidf``
    :return: None
    """
    # The dictionary returned alongside the corpus is not needed here.
    weighted_corpus, _dictionary = tw_lda_get_tfidf(text, gamma=gamma)
    features = xishu2choumi(weighted_corpus)

    # 80/20 train/test split with a fixed seed.
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, text["分类"], random_state=2, test_size=0.2)

    model = svm.SVC()
    model.fit(features_train, labels_train)

    # Scores on the data the model was fitted on.
    get_metrics(true_labels=labels_train,
                predicted_labels=model.predict(features_train))

    print("++++++++++")

    # Scores on the held-out data.
    get_metrics(true_labels=labels_test,
                predicted_labels=model.predict(features_test))

In [None]:
# Run the W-TFIDF + SVM experiment for gamma = 0.0, 0.1, ..., 1.0 and print
# a separator after each run.
for ga in [step / 10 for step in range(11)]:
    print(ga)
    wtfidf_svm(text, ga)
    print("--------------")
    print(" ")

In [None]:
import os

import matplotlib.pyplot as plt
from matplotlib import font_manager

# Use the Windows FangSong font when it is installed; otherwise fall back to
# matplotlib's default font so the figure still renders on other machines
# (all labels below are plain ASCII).
font_path = 'C:/Windows/Fonts/simfang.ttf'
if os.path.isfile(font_path):
    my_font = font_manager.FontProperties(fname=font_path)
else:
    my_font = font_manager.FontProperties()

# Weighted F1 score per gamma for W-TFIDF + SVM. These numbers are hard-coded
# here rather than computed in this cell.
f1_wtfidf = {0.0: 0.85381, 0.1: 0.85668, 0.2: 0.85464, 0.3: 0.85481,
             0.4: 0.85589, 0.5: 0.85706, 0.6: 0.85994, 0.7: 0.85884,
             0.8: 0.85705, 0.9: 0.84846, 1.0: 0.84365}
f1_wtfidf_values = list(f1_wtfidf.values())
f1_wtfidf_keys = list(f1_wtfidf.keys())

# Constant baselines, one value per gamma. They are not drawn: both lie below
# the y-axis range set further down.
f1_bow = [0.6674] * len(f1_wtfidf_keys)
f1_tfidf = [0.7998] * len(f1_wtfidf_keys)

plt.figure(figsize=(8, 5), dpi=100)
plt.plot(f1_wtfidf_keys, f1_wtfidf_values, label="WTFIDF+SVM_F1_Score",
         marker=".", linewidth=2)
# plt.plot(f1_wtfidf_keys,f1_bow,label="BOW+SVM_F1_Score",linewidth=2)
# plt.plot(f1_wtfidf_keys,f1_tfidf,label="TFIDF+SVM_F1_Score",linewidth=2)

plt.ylim(0.8, 0.9)
plt.xlabel('Gamma', fontproperties=my_font)
plt.ylabel('ModelF1Score', fontproperties=my_font)
plt.legend(prop=my_font)
plt.show()