In [2]:
import jieba
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import warnings
warnings.simplefilter('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

In [3]:
data_dir = 'https://mirror.coggle.club/dataset/coggle-competition/'
train_data = pd.read_csv(data_dir + 'intent-classify/train.csv', sep='\t', header=None)
test_data = pd.read_csv(data_dir + 'intent-classify/test.csv', sep='\t', header=None)
cn_stopwords = pd.read_csv('https://mirror.coggle.club/stopwords/baidu_stopwords.txt', header=None)[0].values

In [4]:
# 读取中文停用词
cn_stopwords = ' '.join(pd.read_csv('https://mirror.coggle.club/stopwords/baidu_stopwords.txt', header=None)[0])

In [5]:

corpus = train_data[0]

In [6]:
texts = []

for i in range(len(corpus)):
    content = ''.join(corpus.iloc[i])
    words = jieba.lcut(content)
    texts.append(words)    

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)

Building prefix dict from the default dictionary ...
Loading model from cache D:\Temp\jieba.cache
Loading model cost 0.629 seconds.
Prefix dict has been built successfully.


[['还有', '双鸭山', '到', '的', '汽车票', '吗', '13', '号', '的'],
 ['从', '这里', '怎么', '回家'],
 ['随便', '播放', '一首', '专辑', '里', '的', '的', '歌'],
 ['给', '看', '一下', '嘛'],
 ['我', '想', '看', '挑战', '打', '团', '竞', '的', '游戏', '视频'],
 ['我', '想', '看', '和平', '精英', '上', '的', '游戏', '视频'],
 ['2019', '年', '古装', '爱情', '电视剧', '小女', '花不弃', '的', '花絮', '播放', '一下'],
 ['找', '一个', '年', '的', '推理', '剧给', '我', '看', '一会', '呢'],
 ['去', '深圳', '都', '经过', '那些', '地方', '啊'],
 ['给', '我', '转播', '今天', '的', '女子双打', '乒乓球', '比赛', '现场'],
 ['单曲', '循环', '一首', '催眠', '的', '歌', '吧'],
 ['你', '能', '播放', '一下', '曹盾', '导演', '的', '古装', '悬疑剧', '呢'],
 ['河南', '新闻广播', '找', '一下', '啊', '是', '新闻台'],
 ['交通', '类', '的', '武汉', '交通', '广播电台', '来', '一个', '吧'],
 ['播放', '香港电台', '的', '王菲', '的', '经典音乐'],
 ['给', '我', '找', '一个', '魔兽', '世界', '的', '比赛', '视频'],
 ['播放', '钢琴曲', '命运', '交响曲'],
 ['早上', '七点', '开', '空调'],
 ['查询', '北京', '飞', '桂林', '的', '飞机', '是否', '已经', '起飞', '了'],
 ['播放', '中央', '电台', '里', '的', '都市', '之声'],
 ['客厅', '空调', '风速', '低', '一些', '低', '一些'],
 ['海南', '今天', '几级

In [12]:
from gensim.models import Word2Vec

def transform2vector(corpus, vector_size):
    model = Word2Vec(corpus, vector_size=vector_size, window=5, min_count=1, workers=4)

    # 训练模型
    model.train(corpus, total_examples=len(corpus), epochs=10)
    # 保存模型
    model.save("word2vec_model.model")
    
    array = []
    for i, sentence in enumerate(corpus):
        vector = [model.wv[word] for word in sentence]
        vector_mean = np.mean(vector, axis=0)
        array.append(vector_mean)
    train_vector = pd.DataFrame(array)
    
    return train_vector

In [13]:
train_vector = transform2vector(processed_corpus, vector_size=200)
train_vector['label'] = train_data[1]

print("*" * 10 + "KNN" + "*" * 10)

cv_pred = cross_val_predict(
    KNeighborsClassifier(),
    train_vector.iloc[:, :-1], train_vector['label']
)
pprint.pprint(classification_report(train_data[1], cv_pred))

print("*" * 10 + "SVC" + "*" * 10)

cv_pred = cross_val_predict(
    LinearSVC(),
    train_vector.iloc[:, :-1], train_vector['label']
)
pprint.pprint(classification_report(train_data[1], cv_pred))

print("*" * 10 + "LogisticRegression" + "*" * 10)

cv_pred = cross_val_predict(
    LogisticRegression(),
    train_vector.iloc[:, :-1], train_vector['label']
)
pprint.pprint(classification_report(train_data[1], cv_pred))

**********KNN**********
('                       precision    recall  f1-score   support\n'
 '\n'
 '         Alarm-Update       0.95      0.95      0.95      1264\n'
 '           Audio-Play       0.40      0.55      0.46       226\n'
 '       Calendar-Query       0.97      0.96      0.96      1214\n'
 '        FilmTele-Play       0.69      0.75      0.72      1355\n'
 'HomeAppliance-Control       0.94      0.94      0.94      1215\n'
 '           Music-Play       0.80      0.78      0.79      1304\n'
 '                Other       0.22      0.19      0.20       214\n'
 '         Radio-Listen       0.93      0.84      0.88      1285\n'
 '       TVProgram-Play       0.55      0.56      0.55       240\n'
 '         Travel-Query       0.92      0.95      0.93      1220\n'
 '           Video-Play       0.83      0.79      0.81      1334\n'
 '        Weather-Query       0.91      0.90      0.91      1229\n'
 '\n'
 '             accuracy                           0.85     12100\n'
 '          

KeyboardInterrupt: 