In [1]:
import pandas as pd
import jieba
import matplotlib.pyplot as plt

# Load the dataset. It is read directly over the network here; alternatively,
# download the files first and read them from disk.
data_dir = 'https://mirror.coggle.club/dataset/coggle-competition/'
# Both files are parsed as tab-separated with no header row, so the columns
# are labelled 0, 1, ... (later cells use column 0 as text and column 1 as label).
train_data = pd.read_csv(data_dir + 'intent-classify/train.csv', sep='\t', header=None)
test_data = pd.read_csv(data_dir + 'intent-classify/test.csv', sep='\t', header=None)

# The stop-word file is consumed as a bare word list (only column 0 is kept),
# not as real CSV, so the CSV conveniences are switched off:
#   * na_filter=False keeps words such as "null" / "NA" as strings instead of
#     silently turning them into float NaN;
#   * quoting=3 (csv.QUOTE_NONE) treats a quote character as an ordinary
#     character instead of the start of a quoted, possibly multi-line, field.
cn_stopwords = pd.read_csv(
    'https://mirror.coggle.club/stopwords/baidu_stopwords.txt',
    header=None,
    na_filter=False,
    quoting=3,
)[0].values

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw text (column 0) into TF-IDF features.
# jieba.lcut is used as the tokenizer, so each sentence is split into a list of
# segmented words instead of relying on scikit-learn's regex tokenization.
tfidf = TfidfVectorizer(
    tokenizer=jieba.lcut,
    # token_pattern is ignored whenever a custom tokenizer is supplied; setting
    # it to None states that explicitly and avoids scikit-learn's
    # "token_pattern will not be used" warning.
    token_pattern=None,
    stop_words=list(cn_stopwords)
)
# Learn the vocabulary and IDF weights on the training text only, then reuse
# the fitted vectorizer for the test text so both share one feature space.
train_tfidf = tfidf.fit_transform(train_data[0])
test_tfidf = tfidf.transform(test_data[0])

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.694 seconds.
Prefix dict has been built successfully.


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

In [4]:
# Baseline: logistic regression with default hyperparameters.
# cross_val_predict (default CV splitting) yields one held-out prediction per
# training row, which is then scored against the true labels in column 1.
intent_labels = train_data[1]
cv_pred = cross_val_predict(estimator=LogisticRegression(), X=train_tfidf, y=intent_labels)
print(classification_report(y_true=intent_labels, y_pred=cv_pred))

                       precision    recall  f1-score   support

         Alarm-Update       0.98      0.93      0.96      1264
           Audio-Play       0.74      0.50      0.60       226
       Calendar-Query       0.99      0.95      0.97      1214
        FilmTele-Play       0.70      0.93      0.80      1355
HomeAppliance-Control       0.94      0.97      0.96      1215
           Music-Play       0.88      0.87      0.87      1304
                Other       0.39      0.07      0.11       214
         Radio-Listen       0.94      0.89      0.91      1285
       TVProgram-Play       0.72      0.45      0.55       240
         Travel-Query       0.92      0.96      0.94      1220
           Video-Play       0.90      0.87      0.89      1334
        Weather-Query       0.92      0.96      0.94      1229

             accuracy                           0.89     12100
            macro avg       0.84      0.78      0.79     12100
         weighted avg       0.89      0.89      0.89     12100

In [5]:
# Baseline: linear SVM with default hyperparameters, evaluated the same way:
# held-out predictions from cross_val_predict's default CV splitting, compared
# with the true labels in column 1.
svc_estimator = LinearSVC()
cv_pred = cross_val_predict(svc_estimator, X=train_tfidf, y=train_data[1])
print(classification_report(y_true=train_data[1], y_pred=cv_pred))

                       precision    recall  f1-score   support

         Alarm-Update       0.97      0.95      0.96      1264
           Audio-Play       0.64      0.71      0.67       226
       Calendar-Query       0.98      0.97      0.98      1214
        FilmTele-Play       0.81      0.89      0.85      1355
HomeAppliance-Control       0.97      0.98      0.98      1215
           Music-Play       0.90      0.89      0.89      1304
                Other       0.31      0.25      0.27       214
         Radio-Listen       0.94      0.90      0.92      1285
       TVProgram-Play       0.66      0.62      0.64       240
         Travel-Query       0.95      0.98      0.97      1220
           Video-Play       0.92      0.88      0.90      1334
        Weather-Query       0.96      0.97      0.97      1229

             accuracy                           0.91     12100
            macro avg       0.83      0.83      0.83     12100
         weighted avg       0.91      0.91      0.91     12100

In [6]:
# Baseline: k-nearest-neighbours classifier with default hyperparameters.
# Held-out predictions come from cross_val_predict's default CV splitting and
# are reported per class against the true labels in column 1.
knn_estimator = KNeighborsClassifier()
cv_pred = cross_val_predict(knn_estimator, X=train_tfidf, y=train_data[1])
print(classification_report(y_true=train_data[1], y_pred=cv_pred))

                       precision    recall  f1-score   support

         Alarm-Update       0.84      0.92      0.88      1264
           Audio-Play       0.55      0.63      0.59       226
       Calendar-Query       0.80      0.96      0.88      1214
        FilmTele-Play       0.79      0.79      0.79      1355
HomeAppliance-Control       0.91      0.97      0.94      1215
           Music-Play       0.83      0.83      0.83      1304
                Other       0.20      0.25      0.22       214
         Radio-Listen       0.91      0.83      0.87      1285
       TVProgram-Play       0.55      0.39      0.46       240
         Travel-Query       0.95      0.90      0.93      1220
           Video-Play       0.87      0.73      0.79      1334
        Weather-Query       0.93      0.89      0.91      1229

             accuracy                           0.84     12100
            macro avg       0.76      0.76      0.76     12100
         weighted avg       0.85      0.84      0.84     12100

In [7]:
# Fit the final classifier on the full training set (LinearSVC is the model
# chosen for the submission in this notebook).
model = LinearSVC()
model.fit(train_tfidf, train_data[1])

# Build the submission table: IDs are 1-based row numbers of the test set and
# Target is the predicted intent label for each test row.
submission = pd.DataFrame({
    'ID': range(1, len(test_data) + 1),
    "Target": model.predict(test_tfidf)
})
# index=False is the documented way to omit the row index from the CSV
# (index=None only worked because None is falsy).
submission.to_csv('LinearSVC.csv', index=False)
# The resulting CSV can be submitted for scoring
# (NOTE: the original comment naming the submission target is truncated).