In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score

In [2]:
train_df = pd.read_csv('./dataset/train_set.csv', sep='\t', nrows=15000)

In [4]:
vectorizer = CountVectorizer(max_features=3000)
train_test = vectorizer.fit_transform(train_df['text'])

clf = RidgeClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])

val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

0.7408571860769074


In [9]:
"""  
ngram_range: 选择的N元语法范围
max_features: 构造时取前max_features大的词频
f1_score: 综合查全查准的度量 
tfidf参数组合与f1_score如下：
ngram_range  max_features  f1_score
(1, 3)       3000          0.8719
(1, 4)       3000          0.8736
(2, 3)       4000          0.8520
由于这些函数实际上参数要更多，所以没有以上结果没有太多参考价值，仅做熟悉API之用。
"""

tfidf = TfidfVectorizer(ngram_range=(2, 3), max_features=4000)
train_test = tfidf.fit_transform(train_df['text'])

clf = RidgeClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])

val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

0.852024733248656


In [10]:
""" 
SVM
"""

from sklearn.svm import SVC

clf = SVC()
clf.fit(train_test[:10000], train_df['label'].values[:10000])

val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

0.8279217518521582


In [12]:
"""
LogisticRegression
"""

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=500)
clf.fit(train_test[:10000], train_df['label'].values[:10000])

val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

0.8248649355399026


In [13]:
"""
Lightgbm 
"""
from lightgbm import LGBMClassifier

clf = LGBMClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])

val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

0.849249098663161
