# task2 TFIDF提取与分类

## 读取数据

In [1]:
import pandas as pd

In [3]:
data_dir = './data/'
train_data = pd.read_csv(data_dir + 'intent-recognition-train.csv', sep='\t', header=None)
test_data = pd.read_csv(data_dir + 'intent-recognition-test.csv', sep='\t', header=None)


## 提取特征  
使用sklearn中的TfidfVectorizer类提取训练集和测试集的特征

In [4]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
train_corpus = train_data[0]

vectorizer = TfidfVectorizer()
train_vec = vectorizer.fit_transform(train_corpus)
vectorizer.get_feature_names_out()

print(train_vec.shape)

(12100, 13085)


In [13]:
test_cropus = test_data[0]

# 使用相同的向量化器对象将测试集转换为TF-IDF特征向量

test_vec = vectorizer.transform(test_cropus)
vectorizer.get_feature_names_out()

print(test_vec.shape)

(3000, 13085)


TfidfVectorizer类中可以设置以下参数：  
- max_df: 用于过滤掉高频词项，在[0.0, 1.0]之间表示比例；  
- min_df: 用于过滤掉低频词项，在[0.0, 1.0]之间表示比例；  
- max_features: 用于限制提取特征的数量，默认为None。  
- ngram_range: 用于指定提取n元语法特征时n值范围，默认为(1, 1)，即只提取单个词项。  
- stop_words: 用于指定停用词列表，默认为None。  
- norm: 用于指定归一化方法，默认为’l2’范数。  
- use_idf: 是否使用逆文档频率计算权重，默认为True。  
- smooth_idf: 是否平滑逆文档频率计算，默认为True  

## 训练&预测  
使用KNN/LR/SVM等分类器对训练集进行训练，并对验证集和测试集进行预测，评估模型的性能。

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier



In [15]:
X_train = train_vec
y_train = train_data[1]
X_test = test_vec

In [19]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
score_lr = lr.score(X_train, y_train)
out_lr = lr.predict(X_test)
print('classifier score: {}'.format(score_lr))
print('predict score: {}'.format(out_lr))
# 0.17


logistic regression classifier score: 0.946198347107438
predict score: ['FilmTele-Play' 'FilmTele-Play' 'FilmTele-Play' ... 'FilmTele-Play'
 'FilmTele-Play' 'FilmTele-Play']


In [21]:
svc = SVC()
svc.fit(X_train, y_train)
score_svc = svc.score(X_train, y_train)
out_svc = svc.predict(X_test)
print('classifier score: {}'.format(score_svc))
print('predict score: {}'.format(out_svc))
# 0.218


logistic regression classifier score: 1.0
predict score: ['FilmTele-Play' 'FilmTele-Play' 'FilmTele-Play' ... 'FilmTele-Play'
 'FilmTele-Play' 'FilmTele-Play']


In [44]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
score_knn = knn.score(X_train, y_train)
out_knn = knn.predict(X_test)
print('classifier score: {}'.format(score_knn))
print('predict score: {}'.format(out_knn))
# 0.090  5
# 0.075  12
# 0.168  3
# 0.185  2
# 0.186  1

logistic regression classifier score: 1.0
predict score: ['Video-Play' 'Video-Play' 'Video-Play' ... 'Video-Play' 'Video-Play'
 'Video-Play']


In [17]:
rfc = RandomForestClassifier()  
rfc.fit(X_train, y_train)
score_rfc = rfc.score(X_train, y_train)
out_rfc = rfc.predict(X_test)
print('classifier score: {}'.format(score_rfc))
print('predict score: {}'.format(out_rfc))
# 0.213

random forest classifier score: 1.0
predict score: ['FilmTele-Play' 'FilmTele-Play' 'FilmTele-Play' ... 'FilmTele-Play'
 'FilmTele-Play' 'FilmTele-Play']


In [49]:
voting = VotingClassifier(estimators=[('lr', lr), ('svc', svc), ('rfc', rfc)])  # ('knn', knn)
voting.fit(X_train, y_train)
score_voting = rfc.score(X_train, y_train)
out_voting = rfc.predict(X_test)
print('classifier score: {}'.format(score_voting))
print('predict score: {}'.format(out_voting))
# 0.213

classifier score: 1.0
predict score: ['FilmTele-Play' 'FilmTele-Play' 'FilmTele-Play' ... 'FilmTele-Play'
 'FilmTele-Play' 'FilmTele-Play']


In [50]:
out = list(out_knn)
data = [[i, out[i]] for i in range(len(out))]
# 将列表数据转换为DataFrame对象
df = pd.DataFrame(data, columns=['ID', 'Target'])

# 保存DataFrame对象到文件中
df.to_csv('./data/save.csv', index=False)