In [1]:
# 第一部分：
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# 读取数据
df_train = pd.read_csv('../DataSets/DaGuan/train_set.csv')    # 若内存不足可添加参数:nrows=1000
df_test = pd.read_csv('../DataSets/DaGuan//test_set.csv')

df_train.head(10)

Unnamed: 0,id,article,word_seg,class
0,0,7368 1252069 365865 755561 1044285 129532 1053...,816903 597526 520477 1179558 1033823 758724 63...,14
1,1,581131 165432 7368 957317 1197553 570900 33659...,90540 816903 441039 816903 569138 816903 10343...,3
2,2,7368 87936 40494 490286 856005 641588 145611 1...,816903 1012629 957974 1033823 328210 947200 65...,12
3,3,299237 760651 299237 887082 159592 556634 7489...,563568 1239563 680125 780219 782805 1033823 19...,13
4,4,7368 7368 7368 865510 7368 396966 995243 37685...,816903 816903 816903 139132 816903 312320 1103...,12
5,5,7368 1160791 299237 1238054 569999 1044285 117...,816903 669476 21577 520477 1004165 4184 616471...,13
6,6,893673 7368 836872 674898 231468 856005 105964...,277781 816903 1098157 986174 1033823 780491 10...,1
7,7,1122654 125310 907560 1172361 979583 983951 12...,289186 640942 363388 585102 261174 1217680 520...,10
8,8,793790 599682 1223643 1030656 569999 178976 45...,1257015 966562 1054308 599826 811205 520477 28...,10
9,9,7368 1120647 360394 79747 1140778 472252 7368 ...,816903 266069 1226448 1276450 816903 769051 12...,19


In [2]:
#第二部分：将原始数据数字化为tf-idf特征，并将结果保存至本地

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import time

t_start = time.time()


#1 数据预处理

print("1 数据预处理")
df_train.drop(columns='article', inplace=True)
df_test.drop(columns='article', inplace=True)

f_all = pd.concat(objs=[df_train, df_test], axis=0, sort=True)
y_train = (df_train['class'] - 1).values


#2 特征工程

print("2 特征工程")
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, sublinear_tf=True)
vectorizer.fit(df_train['word_seg'])
x_train = vectorizer.transform(df_train['word_seg'])
x_test = vectorizer.transform(df_test['word_seg'])


#3 保存至本地

print("3 保存至本地")
data = (x_train, y_train, x_test)
fp = open('../DataSets/DaGuan/TF-IDF.pkl', 'wb')
pickle.dump(data, fp)
fp.close()

t_end = time.time()
print("已将原始数据数字化为tfidf特征，共耗时：{}min".format((t_end-t_start)/60))

1 数据预处理
2 特征工程
3 保存至本地
已将原始数据数字化为tfidf特征，共耗时：9.539328424135844min


In [None]:
# 练习：
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'The Old Man and the Sea, a novella written by Hemingway in Cuba in 1951.',
    'It was published in 1952.',
    "It is one of Hemingway's most famous works. ",
    'It tells the story of an old Cuban fisherman fighting a huge Marlin in the Gulf Stream far from shore. ',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
 
print(X.shape)

['1951', '1952', 'an', 'and', 'by', 'cuba', 'cuban', 'famous', 'far', 'fighting', 'fisherman', 'from', 'gulf', 'hemingway', 'huge', 'in', 'is', 'it', 'man', 'marlin', 'most', 'novella', 'of', 'old', 'one', 'published', 'sea', 'shore', 'story', 'stream', 'tells', 'the', 'was', 'works', 'written']
(4, 35)


In [1]:
# 第三部分：word2vec
import pandas as pd
import gensim
import time
import pickle
import numpy as np
import csv,sys
vector_size = 100

maxInt = sys.maxsize
decrement = True
while decrement:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.
    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt/10)
        decrement = True


# 0 辅助函数
def sentence2list(sentence):
    return sentence.strip().split()

start_time = time.time()

data_path = '../DataSets/DaGuan/'
feature_path = '../DataSets/DaGuan/feature_file/'
proba_path = '../DataSets/DaGuan/proba_file/'
model_path = '../DataSets/DaGuan/model_file/'
result_path ='../DataSets/DaGuan/result/'


# 1 准备训练数据
print("准备数据................ ")
df_train = pd.read_csv('../DataSets/DaGuan/train_set.csv')
df_test = pd.read_csv('../DataSets/DaGuan/test_set.csv')

sentences_train = list(df_train.loc[:, 'word_seg'].apply(sentence2list))
sentences_test = list(df_test.loc[:, 'word_seg'].apply(sentence2list))
sentences = sentences_train + sentences_test
print("准备数据完成! ")

# 2 训练
model = gensim.models.Word2Vec(sentences=sentences, size=vector_size, window=5, min_count=5, workers=8, sg=0, iter=5)
print("训练完成! ")


# 3 提取词汇表及vectors,并保存
print(" 保存训练结果........... ")
wv = model.wv
vocab_list = wv.index2word
word_idx_dict = {}
for idx, word in enumerate(vocab_list):
    word_idx_dict[word] = idx
    
vectors_arr = wv.vectors
vectors_arr = np.concatenate((np.zeros(vector_size)[np.newaxis, :], vectors_arr), axis=0)#第0位置的vector为'unk'的vector

f_wordidx = open(feature_path + 'word_seg_word_idx_dict.pkl', 'wb')
f_vectors = open(feature_path + 'word_seg_vectors_arr.pkl', 'wb')
pickle.dump(word_idx_dict, f_wordidx)
pickle.dump(vectors_arr, f_vectors)
f_wordidx.close()
f_vectors.close()
print("训练结果已保存到该目录下！ ")

end_time = time.time()
print("耗时：{}s ".format(end_time - start_time))

准备数据................ 
准备数据完成! 
训练完成! 
 保存训练结果........... 
训练结果已保存到该目录下！ 
耗时：4943.745208501816s 


In [1]:
# 第四部分：逻辑回归模型

import pickle
import pandas as pd
import time
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  train_test_split
time_start = time.time()

data_path = '../DataSets/DaGuan/'
feature_path = '../DataSets/DaGuan/feature_file/'
proba_path = '../DataSets/DaGuan/proba_file/'
model_path = '../DataSets/DaGuan/model_file/'
result_path ="../DataSets/DaGuan/result/"


#0 读取特征
print("0 读取特征")
data_fp = open(feature_path + "data_w_tfidf.pkl", 'rb')
x_train, y_train, x_test = pickle.load(data_fp)

xTrain, xTest, yTrain, yTest = train_test_split(x_train, y_train, test_size=0.30, random_state=531)


#1 模型训练
print("模型训练")
lr = LogisticRegression(C=120,dual=True)
lr.fit(x_train,y_train)


#2 保存模型

print('2 保存模型')
joblib.dump(lr, model_path + "LR(120)_data_w_tfidf.m")


#3 预测结果 
print("预测结果")
y_test = lr.predict(x_test)


#4 保存结果 
print("保存结果")
y_test = [i+1 for i in list(y_test)]
df_result = pd.DataFrame({'id':range(len(y_test)),'class':y_test})

df_result.to_csv(result_path + 'LR(c120)_data_w_tfidf.csv',index=False)

time_end = time.time()
print('共耗时：{:.2f}min'.format((time_start-time_end)/60))

0 读取特征
模型训练




2 保存模型
预测结果
保存结果
共耗时：-22.51min


In [1]:
# SVM：

import pickle
import pandas as pd
import time
from sklearn import svm
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from sklearn.model_selection import  train_test_split
time_start = time.time()

data_path = '../DataSets/DaGuan/'
feature_path = '../DataSets/DaGuan/feature_file/'
proba_path = '../DataSets/DaGuan/proba_file/'
model_path = '../DataSets/DaGuan/model_file/'
result_path ="../DataSets/DaGuan/result/"


#0 读取特征
print("0 读取特征")
data_fp = open(feature_path  + "data_w_tfidf.pkl", 'rb')
x_train, y_train, x_test = pickle.load(data_fp)

xTrain, xTest, yTrain, yTest = train_test_split(x_train, y_train, test_size=0.30, random_state=531)


#1 模型训练
print("模型训练")
# clf = joblib.load('linearsvm_model_Tfid.1.m')
clf = svm.LinearSVC(C=5,dual=False)
clf.fit(x_train,y_train)


#2 保存模型
print('2 保存模型')
joblib.dump(clf, model_path + "SVM(c5)_data_w_tfidf.m")


#3 预测结果 

print("预测结果")
y_test = clf.predict(x_test)


#4 保存结果 
print("保存结果")
y_test = [i+1 for i in list(y_test)]
df_result = pd.DataFrame({'id':range(len(y_test)),'class':y_test})
df_result.to_csv(result_path + 'SVM(c5)_data_w_tfidf.csv',index=False)

time_end = time.time()
print('共耗时：{:.2f}min'.format((time_start-time_end)/60))

0 读取特征
模型训练
2 保存模型
预测结果
保存结果
共耗时：-21.41min


In [None]:
# 第五部分：lightGBM模型： 24小时才训练了21次，设定800次，至少要在电脑上面运行一个月。

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import time
import pickle
import lightgbm as LGB
from sklearn.externals import joblib

t_start = time.time()
data_path = '../DataSets/DaGuan/'
feature_path = '../DataSets/DaGuan/feature_file/'
proba_path = '../DataSets/DaGuan/proba_file/'
model_path = '../DataSets/DaGuan/model_file/'
result_path ="../DataSets/DaGuan/result/"


# 0 自定义验证集的评价函数
print("0 自定义验证集的评价函数")
def f1_score_vali(preds, data_vali):
    labels = data_vali.get_label()
    preds = np.argmax(preds.reshape(19, -1), axis=0)
    score_vali = f1_score(y_true=labels, y_pred=preds, average='macro')
    return 'f1_score', score_vali, True


#1 读取数据,并转换到LGB的标准数据格式
print("1 读取数据,并转换到LGB的标准数据格式")
data_fp = open(feature_path + 'data_w_tfidf.pkl' , 'rb')
x_train, y_train, x_test = pickle.load(data_fp)
data_fp.close()

#2 划分训练集和验证集，验证集比例为test_size

print("划分训练集和验证集，验证集比例为test_size")
x_train, x_vali, y_train, y_vali = train_test_split(x_train, y_train, test_size=0.1, random_state=0)
d_train = LGB.Dataset(data=x_train, label=y_train)
d_vali = LGB.Dataset(data=x_vali, label=y_vali)


#3 训练LGB分类器
print("3 训练LGB分类器")
params = {
    'boosting': 'gbdt',
    'application': 'multiclassova',
    'num_class': 19,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1,
    'lambda_l1': 0,
    'lambda_l2': 0.5,
    'bagging_fraction': 1.0,

}

bst = LGB.train(params, d_train, num_boost_round=10, valid_sets=d_vali, feval=f1_score_vali,
                early_stopping_rounds=None,
                verbose_eval=True)                   # num_boost_round=800 ： 迭代800次，这里由于笔记本上跑，点到为止

joblib.dump(bst, model_path + "LGB_data_w_tfidf.m")


#4 对测试集进行预测;将预测结果转换为官方标准格式；并将结果保存至本地
print("4 对测试集进行预测;将预测结果转换为官方标准格式；并将结果保存至本地")
y_proba = bst.predict(x_test)
y_test = np.argmax(y_proba, axis=1) + 1

df_result = pd.DataFrame(data={'id': range(5000), 'class': y_test.tolist()})
df_proba = pd.DataFrame(data={'id': range(5000), 'proba': y_proba.tolist()})

df_result.to_csv(result_path  + 'LGB_data_w_tfidf_result.csv', index=False)
df_proba.to_csv(result_path + 'LGB_data_w_tfidf_proba.csv', index=False)
t_end = time.time()
print("训练结束，耗时:{}min".format((t_end - t_start) / 60))

0 自定义验证集的评价函数
1 读取数据,并转换到LGB的标准数据格式
划分训练集和验证集，验证集比例为test_size
3 训练LGB分类器
[1]	valid_0's multi_logloss: 2.18267	valid_0's f1_score: 0.587845
[2]	valid_0's multi_logloss: 1.9783	valid_0's f1_score: 0.624412
[3]	valid_0's multi_logloss: 1.83931	valid_0's f1_score: 0.643124
[4]	valid_0's multi_logloss: 1.73331	valid_0's f1_score: 0.657061
[5]	valid_0's multi_logloss: 1.65183	valid_0's f1_score: 0.661236
