In [1]:
import pandas as pd
import numpy as np
import codecs # 编码类型转化
import jieba as jb

dpath = "./"
Train = pd.read_csv(dpath + "training.csv", header=None, encoding = "utf-8")
Train.columns = ["Label", "Text"]
Train.head()

Unnamed: 0,Label,Text
0,2,合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...
1,2,公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。
2,1,公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...
3,2,公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...
4,2,该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...


In [2]:
# 读取停用词
def read_file(file_path):
    f = codecs.open(file_path, mode="r", encoding="unicode_escape")
    lines = []
    for line in f:
        line = line.rstrip("\n").rstrip("\r")
        lines.append(line)
    return lines

In [3]:
stopwords = read_file(dpath + "stopwords.txt")

In [4]:
# 分词
def segment_text(each_row):
    return " ".join([word for word in jb.lcut(each_row["Text"]) if word not in stopwords]) 

In [5]:
Train["text_segmentation"] = Train.apply(segment_text, axis=1)
Train.head()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\MrGuo\AppData\Local\Temp\jieba.cache
Loading model cost 0.844 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,Label,Text,text_segmentation
0,2,合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...,合晟 资产 是 一家 专注 于 股票 、 债券 等 二级 市场 投资 ， 为 合格 投资者 ...
1,2,公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。,公司 的 主营业务 为 向 中小 微 企业 、 个体 工商户 、 农户 等 客户 提供 贷款...
2,1,公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...,公司 立足于 商业地产 服务 ， 致力于 为 商业地产 开发 、 销售 、 运营 全 产业链...
3,2,公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...,公司 经 工商管理 部门 核准 的 经营范围 为 “ 投资 咨询 、 经济 信息 咨询 ， ...
4,2,该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...,该 公司 的 主营业务 为 在 中国 境内 港 、 澳 、 台 除外 开展 保险代理 销售 ...


In [6]:
# TFIDF特征
# 先调用CountVectorizer和TfidfTrainsformer两种方法
from sklearn. feature_extraction.text import CountVectorizer, TfidfTransformer

vectorizer = CountVectorizer(min_df=10)
transformer = TfidfTransformer()
train_tfidf = transformer.fit_transform(vectorizer.fit_transform(Train["text_segmentation"])).toarray()

df_train_tfidf = pd.DataFrame(data=train_tfidf)
df_train_tfidf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3594,3595,3596,3597,3598,3599,3600,3601,3602,3603
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df_train_tfidf["Label"] = Train["Label"]
df_train_tfidf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3595,3596,3597,3598,3599,3600,3601,3602,3603,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [8]:
# 保存结果
df_train_tfidf.to_csv(dpath + "FE_train_tfidf.csv", index=False, header=True)