## Linear Regression Model

(Based on bag_of_words representation)

In [1]:
import re
import jieba
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

### Stop words list

In [2]:
# global variables initialisation
# Read the stop-word file (one entry per line) into a list, stripping surrounding whitespace.
with open('../stop_words/中文停用词表.txt', 'r', encoding='UTF-8-sig') as f:
    stop_words = [line.strip() for line in f]
# The first 26 entries are kept separately under the name `symbols`.
symbols = stop_words[:26]
# Print entries 23-32 as a quick sanity check of the loaded list.
print('e.g.', stop_words[23:33])

e.g. ['？', '.', '%', '一', '一些', '一何', '一切', '一则', '一方面', '一旦']


### Data

In [None]:
# Single input
# Replace the placeholder string below with the path of the Excel file to load.
data_path = '请输入数据路径'  # e.g. data/record1.xls
all_data = pd.read_excel(data_path)
# Print the column / non-null / dtype summary of the loaded frame.
all_data.info()

In [3]:
# Multiple inputs
# If every record file were merged into one Excel workbook, the single-input
# cell above could be used instead.
# Read record1.xls .. record17.xls and concatenate them in a single call:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration. sort=False keeps columns in first-seen order.
record_paths = ["../data/record" + str(i) + ".xls" for i in range(1, 18)]
all_data = pd.concat([pd.read_excel(path) for path in record_paths], sort=False)
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58611 entries, 0 to 693
Data columns (total 13 columns):
文档号码                                58611 non-null int64
证券代码（请务必使用text格式 以保留代码中的0）          30412 non-null object
证券简称                                41984 non-null object
投资者关系活动类别                           41287 non-null object
参与机构数量（家）（0代表0个单位，空白代表无相关信息）        41992 non-null object
参与机构（请用分号隔开，并统一使用证券公司名称（申银万国 等））    30187 non-null object
参与投资者数量                             41987 non-null object
日期（格式统一为xx/xx/xxxx（日月年））            41992 non-null object
接待人员数量                              41992 non-null object
投资者关系活动主要内容介绍                       41986 non-null object
Unnamed: 10                         16619 non-null object
证券代码（请务必使用text格式以保留代码中的0）           11579 non-null object
参与机构（请用分号隔开；并统一使用证券公司名称（申银万国等））     11580 non-null object
dtypes: int64(1), object(12)
memory usage: 6.3+ MB


In [4]:
# Keep only the columns needed downstream and drop incomplete rows.
# The shape is printed before and after so the number of dropped rows is visible.
wanted_columns = ['文档号码','投资者关系活动主要内容介绍', '证券代码（请务必使用text格式 以保留代码中的0）', '日期（格式统一为xx/xx/xxxx（日月年））']
data = all_data[wanted_columns]
print('Before cleaning:')
print(data.shape)
data = data.dropna() # drop rows with null values
print('After cleaning:')
print(data.shape)

Before cleaning:
(58611, 4)
After cleaning:
(30406, 4)


### Merge Data by Cusip and Date

In [5]:
# Give the four kept columns short ASCII names: ID, content, cusip, date.
# NOTE(review): cells further down still look up '文档号码' and
# '投资者关系活动主要内容介绍' on `data`; those labels no longer exist after this
# rename -- confirm which frame those cells are meant to run against.
data = data.rename(columns={"文档号码": "ID", "投资者关系活动主要内容介绍": "content", 
                     "证券代码（请务必使用text格式 以保留代码中的0）": "cusip",
                     "日期（格式统一为xx/xx/xxxx（日月年））": "date"})
data.head(10)

Unnamed: 0,ID,content,cusip,date
0,1200573239,一、问题回答。\n1、非公开发行的进展情况如何？\n答：目前中国证监会正在审核公司非公开发行...,2635,2015-01-23 00:00:00
1,1200573241,会议主要关注BE部分。\n\n问题一：BE最近有何变化？\n回答：公司目前业务进展顺利，业务...,2594,2015-01-22 00:00:00
2,1200573242,会议主要关注BYD部分。\n\n问题一：公司新能源汽车的目前的进展情况？\n回答：公司将新能...,2594,2015-01-23 00:00:00
3,1200573245,问：请您谈谈公司2015年的收入季节性是否还会存在？\n答：公司2013、2014年的第三季...,2439,2015-01-22 00:00:00
5,1200573285,1、公司目前的出口情况？\n答：近几年来，公司加强了海外出口的投入力度，组建了专门的国际贸易...,2690,2015-01-22 00:00:00
8,1200573318,国联证券研究员于2015年01月22日到公司化工新材料产业园区进行现场调研座谈，主要内容如下...,830,2015-01-22 00:00:00
9,1200573672,1.公司机床的优势？\n公司具有规模优势，公司是国内机床产品种类最为齐全的机床类企业。具备提...,410,2015-01-22 00:00:00
10,1200573678,一、签署《调研来访承诺书》；\n二、回答调研提问，主要问答简述如下：\n1、问：公司规划建新...,49,2015-01-23 00:00:00
11,1200573726,一、交流内容\n董事会秘书：欢迎英大基金来访本公司。投资者和研究人员来访本公司须遵循深圳交易...,2579,2015-01-23 00:00:00
12,1200573767,1、本次收购方式、交易价格及收购资金来源？\n答：公司全资子公司翼鹏投资、公司及公司实际控制...,2324,2015-01-23 00:00:00


In [6]:
# Load the reference table from the working directory and preview its first rows.
ref_data = pd.read_excel('ref_data.xlsx')
ref_data.head(10)

Unnamed: 0,year,cdt,count,cusip,dtd,edf,Rating,rating_edf,dif,dividend_,logassets,logliquidity,PM,O/S,L/EV,L/E,EquVol,E/TA,ROA
0,2007,200701,0,000002.SZ,2.534177,0.00563559,,,,14.185946,,22.063253,,,,,3.792838,,
1,2007,200702,0,000002.SZ,4.025581,2.8417e-05,,,,14.185946,,22.607333,,,,,3.817393,,
2,2007,200703,0,000002.SZ,4.839769,6.5e-07,,,,14.185946,,22.423461,,,,,3.933348,,
3,2007,200704,0,000002.SZ,5.073994,1.95e-07,,,,14.185946,25.32938,22.20479,,,0.348161,1.95094,3.992363,0.338875,
4,2008,200801,0,000002.SZ,4.257932,1.0316e-05,,,,13.630978,25.32938,22.215737,,,0.394585,1.95094,4.033789,0.338875,
5,2008,200802,0,000002.SZ,3.086551,0.00101247,,,,13.630978,25.433708,22.30769,,,0.762528,2.101394,4.125848,0.322436,0.019518
6,2008,200803,0,000002.SZ,1.174012,0.1201952,AA+,0.0038,0.116395,13.630978,25.433708,22.513304,,,1.081443,2.101394,4.174401,0.322436,0.019518
7,2008,200804,0,000002.SZ,-0.039661,0.5158184,AA+,0.0038,0.512018,13.630978,25.504375,23.226045,,,1.159223,2.071639,4.19704,0.325559,0.036777
8,2009,200901,0,000002.SZ,0.549717,0.2912567,AA+,0.0,0.291257,14.440949,25.504375,23.074895,,,0.898053,2.071639,4.168738,0.325559,0.036777
9,2009,200902,0,000002.SZ,1.943572,0.0259736,AA+,0.0,0.025974,14.440949,25.547732,23.153981,,,0.606388,1.978594,4.153947,0.335729,0.020712


In [7]:
from datetime import datetime

In [8]:
# Renumber the rows 0..n-1 (the old index is discarded) so that each row has a
# unique label for the label-based row assignment in the date check below.
data = data.reset_index(drop=True)

有部分的date格式不对，进行提前清理

In [9]:
# Blank out every row whose 'date' value cannot be formatted as "%Y%m"
# (e.g. values that are not datetime-like); the following dropna() removes them.
# Only the 'date' column is iterated, which avoids building a Series per row.
invalid_rows = []
for row_label, raw_date in data['date'].items():
    try:
        raw_date.strftime("%Y%m")
    except Exception:
        # `except Exception` keeps the best-effort filtering but, unlike a bare
        # `except:`, lets KeyboardInterrupt / SystemExit propagate.
        invalid_rows.append(row_label)
# Skip the assignment when nothing is invalid so column dtypes stay untouched.
if invalid_rows:
    data.loc[invalid_rows] = None

In [10]:
# Drop the rows that now contain nulls and renumber the remaining rows.
data = data.dropna().reset_index(drop=True)

In [11]:
# Collapse every date to a 'YYYYMM' string (year and month only).
data['date'] = data['date'].apply(lambda x: x.strftime("%Y%m"))

In [12]:
# Make sure every 'date' value is a plain str.
data['date'] = data['date'].apply(str)

In [13]:
# Convert the reference table's 'cdt' codes to str as well.
ref_data['cdt'] = ref_data['cdt'].apply(str)

In [14]:
# Show the distinct 'date' values of the records in ascending order.
np.sort(data['date'].unique())

array(['200102', '200103', '200105', '200108', '200201', '200310',
       '201104', '201105', '201110', '201201', '201202', '201203',
       '201204', '201205', '201206', '201207', '201208', '201209',
       '201210', '201211', '201212', '201301', '201302', '201303',
       '201304', '201305', '201306', '201307', '201308', '201309',
       '201310', '201311', '201312', '201401', '201402', '201403',
       '201404', '201405', '201406', '201407', '201408', '201409',
       '201410', '201411', '201412', '201501', '201502', '201503',
       '201504', '201505', '201506', '201507', '201508', '201509',
       '201510', '201511', '201512', '201601', '201602', '201603',
       '201604', '201605', '201606', '201607', '201608', '201609',
       '201610', '201611', '201612', '201701', '201702', '201703',
       '201704', '201705', '201706', '201707', '201708', '201709',
       '201710', '201711', '201712', '201801', '201802'], dtype=object)

In [15]:
# Show the distinct 'cdt' values of the reference table in ascending order.
np.sort(ref_data['cdt'].unique())

array(['200701', '200702', '200703', '200704', '200801', '200802',
       '200803', '200804', '200901', '200902', '200903', '200904',
       '201001', '201002', '201003', '201004', '201101', '201102',
       '201103', '201104', '201201', '201202', '201203', '201204',
       '201301', '201302', '201303', '201304', '201401', '201402',
       '201403', '201404', '201501', '201502', '201503', '201504',
       '201601', '201602', '201603', '201604', '201701', '201702',
       '201703', '201704', '201801', '201802', '201803', '201804'],
      dtype=object)

选取2012第一季度到2018第一季度的data并转换日期格式

In [29]:
# year-month to year-season
def yyyymm_to_yyyyss(yyyymm):
    """Map a 'YYYYMM' string to 'YYYY' followed by a two-digit quarter code.

    Months '01'-'03' give '01', '04'-'06' give '02', '07'-'09' give '03'
    and '10'-'12' give '04'. Any other month prints a message and yields None.
    The input must be exactly six characters long (checked with an assert).
    """
    assert len(yyyymm) == 6
    year, month = yyyymm[:4], yyyymm[-2:]
    quarter_of_month = {
        '01': '01', '02': '01', '03': '01',
        '04': '02', '05': '02', '06': '02',
        '07': '03', '08': '03', '09': '03',
        '10': '04', '11': '04', '12': '04',
    }
    quarter = quarter_of_month.get(month)
    if quarter is None:
        print('wrong month: {}'.format(month))
        return None
    return year + quarter

# Demonstrate the invalid-month branch.
yyyymm_to_yyyyss('201513')

wrong month: 13


In [32]:
# Replace each 'YYYYMM' value with its year+quarter code.
data['date'] = data['date'].apply(yyyymm_to_yyyyss)

In [35]:
# Show the distinct year+quarter codes now present, in ascending order.
np.sort(data['date'].unique())

array(['200101', '200102', '200103', '200201', '200304', '201102',
       '201104', '201201', '201202', '201203', '201204', '201301',
       '201302', '201303', '201304', '201401', '201402', '201403',
       '201404', '201501', '201502', '201503', '201504', '201601',
       '201602', '201603', '201604', '201701', '201702', '201703',
       '201704', '201801'], dtype=object)

In [40]:
def keep_date(date):
    """Return `date` unchanged when its leading four digits are a year in 2012-2018 (inclusive), else None."""
    year = int(date[:4])
    return date if 2012 <= year <= 2018 else None
    
# Preview which year+quarter codes survive the 2012-2018 filter (data is not modified here).
np.sort(data['date'].apply(keep_date).dropna().unique())

array(['201201', '201202', '201203', '201204', '201301', '201302',
       '201303', '201304', '201401', '201402', '201403', '201404',
       '201501', '201502', '201503', '201504', '201601', '201602',
       '201603', '201604', '201701', '201702', '201703', '201704',
       '201801'], dtype=object)

In [41]:
# Replace every out-of-range date with None.
data['date'] = data['date'].apply(keep_date)

In [45]:
# Drop the rows that contain nulls (this includes dates set to None by keep_date).
data = data.dropna()

至此data里仅保留2012第一季度到2018第一季度（2018最高只到二月份）的数据

In [63]:
# Number of distinct 'cusip' values in the reference table.
len(np.unique(ref_data['cusip']))

298

In [59]:
def check(cusip):
    """Return `cusip` when its length is exactly 6, otherwise None."""
    return cusip if len(cusip) == 6 else None

In [62]:
# Count the distinct 'cusip' values in the records that are exactly six characters long.
data['cusip'].apply(check).dropna().nunique()

1582

In [68]:
# Rename the 'date' column to 'yyyyss'.
data = data.rename(columns={"date": "yyyyss"})

In [None]:
# Write the filtered records to an Excel file in the working directory, without the index column.
data.to_excel('all_records_2012_2018.xlsx', index=False)

### Preprocessing

In [5]:
# All helper functions needed for data preprocessing

# Strip a document down to Chinese characters, ASCII digits and a fixed set of punctuation marks.
def clean(doc):
    """Return `doc` with every character removed except those matched by the pattern below.

    Kept characters: code points U+4E00-U+9FA5, the digits 0-9, and the
    punctuation “ ” 、 。 《 》 ！ ， ： ； ？ . %
    """
    kept_chars = re.compile(r'([\u4E00-\u9FA5]|[0-9]|[“”、。《》！，：；？\.%])')
    return "".join(match.group(1) for match in kept_chars.finditer(doc))

# sentence segmentation
def sent_seg(cleaned_doc):
    """Split text into sentences, each ending in ？, 。 or ！.

    Every returned sentence includes its terminator; text after the last
    terminator is not returned.
    """
    sentence_pattern = re.compile(r'.+?[？。！]')
    return [match.group(0) for match in sentence_pattern.finditer(cleaned_doc)]

def pure_sent(sent):
    """Return only the characters of `sent` in the range U+4E00-U+9FA5, in their original order."""
    chinese_char = re.compile(r'([\u4E00-\u9FA5])')
    return ''.join(chinese_char.findall(sent))
        
# The size of a document is the total number of Chinese characters in its sentences.
def raw_process(doc):
    """Clean and segment `doc`, returning its sentences and two length statistics.

    Returns a dict with:
      'sents'        -- the sentences produced by sent_seg(clean(doc))
      'size'         -- summed length of pure_sent(sentence) over all sentences
      'avg_sent_len' -- 'size' divided by the number of sentences
    When no sentence is found the dict holds an empty list and zeros.
    """
    cleaned_doc = clean(doc)
    sents = sent_seg(cleaned_doc)
    if cleaned_doc and len(sents):
        total_length = sum(len(pure_sent(sent)) for sent in sents)
        return {
            'sents': sents,
            'size': total_length,
            'avg_sent_len': total_length / len(sents),
        }
    return {'sents': [], 'size': 0, 'avg_sent_len': 0}

# generate frequency distribution for each document, vital step for bag_of_words representation
def gen_freq_dist(doc):
    """Count the words of `doc`, skipping stop words and tokens containing digits or punctuation.

    The document is cleaned and split into sentences by raw_process, and each
    sentence is tokenised with jieba (accurate mode, HMM enabled).

    Returns a dict with:
      'freq_dist'    -- dict mapping each kept word to its number of occurrences
      'size'         -- 'size' as reported by raw_process
      'avg_sent_len' -- 'avg_sent_len' as reported by raw_process
      'n_sents'      -- number of sentences found
    """
    stat = raw_process(doc)
    sents = stat['sents']
    freq_dist = dict()
    # Any token containing one of these characters is discarded.
    pa = re.compile(r'([$0123456789?_“”、。《》！，：；？\.%])')
    # `stop_words` is a list; a set makes the per-token membership test O(1)
    # instead of a linear scan.
    stop_word_set = set(stop_words)
    for sent in sents:
        for word in jieba.cut(sent, cut_all=False, HMM=True):
            # ignore all the stop words
            if word not in stop_word_set and not pa.search(word):
                freq_dist[word] = freq_dist.get(word, 0) + 1
    return { 'freq_dist' : freq_dist,
             'size' : stat['size'],
             'avg_sent_len' : stat['avg_sent_len'],
             'n_sents' : len(sents)
           }

### Full frequency distribution

1. 如果已经对当前的data完整运行过readability，请将readability文件夹里面的all_freq_dist.json复制到regression文件夹里，从而复用数据，并使用下数第二个cell进行读取。
2. 如果尚未生成当前data的完整freq_dist，请跑一次下数第一个cell（One-time-block）进行生成。

In [None]:
# One-time block
# Build the complete frequency distribution; it is recommended to run this only
# once and store the result for reuse.
def init_all_freq_dist():
    """Sum the per-document word counts of every row in the global `data` frame.

    For each row the value in the second column (position 1) is passed to
    gen_freq_dist and its 'freq_dist' counts are added to the running total.
    A progress line is printed per document.

    Returns a dict mapping each word to its total count over all documents.
    """
    all_freq_dist = dict()
    for count, (index, d) in enumerate(data.iterrows()):
        print('[' + str(count) + '] Processing document ' + str(d['文档号码']) + '...')
        # .iloc makes the positional lookup explicit; plain d[1] on a
        # string-labelled row relies on a deprecated pandas fallback.
        fd = gen_freq_dist(d.iloc[1])['freq_dist']
        for word, freq in fd.items():
            all_freq_dist[word] = all_freq_dist.get(word, 0) + freq
    return all_freq_dist

# Build the full frequency distribution and store it as JSON in the working
# directory. The file is written with the same 'UTF-8-sig' encoding that the
# loading cell uses to read it back.
all_freq_dist = init_all_freq_dist()
with open('all_freq_dist.json', 'w+', encoding='UTF-8-sig') as f:
    json.dump(all_freq_dist, f)

In [7]:
# If the previous cell has already been run to completion once, running this
# cell alone is enough to obtain the full frequency distribution.
with open('all_freq_dist.json', 'r', encoding='UTF-8-sig') as f:
    all_freq_dist = json.load(f)
# One row per word (the word is the index label), with its total count in 'freq'.
all_freq_dist_df = pd.DataFrame.from_dict(all_freq_dist, orient='index', columns=['freq'])
# idxmax() returns the index label (the word itself); np.argmax on a Series
# returns an integer position on current pandas.
print('Most frequent word is: ' + str(all_freq_dist_df['freq'].idxmax()))
all_freq_dist_df.describe()

Most frequent word is: 公司


Unnamed: 0,freq
count,128918.0
mean,137.010472
std,2841.91742
min,1.0
25%,1.0
50%,3.0
75%,13.0
max,770176.0


### Bag of words construction

有了完整的freq_dist以后就能把每个document都转换成bag_of_words形式。

In [8]:
# First remove every word whose total frequency is 3 or lower.
low_freq_words = [word for word, freq in all_freq_dist.items() if freq <= 3]
for lw in low_freq_words:
    all_freq_dist.pop(lw)
print('Remaining number of words:', len(all_freq_dist))

Remaining number of words: 59536


In [9]:
# Fit the vectorizer on the filtered frequency table so that every remaining
# word becomes one feature.
vec = DictVectorizer()
all_bow = vec.fit_transform(all_freq_dist).toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when available and fall back to the old name on older releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
# str(...) keeps the printed sample a list of plain strings either way.
print('e.g.', [str(name) for name in feature_names[12000:12010]])

e.g. ['化已', '化建', '化强', '化成', '化整为零', '化新', '化是', '化机', '化机制', '化来']


In [10]:
def gen_bag_of_words(doc):
    """Vectorise one document with the fitted module-level `vec`.

    The document's word counts (gen_freq_dist(doc)['freq_dist']) are passed to
    vec.transform and the result is returned as a dense array.
    """
    word_counts = gen_freq_dist(doc)['freq_dist']
    sparse_row = vec.transform(word_counts)
    return sparse_row.toarray()

def all_bag_of_words(col_name_for_docs, limited=False, limit=1000):
    """Stack the bag-of-words vector of every document in the global `data` frame.

    Parameters
    ----------
    col_name_for_docs : label of the `data` column that holds the document text.
    limited : when True, stop after `limit` rows (intended for quick tests).
    limit : row cap applied when `limited` is True; defaults to 1000.

    Returns
    -------
    A dense array with one row per processed document, built from
    gen_bag_of_words, or an empty list when `data` has no rows.
    """
    rows = []
    for count, (index, data_point) in enumerate(data.iterrows(), start=1):
        # If the 文档号码 column does not exist, comment out this print or point
        # it at another column that identifies the row.
        print('[' + str(count - 1) + '] Transforming document ' + str(data_point['文档号码']) + '...')
        rows.append(gen_bag_of_words(data_point[col_name_for_docs]))

        if limited and count == limit:
            # For test use, only the first `limit` rows are transformed
            break

    if not rows:
        return []
    # Stack once at the end: calling np.vstack inside the loop re-copies the
    # whole accumulated matrix for every document.
    return np.vstack(rows)

# Return the coefficients of the linear regression model after fitting X and y.
def lr_coeffs(X, y):
    """Fit LinearRegression on (X, y) and pair each coefficient with its feature name.

    Feature names come from the module-level `vec`, so the columns of X are
    expected to be in the vectorizer's feature order.

    Returns a DataFrame with columns 'Feature' and 'Coefficients', sorted by
    coefficient in descending order.
    """
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older releases.
    if hasattr(vec, 'get_feature_names_out'):
        raw_names = vec.get_feature_names_out()
    else:
        raw_names = vec.get_feature_names()
    features = [str(name) for name in raw_names]
    reg = LinearRegression().fit(X, y)
    result = pd.DataFrame(
        {'Feature': features, 'Coefficients': list(reg.coef_)},
        columns=['Feature', 'Coefficients'],
    )
    return result.sort_values(by=['Coefficients'], ascending=False)

In [None]:
# Simply for testing
# Build a synthetic target y = X . w + 3 with known weights w = 1, 2, 1, 2, ...
# and fit it, so the recovered coefficients can be compared against w.
test_X = all_bag_of_words('投资者关系活动主要内容介绍', limited=True)
# Take the sizes from the matrix itself instead of hard-coding the vocabulary
# width (59536) and the row cap (1000), which only hold for one particular run.
n_docs, n_features = test_X.shape
known_weights = np.array(([1, 2] * n_features)[:n_features])
test_y = np.dot(test_X, known_weights) + 3
dummy = pd.DataFrame()
dummy['Y'] = list(test_y)
dummy['文档号码'] = list(data['文档号码'].iloc[:n_docs])
dummy.to_excel('dummy_Y.xlsx')
test_result = lr_coeffs(test_X, test_y)

In [None]:
# Build the bag-of-words matrix for the documents in `data`, without the test-only row cap.
X = all_bag_of_words('投资者关系活动主要内容介绍', limited=False)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lawhy\AppData\Local\Temp\jieba.cache


[0] Transforming document 1200573239...


Loading model cost 0.700 seconds.
Prefix dict has been built successfully.


[1] Transforming document 1200573241...
[2] Transforming document 1200573242...
[3] Transforming document 1200573245...
[4] Transforming document 1200573285...
[5] Transforming document 1200573318...
[6] Transforming document 1200573672...
[7] Transforming document 1200573678...
[8] Transforming document 1200573726...
[9] Transforming document 1200573767...
[10] Transforming document 1200573841...
[11] Transforming document 1200573903...
[12] Transforming document 1200574029...
[13] Transforming document 1200574051...
[14] Transforming document 1200574620...
[15] Transforming document 1200574888...
[16] Transforming document 1200576676...
[17] Transforming document 1200579053...
[18] Transforming document 1200579060...
[19] Transforming document 1200579151...
[20] Transforming document 1200579217...
[21] Transforming document 1200579240...
[22] Transforming document 1200579318...
[23] Transforming document 1200579405...
[24] Transforming document 1200579480...
[25] Transforming documen

[199] Transforming document 1200612342...
[200] Transforming document 1200612386...
[201] Transforming document 1200612450...
[202] Transforming document 1200612620...
[203] Transforming document 1200612633...
[204] Transforming document 1200612973...
[205] Transforming document 1200612974...
[206] Transforming document 1200613037...
[207] Transforming document 1200613101...
[208] Transforming document 1200614421...
[209] Transforming document 1200615299...
[210] Transforming document 1200615329...
[211] Transforming document 1200615400...
[212] Transforming document 1200615421...
[213] Transforming document 1200615431...
[214] Transforming document 1200615443...
[215] Transforming document 1200615494...
[216] Transforming document 1200615513...
[217] Transforming document 1200615515...
[218] Transforming document 1200615643...
[219] Transforming document 1200615649...
[220] Transforming document 1200615666...
[221] Transforming document 1200615668...
[222] Transforming document 120061

[396] Transforming document 1200676294...
[397] Transforming document 1200676309...
[398] Transforming document 1200677078...
[399] Transforming document 1200679252...
[400] Transforming document 1200681727...
[401] Transforming document 1200681835...
[402] Transforming document 1200681874...
[403] Transforming document 1200681888...
[404] Transforming document 1200681956...
[405] Transforming document 1200682019...
[406] Transforming document 1200682140...
[407] Transforming document 1200682168...
[408] Transforming document 1200682181...
[409] Transforming document 1200682205...
[410] Transforming document 1200682222...
[411] Transforming document 1200682224...
[412] Transforming document 1200682243...
[413] Transforming document 1200682564...
[414] Transforming document 1200682565...
[415] Transforming document 1200682613...
[416] Transforming document 1200682884...
[417] Transforming document 1200682886...
[418] Transforming document 1200682901...
[419] Transforming document 120068

[593] Transforming document 1200721892...
[594] Transforming document 1200721945...
[595] Transforming document 1200722047...
[596] Transforming document 1200722253...
[597] Transforming document 1200722258...
[598] Transforming document 1200722295...
[599] Transforming document 1200722326...
[600] Transforming document 1200722328...
[601] Transforming document 1200722330...
[602] Transforming document 1200722334...
[603] Transforming document 1200722358...
[604] Transforming document 1200722359...
[605] Transforming document 1200722364...
[606] Transforming document 1200722547...
[607] Transforming document 1200722793...
[608] Transforming document 1200723059...
[609] Transforming document 1200723183...
[610] Transforming document 1200723390...
[611] Transforming document 1200723546...
[612] Transforming document 1200724107...
[613] Transforming document 1200727256...
[614] Transforming document 1200729247...
[615] Transforming document 1200729963...
[616] Transforming document 120073

[790] Transforming document 1200799336...
[791] Transforming document 1200799472...
[792] Transforming document 1200799596...
[793] Transforming document 1200799937...
[794] Transforming document 1200799983...
[795] Transforming document 1200800030...
[796] Transforming document 1200800187...
[797] Transforming document 1200800593...
[798] Transforming document 1200801123...
[799] Transforming document 1200802836...
[800] Transforming document 1200802837...
[801] Transforming document 1200802863...
[802] Transforming document 1200802942...
[803] Transforming document 1200802954...
[804] Transforming document 1200803087...
[805] Transforming document 1200803126...
[806] Transforming document 1200803145...
[807] Transforming document 1200803155...
[808] Transforming document 1200803304...
[809] Transforming document 1200803339...
[810] Transforming document 1200803379...
[811] Transforming document 1200803473...
[812] Transforming document 1200803596...
[813] Transforming document 120080

[987] Transforming document 1200931342...
[988] Transforming document 1200931601...
[989] Transforming document 1200931650...
[990] Transforming document 1200932288...
[991] Transforming document 1200932289...
[992] Transforming document 1200932644...
[993] Transforming document 1200932646...
[994] Transforming document 1200933417...
[995] Transforming document 1200933610...
[996] Transforming document 1200933739...
[997] Transforming document 1200933740...
[998] Transforming document 1200943520...
[999] Transforming document 1200943521...
[1000] Transforming document 1200943522...
[1001] Transforming document 1200943523...
[1002] Transforming document 1200943526...
[1003] Transforming document 1200943528...
[1004] Transforming document 1200943530...
[1005] Transforming document 1200943533...
[1006] Transforming document 1200943534...
[1007] Transforming document 1200945552...
[1008] Transforming document 1200945553...
[1009] Transforming document 1200945554...
[1010] Transforming docu

[1178] Transforming document 1200983022...
[1179] Transforming document 1200983538...
[1180] Transforming document 1200983550...
[1181] Transforming document 1200983585...
[1182] Transforming document 1200983612...
[1183] Transforming document 1200983618...
[1184] Transforming document 1200984296...
[1185] Transforming document 1200984299...
[1186] Transforming document 1200984375...
[1187] Transforming document 1200984411...
[1188] Transforming document 1200984445...
[1189] Transforming document 1200984475...
[1190] Transforming document 1200984494...
[1191] Transforming document 1200984529...
[1192] Transforming document 1200984551...
[1193] Transforming document 1200984561...
[1194] Transforming document 1200984601...
[1195] Transforming document 1200984639...
[1196] Transforming document 1200984655...
[1197] Transforming document 1200984663...
[1198] Transforming document 1200984667...
[1199] Transforming document 1200984668...
[1200] Transforming document 1200984673...
[1201] Tran

[1370] Transforming document 1201019579...
[1371] Transforming document 1201019580...
[1372] Transforming document 1201019581...
[1373] Transforming document 1201019589...
[1374] Transforming document 1201019641...
[1375] Transforming document 1201019648...
[1376] Transforming document 1201019650...
[1377] Transforming document 1201019655...
[1378] Transforming document 1201019658...
[1379] Transforming document 1201019661...
[1380] Transforming document 1201019672...
[1381] Transforming document 1201019696...
[1382] Transforming document 1201019707...
[1383] Transforming document 1201019713...
[1384] Transforming document 1201019714...
[1385] Transforming document 1201019717...
[1386] Transforming document 1201019720...
[1387] Transforming document 1201019722...
[1388] Transforming document 1201019723...
[1389] Transforming document 1201019733...
[1390] Transforming document 1201019736...
[1391] Transforming document 1201019744...
[1392] Transforming document 1201019756...
[1393] Tran

[1561] Transforming document 1201048968...
[1562] Transforming document 1201049023...
[1563] Transforming document 1201049369...
[1564] Transforming document 1201049407...
[1565] Transforming document 1201049417...
[1566] Transforming document 1201049418...
[1567] Transforming document 1201049663...
[1568] Transforming document 1201049693...
[1569] Transforming document 1201049758...
[1570] Transforming document 1201049759...
[1571] Transforming document 1201049771...
[1572] Transforming document 1201049848...
[1573] Transforming document 1201049947...
[1574] Transforming document 1201050084...
[1575] Transforming document 1201050086...
[1576] Transforming document 1201050105...
[1577] Transforming document 1201050114...
[1578] Transforming document 1201050174...
[1579] Transforming document 1201056359...
[1580] Transforming document 1201058868...
[1581] Transforming document 1201058871...
[1582] Transforming document 1201058872...
[1583] Transforming document 1201058948...
[1584] Tran

[1752] Transforming document 1201094017...
[1753] Transforming document 1201094240...
[1754] Transforming document 1201094357...
[1755] Transforming document 1201094901...
[1756] Transforming document 1201095059...
[1757] Transforming document 1201095142...
[1758] Transforming document 1201095170...
[1759] Transforming document 1201095280...
[1760] Transforming document 1201095538...
[1761] Transforming document 1201095801...
[1762] Transforming document 1201096685...
[1763] Transforming document 1201098473...
[1764] Transforming document 1201098654...
[1765] Transforming document 1201098657...
[1766] Transforming document 1201098795...
[1767] Transforming document 1201099104...
[1768] Transforming document 1201099235...
[1769] Transforming document 1201099267...
[1770] Transforming document 1201099330...
[1771] Transforming document 1201099340...
[1772] Transforming document 1201099442...
[1773] Transforming document 1201099774...
[1774] Transforming document 1201099804...
[1775] Tran

[1943] Transforming document 1201138408...
[1944] Transforming document 1201138451...
[1945] Transforming document 1201138454...
[1946] Transforming document 1201138457...
[1947] Transforming document 1201138470...
[1948] Transforming document 1201138493...
[1949] Transforming document 1201138693...
[1950] Transforming document 1201138725...
[1951] Transforming document 1201138726...
[1952] Transforming document 1201138746...
[1953] Transforming document 1201138843...
[1954] Transforming document 1201138935...
[1955] Transforming document 1201138940...
[1956] Transforming document 1201139014...
[1957] Transforming document 1201139025...
[1958] Transforming document 1201139034...
[1959] Transforming document 1201139103...
[1960] Transforming document 1201139196...
[1961] Transforming document 1201139197...
[1962] Transforming document 1201139270...
[1963] Transforming document 1201139273...
[1964] Transforming document 1201139531...
[1965] Transforming document 1201139558...
[1966] Tran

[2134] Transforming document 1201185858...
[2135] Transforming document 1201185878...
[2136] Transforming document 1201185879...
[2137] Transforming document 1201186188...
[2138] Transforming document 1201186318...
[2139] Transforming document 1201186765...
[2140] Transforming document 1201186810...
[2141] Transforming document 1201187087...
[2142] Transforming document 1201187092...
[2143] Transforming document 1201187692...
[2144] Transforming document 1201189902...
[2145] Transforming document 1201189903...
[2146] Transforming document 1201189904...
[2147] Transforming document 1201189907...
[2148] Transforming document 1201189908...
[2149] Transforming document 1201189917...
[2150] Transforming document 1201189943...
[2151] Transforming document 1201189948...
[2152] Transforming document 1201189973...
[2153] Transforming document 1201190034...
[2154] Transforming document 1201190184...
[2155] Transforming document 1201190460...
[2156] Transforming document 1201190500...
[2157] Tran

[2325] Transforming document 1201251614...
[2326] Transforming document 1201251697...
[2327] Transforming document 1201252112...
[2328] Transforming document 1201252986...
[2329] Transforming document 1201253231...
[2330] Transforming document 1201253808...
[2331] Transforming document 1201254082...
[2332] Transforming document 1201254434...
[2333] Transforming document 1201258362...
[2334] Transforming document 1201258489...
[2335] Transforming document 1201258567...
[2336] Transforming document 1201258692...
[2337] Transforming document 1201258744...
[2338] Transforming document 1201258921...
[2339] Transforming document 1201259149...
[2340] Transforming document 1201259152...
[2341] Transforming document 1201259155...
[2342] Transforming document 1201259728...
[2343] Transforming document 1201260082...
[2344] Transforming document 1201260220...
[2345] Transforming document 1201260697...
[2346] Transforming document 1201260794...
[2347] Transforming document 1201261372...
[2348] Tran

[2516] Transforming document 1201331989...
[2517] Transforming document 1201331990...
[2518] Transforming document 1201332000...
[2519] Transforming document 1201332947...
[2520] Transforming document 1201332985...
[2521] Transforming document 1201333176...
[2522] Transforming document 1201333190...
[2523] Transforming document 1201333304...
[2524] Transforming document 1201333339...
[2525] Transforming document 1201333362...
[2526] Transforming document 1201333376...
[2527] Transforming document 1201333382...
[2528] Transforming document 1201333664...
[2529] Transforming document 1201333728...
[2530] Transforming document 1201334011...
[2531] Transforming document 1201334012...
[2532] Transforming document 1201334410...
[2533] Transforming document 1201334411...
[2534] Transforming document 1201334430...
[2535] Transforming document 1201334636...
[2536] Transforming document 1201335184...
[2537] Transforming document 1201336053...
[2538] Transforming document 1201336126...
[2539] Tran

[2707] Transforming document 1201428560...
[2708] Transforming document 1201428612...
[2709] Transforming document 1201428873...
[2710] Transforming document 1201428972...
[2711] Transforming document 1201429011...
[2712] Transforming document 1201431736...
[2713] Transforming document 1201431738...
[2714] Transforming document 1201432921...
[2715] Transforming document 1201433305...
[2716] Transforming document 1201433362...
[2717] Transforming document 1201433419...
[2718] Transforming document 1201433420...
[2719] Transforming document 1201433421...
[2720] Transforming document 1201433819...
[2721] Transforming document 1201433823...
[2722] Transforming document 1201433923...
[2723] Transforming document 1201434091...
[2724] Transforming document 1201434243...
[2725] Transforming document 1201434250...
[2726] Transforming document 1201434529...
[2727] Transforming document 1201436278...
[2728] Transforming document 1201436406...
[2729] Transforming document 1201436413...
[2730] Tran

[2898] Transforming document 1201514506...
[2899] Transforming document 1201514607...
[2900] Transforming document 1201516140...
[2901] Transforming document 1201516164...
[2902] Transforming document 1201516178...
[2903] Transforming document 1201516240...
[2904] Transforming document 1201516252...
[2905] Transforming document 1201516301...
[2906] Transforming document 1201516333...
[2907] Transforming document 1201516388...
[2908] Transforming document 1201516389...
[2909] Transforming document 1201516407...
[2910] Transforming document 1201516408...
[2911] Transforming document 1201516409...
[2912] Transforming document 1201517903...
[2913] Transforming document 1201517905...
[2914] Transforming document 1201517906...
[2915] Transforming document 1201517907...
[2916] Transforming document 1201517908...
[2917] Transforming document 1201517909...
[2918] Transforming document 1201517914...
[2919] Transforming document 1201517917...
[2920] Transforming document 1201517918...
[2921] Tran

[3089] Transforming document 1201577639...
[3090] Transforming document 1201577678...
[3091] Transforming document 1201577679...
[3092] Transforming document 1201577973...
[3093] Transforming document 1201578657...
[3094] Transforming document 1201580996...
[3095] Transforming document 1201581009...
[3096] Transforming document 1201581034...
[3097] Transforming document 1201581036...
[3098] Transforming document 1201581037...
[3099] Transforming document 1201581049...
[3100] Transforming document 1201581287...
[3101] Transforming document 1201581290...
[3102] Transforming document 1201581321...
[3103] Transforming document 1201581357...
[3104] Transforming document 1201581463...
[3105] Transforming document 1201581476...
[3106] Transforming document 1201581477...
[3107] Transforming document 1201581523...
[3108] Transforming document 1201581534...
[3109] Transforming document 1201581576...
[3110] Transforming document 1201581833...
[3111] Transforming document 1201581913...
[3112] Tran

[3280] Transforming document 1201613683...
[3281] Transforming document 1201613770...
[3282] Transforming document 1201613786...
[3283] Transforming document 1201614511...
[3284] Transforming document 1201618688...
[3285] Transforming document 1201618713...
[3286] Transforming document 1201618905...
[3287] Transforming document 1201618943...
[3288] Transforming document 1201619032...
[3289] Transforming document 1201619134...
[3290] Transforming document 1201619333...
[3291] Transforming document 1201619361...
[3292] Transforming document 1201619421...
[3293] Transforming document 1201619441...
[3294] Transforming document 1201619444...
[3295] Transforming document 1201619459...
[3296] Transforming document 1201619464...
[3297] Transforming document 1201619595...
[3298] Transforming document 1201619596...
[3299] Transforming document 1201619682...
[3300] Transforming document 1201619688...
[3301] Transforming document 1201619698...
[3302] Transforming document 1201620096...
[3303] Tran

[3471] Transforming document 1201676928...
[3472] Transforming document 1201682617...
[3473] Transforming document 1201684252...
[3474] Transforming document 1201688083...
[3475] Transforming document 1201688252...
[3476] Transforming document 1201688605...
[3477] Transforming document 1201689114...
[3478] Transforming document 1201690261...
[3479] Transforming document 1201693469...
[3480] Transforming document 1201693789...
[3481] Transforming document 1201694128...
[3482] Transforming document 1201694372...
[3483] Transforming document 1201694497...
[3484] Transforming document 1201694591...
[3485] Transforming document 1201695173...
[3486] Transforming document 1201695376...
[3487] Transforming document 1201695606...
[3488] Transforming document 1201697895...
[3489] Transforming document 1201697896...
[3490] Transforming document 1201698317...
[3491] Transforming document 1201698443...
[3492] Transforming document 1201698462...
[3493] Transforming document 1201698465...
[3494] Tran

[3662] Transforming document 1201742878...
[3663] Transforming document 1201742879...
[3664] Transforming document 1201742882...
[3665] Transforming document 1201742884...
[3666] Transforming document 1201742956...
[3667] Transforming document 1201743300...
[3668] Transforming document 1201743494...
[3669] Transforming document 1201743648...
[3670] Transforming document 1201743684...
[3671] Transforming document 1201743770...
[3672] Transforming document 1201743809...
[3673] Transforming document 1201743848...
[3674] Transforming document 1201744189...
[3675] Transforming document 1201744233...
[3676] Transforming document 1201744260...
[3677] Transforming document 1201744261...
[3678] Transforming document 1201744415...
[3679] Transforming document 1201744449...
[3680] Transforming document 1201744487...
[3681] Transforming document 1201744694...
[3682] Transforming document 1201744726...
[3683] Transforming document 1201744878...
[3684] Transforming document 1201745251...
[3685] Tran

[3853] Transforming document 1201756398...
[3854] Transforming document 1201756402...
[3855] Transforming document 1201756410...
[3856] Transforming document 1201756439...
[3857] Transforming document 1201756450...
[3858] Transforming document 1201756451...
[3859] Transforming document 1201756452...
[3860] Transforming document 1201756453...
[3861] Transforming document 1201756454...
[3862] Transforming document 1201756579...
[3863] Transforming document 1201756617...
[3864] Transforming document 1201756806...
[3865] Transforming document 1201756807...
[3866] Transforming document 1201756810...
[3867] Transforming document 1201756823...
[3868] Transforming document 1201756833...
[3869] Transforming document 1201756948...
[3870] Transforming document 1201756969...
[3871] Transforming document 1201757020...
[3872] Transforming document 1201757074...
[3873] Transforming document 1201757093...
[3874] Transforming document 1201757743...
[3875] Transforming document 1201757836...
[3876] Tran

[4044] Transforming document 1201766946...
[4045] Transforming document 1201767231...
[4046] Transforming document 1201767862...
[4047] Transforming document 1201767873...
[4048] Transforming document 1201767875...
[4049] Transforming document 1201767890...
[4050] Transforming document 1201767892...
[4051] Transforming document 1201767894...
[4052] Transforming document 1201767900...
[4053] Transforming document 1201767927...
[4054] Transforming document 1201767928...
[4055] Transforming document 1201767930...
[4056] Transforming document 1201767941...
[4057] Transforming document 1201767947...
[4058] Transforming document 1201770699...
[4059] Transforming document 1201770700...
[4060] Transforming document 1201770702...
[4061] Transforming document 1201770717...
[4062] Transforming document 1201770718...
[4063] Transforming document 1201770721...
[4064] Transforming document 1201770723...
[4065] Transforming document 1201770796...
[4066] Transforming document 1201770797...
[4067] Tran

[4235] Transforming document 1201782225...
[4236] Transforming document 1201782226...
[4237] Transforming document 1201782227...
[4238] Transforming document 1201782280...
[4239] Transforming document 1201782282...
[4240] Transforming document 1201782287...
[4241] Transforming document 1201782298...
[4242] Transforming document 1201782303...
[4243] Transforming document 1201782339...
[4244] Transforming document 1201782340...
[4245] Transforming document 1201782342...
[4246] Transforming document 1201782344...
[4247] Transforming document 1201782345...
[4248] Transforming document 1201782346...
[4249] Transforming document 1201782347...
[4250] Transforming document 1201782356...
[4251] Transforming document 1201782384...
[4252] Transforming document 1201782460...
[4253] Transforming document 1201782518...
[4254] Transforming document 1201782519...
[4255] Transforming document 1201782532...
[4256] Transforming document 1201782590...
[4257] Transforming document 1201782648...
[4258] Tran

[4426] Transforming document 1201795295...
[4427] Transforming document 1201795403...
[4428] Transforming document 1201795448...
[4429] Transforming document 1201795534...
[4430] Transforming document 1201795556...
[4431] Transforming document 1201795602...
[4432] Transforming document 1201795617...
[4433] Transforming document 1201795636...
[4434] Transforming document 1201795693...
[4435] Transforming document 1201795817...
[4436] Transforming document 1201795855...
[4437] Transforming document 1201795862...
[4438] Transforming document 1201795907...
[4439] Transforming document 1201795948...
[4440] Transforming document 1201795969...
[4441] Transforming document 1201795995...
[4442] Transforming document 1201796107...
[4443] Transforming document 1201796108...
[4444] Transforming document 1201796182...
[4445] Transforming document 1201796693...
[4446] Transforming document 1201798235...
[4447] Transforming document 1201798236...
[4448] Transforming document 1201798237...
[4449] Tran

[4617] Transforming document 1201826715...
[4618] Transforming document 1201826880...
[4619] Transforming document 1201826893...
[4620] Transforming document 1201827157...
[4621] Transforming document 1201827193...
[4622] Transforming document 1201827194...
[4623] Transforming document 1201827218...
[4624] Transforming document 1201827252...
[4625] Transforming document 1201827525...
[4626] Transforming document 1201827814...
[4627] Transforming document 1201829713...
[4628] Transforming document 1201829718...
[4629] Transforming document 1201829723...
[4630] Transforming document 1201829724...
[4631] Transforming document 1201829725...
[4632] Transforming document 1201829908...
[4633] Transforming document 1201829909...
[4634] Transforming document 1201829910...
[4635] Transforming document 1201829925...
[4636] Transforming document 1201829927...
[4637] Transforming document 1201830004...
[4638] Transforming document 1201830005...
[4639] Transforming document 1201830006...
[4640] Tran

[4808] Transforming document 1201853527...
[4809] Transforming document 1201853528...
[4810] Transforming document 1201853568...
[4811] Transforming document 1201853592...
[4812] Transforming document 1201853624...
[4813] Transforming document 1201853634...
[4814] Transforming document 1201853636...
[4815] Transforming document 1201853637...
[4816] Transforming document 1201853640...
[4817] Transforming document 1201853704...
[4818] Transforming document 1201853773...
[4819] Transforming document 1201853870...
[4820] Transforming document 1201853924...
[4821] Transforming document 1201853996...
[4822] Transforming document 1201854019...
[4823] Transforming document 1201854020...
[4824] Transforming document 1201854048...
[4825] Transforming document 1201854053...
[4826] Transforming document 1201854055...
[4827] Transforming document 1201854066...
[4828] Transforming document 1201854073...
[4829] Transforming document 1201854135...
[4830] Transforming document 1201854167...
[4831] Tran

[4999] Transforming document 1201885425...
[5000] Transforming document 1201885750...
[5001] Transforming document 1201885818...
[5002] Transforming document 1201886233...
[5003] Transforming document 1201886513...
[5004] Transforming document 1201886845...
[5005] Transforming document 1201887026...
[5006] Transforming document 1201887099...
[5007] Transforming document 1201887343...
[5008] Transforming document 1201889123...
[5009] Transforming document 1201889151...
[5010] Transforming document 1201889269...
[5011] Transforming document 1201889351...
[5012] Transforming document 1201889459...
[5013] Transforming document 1201889474...
[5014] Transforming document 1201889495...
[5015] Transforming document 1201889708...
[5016] Transforming document 1201889887...
[5017] Transforming document 1201889973...
[5018] Transforming document 1201890223...
[5019] Transforming document 1201890548...
[5020] Transforming document 1201890550...
[5021] Transforming document 1201891468...
[5022] Tran

[5190] Transforming document 1201916461...
[5191] Transforming document 1201916463...
[5192] Transforming document 1201916465...
[5193] Transforming document 1201916493...
[5194] Transforming document 1201916620...
[5195] Transforming document 1201916648...
[5196] Transforming document 1201916687...
[5197] Transforming document 1201916704...
[5198] Transforming document 1201916784...
[5199] Transforming document 1201916833...
[5200] Transforming document 1201916837...
[5201] Transforming document 1201917037...
[5202] Transforming document 1201917239...
[5203] Transforming document 1201917240...
[5204] Transforming document 1201917717...
[5205] Transforming document 1201917793...
[5206] Transforming document 1201917836...
[5207] Transforming document 1201917840...
[5208] Transforming document 1201917844...
[5209] Transforming document 1201917848...
[5210] Transforming document 1201917856...
[5211] Transforming document 1201917857...
[5212] Transforming document 1201917858...
[5213] Tran

[5381] Transforming document 1201934359...
[5382] Transforming document 1201934360...
[5383] Transforming document 1201934363...
[5384] Transforming document 1201934364...
[5385] Transforming document 1201934365...
[5386] Transforming document 1201934629...
[5387] Transforming document 1201934641...
[5388] Transforming document 1201934642...
[5389] Transforming document 1201934644...
[5390] Transforming document 1201934724...
[5391] Transforming document 1201934725...
[5392] Transforming document 1201934746...
[5393] Transforming document 1201934774...
[5394] Transforming document 1201934775...
[5395] Transforming document 1201934778...
[5396] Transforming document 1201934910...
[5397] Transforming document 1201934919...
[5398] Transforming document 1201934941...
[5399] Transforming document 1201934982...
[5400] Transforming document 1201935017...
[5401] Transforming document 1201935109...
[5402] Transforming document 1201935110...
[5403] Transforming document 1201935111...
[5404] Tran

[5572] Transforming document 1201971566...
[5573] Transforming document 1201971609...
[5574] Transforming document 1201971929...
[5575] Transforming document 1201972000...
[5576] Transforming document 1201972106...
[5577] Transforming document 1201972137...
[5578] Transforming document 1201972619...
[5579] Transforming document 1201974149...
[5580] Transforming document 1201977974...
[5581] Transforming document 1201979340...
[5582] Transforming document 1201979341...
[5583] Transforming document 1201980186...
[5584] Transforming document 1201980188...
[5585] Transforming document 1201980353...
[5586] Transforming document 1201980721...
[5587] Transforming document 1201981839...
[5588] Transforming document 1201982733...
[5589] Transforming document 1201982930...
[5590] Transforming document 1201983671...
[5591] Transforming document 1201983676...
[5592] Transforming document 1201983779...
[5593] Transforming document 1201984117...
[5594] Transforming document 1201984241...
[5595] Tran

[5763] Transforming document 1202013792...
[5764] Transforming document 1202013802...
[5765] Transforming document 1202013901...
[5766] Transforming document 1202016497...
[5767] Transforming document 1202016498...
[5768] Transforming document 1202016502...
[5769] Transforming document 1202016900...
[5770] Transforming document 1202017456...
[5771] Transforming document 1202017477...
[5772] Transforming document 1202017666...
[5773] Transforming document 1202017753...
[5774] Transforming document 1202017754...
[5775] Transforming document 1202017838...
[5776] Transforming document 1202017965...
[5777] Transforming document 1202018003...
[5778] Transforming document 1202018096...
[5779] Transforming document 1202018230...
[5780] Transforming document 1202020121...
[5781] Transforming document 1202020195...
[5782] Transforming document 1202020204...
[5783] Transforming document 1202020225...
[5784] Transforming document 1202020256...
[5785] Transforming document 1202020258...
[5786] Tran

[5954] Transforming document 1202043535...
[5955] Transforming document 1202043537...
[5956] Transforming document 1202043538...
[5957] Transforming document 1202043803...
[5958] Transforming document 1202044185...
[5959] Transforming document 1202044186...
[5960] Transforming document 1202045653...
[5961] Transforming document 1202045660...
[5962] Transforming document 1202045700...
[5963] Transforming document 1202046207...
[5964] Transforming document 1202046359...
[5965] Transforming document 1202046481...
[5966] Transforming document 1202046696...
[5967] Transforming document 1202046723...
[5968] Transforming document 1202046803...
[5969] Transforming document 1202048699...
[5970] Transforming document 1202049157...
[5971] Transforming document 1202049242...
[5972] Transforming document 1202049272...
[5973] Transforming document 1202049280...
[5974] Transforming document 1202049315...
[5975] Transforming document 1202051074...
[5976] Transforming document 1202051471...
[5977] Tran

[6145] Transforming document 1202111843...
[6146] Transforming document 1202117380...
[6147] Transforming document 1202117507...
[6148] Transforming document 1202117748...
[6149] Transforming document 1202117751...
[6150] Transforming document 1202117823...
[6151] Transforming document 1202119944...
[6152] Transforming document 1202121845...
[6153] Transforming document 1202121846...
[6154] Transforming document 1202121849...
[6155] Transforming document 1202121859...
[6156] Transforming document 1202121983...
[6157] Transforming document 1202122318...
[6158] Transforming document 1202122353...
[6159] Transforming document 1202122363...
[6160] Transforming document 1202122722...
[6161] Transforming document 1202122723...
[6162] Transforming document 1202131195...
[6163] Transforming document 1202131196...
[6164] Transforming document 1202131197...
[6165] Transforming document 1202131199...
[6166] Transforming document 1202131201...
[6167] Transforming document 1202131202...
[6168] Tran

### Finally...

这里我们需要读取真正的Y值，格式为excel文件，且仅有两列，一列是ID（比如文档号码），一列是Y值。

**注意！** Y值文件中ID的排列顺序必须与训练数据中的文档ID顺序完全一致！一个简单的办法是：先把Y值按ID合并到原始数据中，再从中分割出训练数据和Y值。

In [None]:
def load_Y(file_path, col_name_ID, col_name_Y):
    """Load the target (Y) values from an Excel file after checking document IDs.

    The file is read with ``pd.read_excel``. Its ID column is compared,
    element by element and in order, against the same-named column of the
    notebook-level variable ``data``.

    Parameters
    ----------
    file_path : path of the Excel file holding the Y values.
    col_name_ID : name of the ID column; it must exist both in the file and
        in ``data``.
    col_name_Y : name of the column holding the Y values.

    Returns
    -------
    numpy.ndarray of the Y column when the two ID sequences are identical;
    otherwise ``None`` (a warning is printed and nothing is raised).
    """
    y_frame = pd.read_excel(file_path)
    ids_in_file = list(y_frame[col_name_ID])
    ids_in_training = list(data[col_name_ID])

    # Guard clause: any difference in value, order or length is a mismatch.
    if ids_in_file != ids_in_training:
        print('警告！训练数据与Y值的文档ID不匹配，请检查！')
        return None

    print('训练数据与Y值的文档ID成功匹配！')
    return np.array(list(y_frame[col_name_Y]))

In [None]:
# User-supplied settings: replace the three placeholder strings before running.
y_path = '请替换成储存Y值文件的路径'  # path of the Excel file with the Y values, e.g. dummy_Y.xlsx
doc_ID_name = '请替换文档ID的名称'  # name of the document-ID column, e.g. 文档号码
y_name = '请替换Y值的名称'  # name of the column holding the Y values

# Pass the variables exactly as defined above: Python names are case-sensitive,
# so the previous `Y_path` / `Y_name` raised NameError.
# `y` is None when load_Y reports an ID mismatch.
y = load_Y(y_path, doc_ID_name, y_name)

In [None]:
# Run lr_coeffs (defined elsewhere in this notebook) on the feature matrix X
# and the target values y loaded above.
# NOTE(review): presumably returns a pandas DataFrame, since `.to_excel` is
# called on it — confirm against lr_coeffs.
result = lr_coeffs(X, y)
# Save the result to 'word_ranking.xlsx' (a relative path).
result.to_excel('word_ranking.xlsx')