## Linear Regression Model

(Based on bag_of_words representation)

In [237]:
import re
import jieba
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction import DictVectorizer
from datetime import datetime
from sklearn.linear_model import Ridge

### Stop words list

In [2]:
# global variables initialisation
with open('../stop_words/中文停用词表.txt', 'r', encoding='UTF-8-sig') as f:
    stop_words = [ word.strip().replace('\n', '') for word in f.readlines()]
symbols = stop_words[0:26]
print('e.g.', stop_words[23:33])

e.g. ['？', '.', '%', '一', '一些', '一何', '一切', '一则', '一方面', '一旦']


### Data

In [None]:
# Single input
data_path = '请输入数据路径'  # e.g. data/record1.xls
all_data = pd.read_excel(data_path)
all_data.info()

In [120]:
# Multiple inputs
# 如果能把所有的record数据都集成一个excel文件，就可以用上面的代码
all_data = pd.DataFrame()
for i in range(17):
    i += 1
    path = "../data/record" + str(i) + ".xls"
    all_data = all_data.append(pd.read_excel(path), sort=False)
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58611 entries, 0 to 693
Data columns (total 13 columns):
文档号码                                58611 non-null int64
证券代码（请务必使用text格式 以保留代码中的0）          30412 non-null object
证券简称                                41984 non-null object
投资者关系活动类别                           41287 non-null object
参与机构数量（家）（0代表0个单位，空白代表无相关信息）        41992 non-null object
参与机构（请用分号隔开，并统一使用证券公司名称（申银万国 等））    30187 non-null object
参与投资者数量                             41987 non-null object
日期（格式统一为xx/xx/xxxx（日月年））            41992 non-null object
接待人员数量                              41992 non-null object
投资者关系活动主要内容介绍                       41986 non-null object
Unnamed: 10                         16619 non-null object
证券代码（请务必使用text格式以保留代码中的0）           11579 non-null object
参与机构（请用分号隔开；并统一使用证券公司名称（申银万国等））     11580 non-null object
dtypes: int64(1), object(12)
memory usage: 6.3+ MB


In [121]:
# 取出想要的数据并且去除空行
wanted_columns = ['文档号码','投资者关系活动主要内容介绍', '证券代码（请务必使用text格式 以保留代码中的0）', '日期（格式统一为xx/xx/xxxx（日月年））']
data = all_data[wanted_columns]
print('Before cleaning:')
print(data.shape)
data = data.dropna() # drop rows with null values
print('After cleaning:')
print(data.shape)

Before cleaning:
(58611, 4)
After cleaning:
(30406, 4)


### Merge Data by Cusip and Date

In [122]:
data = data.rename(columns={"文档号码": "ID", "投资者关系活动主要内容介绍": "content", 
                     "证券代码（请务必使用text格式 以保留代码中的0）": "cusip",
                     "日期（格式统一为xx/xx/xxxx（日月年））": "date"})
data = data.reset_index(drop=True)
ref_data = pd.read_excel('ref_data.xlsx')

有部分的date格式不对，进行提前清理

In [123]:
for i, dp in data.iterrows():
    try:
        str(dp['date'].strftime("%Y%m"))
    except:
        data.loc[i] = None

In [124]:
data = data.dropna().reset_index(drop=True)
data['date'] = data['date'].apply(lambda x: x.strftime("%Y%m"))
data['date'] = data['date'].apply(lambda x: str(x))
ref_data['cdt'] = ref_data['cdt'].apply(lambda x: str(x))

选取2012第一季度到2018第一季度的data并转换日期格式

In [125]:
# year-month to year-season
def yyyymm_to_yyyyss(yyyymm):
    
    assert len(yyyymm) == 6
    yyyy = yyyymm[:4]
    mm = yyyymm[-2:]
    if mm == '01' or mm == '02' or mm == '03':
        return yyyy + '01'
    elif mm == '04' or mm == '05' or mm == '06':
        return yyyy + '02'
    elif mm == '07' or mm == '08' or mm == '09':
        return yyyy + '03'
    elif mm == '10' or mm == '11' or mm == '12':
        return yyyy + '04'
    else:
        print('wrong month: {}'.format(mm))

yyyymm_to_yyyyss('201513')    

wrong month: 13


In [126]:
data['date'] = data['date'].apply(lambda x: yyyymm_to_yyyyss(x))

In [127]:
def keep_date(date):
    yyyy = int(date[:4])
    if 2012 <= yyyy and yyyy <= 2018:
        return date
    else:
        return None
    
np.sort(data['date'].apply(lambda x: keep_date(x)).dropna().unique())

array(['201201', '201202', '201203', '201204', '201301', '201302',
       '201303', '201304', '201401', '201402', '201403', '201404',
       '201501', '201502', '201503', '201504', '201601', '201602',
       '201603', '201604', '201701', '201702', '201703', '201704',
       '201801'], dtype=object)

In [128]:
data['date'] = data['date'].apply(lambda x: keep_date(x))
data = data.dropna()

至此data里仅保留2012第一季度到2018第一季度(2018最高只到二月份）的数据

In [129]:
len(np.unique(ref_data['cusip']))

298

In [130]:
count = 0
cus = []
def check(cusip):
    global count
    if len(cusip) == 6:
        return cusip
    else:
        count += 1
        # 根据cus里的pattern对cusip进行改造
        if cusip == '00':
            return None
        elif '、' in cusip or '，' in cusip or '（' in cusip:
            cusip = cusip[:6]
            assert cusip[:3] == '000'
            return cusip
        elif cusip == '00085':
            return '000085'
        elif 'SZ000338' in cusip:
            return '000338'
        elif len(cusip) > 6 and not re.findall('[1-9]', cusip[6]):
            return cusip[:6]
        else:
            cus.append(cusip)
            return None

In [131]:
print("Before:", len(data['cusip'].unique()), data['cusip'].shape)
print("After:", len(data['cusip'].apply(lambda x: check(x)).dropna().unique()), data['cusip'].apply(lambda x: check(x)).dropna().shape)
print(set(cus))

Before: 1631 (30348,)
After: 1613 (30283,)
{'0002875', '中能电气'}


In [132]:
data['cusip'] = data['cusip'].apply(lambda x: check(x)).dropna()
data = data.dropna()

In [134]:
data = data.rename(columns={"date": "yyyyss"})

In [133]:
data['cusip'] = data['cusip'].apply(lambda x: str(x))

In [136]:
for c in data['cusip']:
    if not len(c) == 6:
        print(c)

In [143]:
data = data.rename(columns={"yyyyss": "cdt"})
data.to_excel('all_records_2012_2018.xlsx', index=False)

自此拿到一份date按yyss格式,cusip都为六位数的干净records,其年份为2012-2018

In [144]:
data = pd.read_excel('all_records_2012_2018.xlsx', dtype= {'ID': str, 'content': str, 'cusip': str, 'cdt': str})
data.head(5)

Unnamed: 0,ID,content,cusip,cdt
0,1200573239,一、问题回答。\n1、非公开发行的进展情况如何？\n答：目前中国证监会正在审核公司非公开发行...,2635,201501
1,1200573241,会议主要关注BE部分。\n\n问题一：BE最近有何变化？\n回答：公司目前业务进展顺利，业务...,2594,201501
2,1200573242,会议主要关注BYD部分。\n\n问题一：公司新能源汽车的目前的进展情况？\n回答：公司将新能...,2594,201501
3,1200573245,问：请您谈谈公司2015年的收入季节性是否还会存在？\n答：公司2013、2014年的第三季...,2439,201501
4,1200573285,1、公司目前的出口情况？\n答：近几年来，公司加强了海外出口的投入力度，组建了专门的国际贸易...,2690,201501


In [198]:
name_Y = 'dtd'  # specify which Y value to take out
ref_data = pd.read_excel('ref_data.xlsx', dtype={'cdt': str, 'cusip': str, name_Y: float})[['cdt', 'cusip', name_Y]]
ref_data['cusip'] = ref_data['cusip'].apply(lambda x: x[:6])
ref_data = ref_data.dropna()
ref_data = ref_data.reset_index(drop=True)
ref_data.head(5)

Unnamed: 0,cdt,cusip,dtd
0,200701,2,2.534177
1,200702,2,4.025581
2,200703,2,4.839769
3,200704,2,5.073994
4,200801,2,4.257932


In [190]:
def find_content(cdt, cusip):
    global data
    related = data[(data['cdt'] == cdt) & (data['cusip'] == cusip)]
    content = ''.join(list(related['content']))
    if content:
        return content
    else:
        return None

find_content('201504', '000338')[:100]

'公司与来访者在公司会议室就关注问题进行了交流。交流主要内容包括：\n1. 公司收购凯傲林德的战略意义\n此次合作是一项具有里程碑意义的交易，也是迄今中国企业对德国最大的一笔直接投资。通过本次合作，公司与凯'

In [191]:
contents = []
for i, dp in ref_data.iterrows():
    contents.append(find_content(dp['cdt'], dp['cusip']))
    print('[{}]'.format(i), 'finished.')

[0] finished.
[1] finished.
[2] finished.
[3] finished.
[4] finished.
[5] finished.
[6] finished.
[7] finished.
[8] finished.
[9] finished.
[10] finished.
[11] finished.
[12] finished.
[13] finished.
[14] finished.
[15] finished.
[16] finished.
[17] finished.
[18] finished.
[19] finished.
[20] finished.
[21] finished.
[22] finished.
[23] finished.
[24] finished.
[25] finished.
[26] finished.
[27] finished.
[28] finished.
[29] finished.
[30] finished.
[31] finished.
[32] finished.
[33] finished.
[34] finished.
[35] finished.
[36] finished.
[37] finished.
[38] finished.
[39] finished.
[40] finished.
[41] finished.
[42] finished.
[43] finished.
[44] finished.
[45] finished.
[46] finished.
[47] finished.
[48] finished.
[49] finished.
[50] finished.
[51] finished.
[52] finished.
[53] finished.
[54] finished.
[55] finished.
[56] finished.
[57] finished.
[58] finished.
[59] finished.
[60] finished.
[61] finished.
[62] finished.
[63] finished.
[64] finished.
[65] finished.
[66] finished.
[67] 

[531] finished.
[532] finished.
[533] finished.
[534] finished.
[535] finished.
[536] finished.
[537] finished.
[538] finished.
[539] finished.
[540] finished.
[541] finished.
[542] finished.
[543] finished.
[544] finished.
[545] finished.
[546] finished.
[547] finished.
[548] finished.
[549] finished.
[550] finished.
[551] finished.
[552] finished.
[553] finished.
[554] finished.
[555] finished.
[556] finished.
[557] finished.
[558] finished.
[559] finished.
[560] finished.
[561] finished.
[562] finished.
[563] finished.
[564] finished.
[565] finished.
[566] finished.
[567] finished.
[568] finished.
[569] finished.
[570] finished.
[571] finished.
[572] finished.
[573] finished.
[574] finished.
[575] finished.
[576] finished.
[577] finished.
[578] finished.
[579] finished.
[580] finished.
[581] finished.
[582] finished.
[583] finished.
[584] finished.
[585] finished.
[586] finished.
[587] finished.
[588] finished.
[589] finished.
[590] finished.
[591] finished.
[592] finished.
[593] fi

[1059] finished.
[1060] finished.
[1061] finished.
[1062] finished.
[1063] finished.
[1064] finished.
[1065] finished.
[1066] finished.
[1067] finished.
[1068] finished.
[1069] finished.
[1070] finished.
[1071] finished.
[1072] finished.
[1073] finished.
[1074] finished.
[1075] finished.
[1076] finished.
[1077] finished.
[1078] finished.
[1079] finished.
[1080] finished.
[1081] finished.
[1082] finished.
[1083] finished.
[1084] finished.
[1085] finished.
[1086] finished.
[1087] finished.
[1088] finished.
[1089] finished.
[1090] finished.
[1091] finished.
[1092] finished.
[1093] finished.
[1094] finished.
[1095] finished.
[1096] finished.
[1097] finished.
[1098] finished.
[1099] finished.
[1100] finished.
[1101] finished.
[1102] finished.
[1103] finished.
[1104] finished.
[1105] finished.
[1106] finished.
[1107] finished.
[1108] finished.
[1109] finished.
[1110] finished.
[1111] finished.
[1112] finished.
[1113] finished.
[1114] finished.
[1115] finished.
[1116] finished.
[1117] finishe

[1550] finished.
[1551] finished.
[1552] finished.
[1553] finished.
[1554] finished.
[1555] finished.
[1556] finished.
[1557] finished.
[1558] finished.
[1559] finished.
[1560] finished.
[1561] finished.
[1562] finished.
[1563] finished.
[1564] finished.
[1565] finished.
[1566] finished.
[1567] finished.
[1568] finished.
[1569] finished.
[1570] finished.
[1571] finished.
[1572] finished.
[1573] finished.
[1574] finished.
[1575] finished.
[1576] finished.
[1577] finished.
[1578] finished.
[1579] finished.
[1580] finished.
[1581] finished.
[1582] finished.
[1583] finished.
[1584] finished.
[1585] finished.
[1586] finished.
[1587] finished.
[1588] finished.
[1589] finished.
[1590] finished.
[1591] finished.
[1592] finished.
[1593] finished.
[1594] finished.
[1595] finished.
[1596] finished.
[1597] finished.
[1598] finished.
[1599] finished.
[1600] finished.
[1601] finished.
[1602] finished.
[1603] finished.
[1604] finished.
[1605] finished.
[1606] finished.
[1607] finished.
[1608] finishe

[2056] finished.
[2057] finished.
[2058] finished.
[2059] finished.
[2060] finished.
[2061] finished.
[2062] finished.
[2063] finished.
[2064] finished.
[2065] finished.
[2066] finished.
[2067] finished.
[2068] finished.
[2069] finished.
[2070] finished.
[2071] finished.
[2072] finished.
[2073] finished.
[2074] finished.
[2075] finished.
[2076] finished.
[2077] finished.
[2078] finished.
[2079] finished.
[2080] finished.
[2081] finished.
[2082] finished.
[2083] finished.
[2084] finished.
[2085] finished.
[2086] finished.
[2087] finished.
[2088] finished.
[2089] finished.
[2090] finished.
[2091] finished.
[2092] finished.
[2093] finished.
[2094] finished.
[2095] finished.
[2096] finished.
[2097] finished.
[2098] finished.
[2099] finished.
[2100] finished.
[2101] finished.
[2102] finished.
[2103] finished.
[2104] finished.
[2105] finished.
[2106] finished.
[2107] finished.
[2108] finished.
[2109] finished.
[2110] finished.
[2111] finished.
[2112] finished.
[2113] finished.
[2114] finishe

[2543] finished.
[2544] finished.
[2545] finished.
[2546] finished.
[2547] finished.
[2548] finished.
[2549] finished.
[2550] finished.
[2551] finished.
[2552] finished.
[2553] finished.
[2554] finished.
[2555] finished.
[2556] finished.
[2557] finished.
[2558] finished.
[2559] finished.
[2560] finished.
[2561] finished.
[2562] finished.
[2563] finished.
[2564] finished.
[2565] finished.
[2566] finished.
[2567] finished.
[2568] finished.
[2569] finished.
[2570] finished.
[2571] finished.
[2572] finished.
[2573] finished.
[2574] finished.
[2575] finished.
[2576] finished.
[2577] finished.
[2578] finished.
[2579] finished.
[2580] finished.
[2581] finished.
[2582] finished.
[2583] finished.
[2584] finished.
[2585] finished.
[2586] finished.
[2587] finished.
[2588] finished.
[2589] finished.
[2590] finished.
[2591] finished.
[2592] finished.
[2593] finished.
[2594] finished.
[2595] finished.
[2596] finished.
[2597] finished.
[2598] finished.
[2599] finished.
[2600] finished.
[2601] finishe

[3040] finished.
[3041] finished.
[3042] finished.
[3043] finished.
[3044] finished.
[3045] finished.
[3046] finished.
[3047] finished.
[3048] finished.
[3049] finished.
[3050] finished.
[3051] finished.
[3052] finished.
[3053] finished.
[3054] finished.
[3055] finished.
[3056] finished.
[3057] finished.
[3058] finished.
[3059] finished.
[3060] finished.
[3061] finished.
[3062] finished.
[3063] finished.
[3064] finished.
[3065] finished.
[3066] finished.
[3067] finished.
[3068] finished.
[3069] finished.
[3070] finished.
[3071] finished.
[3072] finished.
[3073] finished.
[3074] finished.
[3075] finished.
[3076] finished.
[3077] finished.
[3078] finished.
[3079] finished.
[3080] finished.
[3081] finished.
[3082] finished.
[3083] finished.
[3084] finished.
[3085] finished.
[3086] finished.
[3087] finished.
[3088] finished.
[3089] finished.
[3090] finished.
[3091] finished.
[3092] finished.
[3093] finished.
[3094] finished.
[3095] finished.
[3096] finished.
[3097] finished.
[3098] finishe

[3541] finished.
[3542] finished.
[3543] finished.
[3544] finished.
[3545] finished.
[3546] finished.
[3547] finished.
[3548] finished.
[3549] finished.
[3550] finished.
[3551] finished.
[3552] finished.
[3553] finished.
[3554] finished.
[3555] finished.
[3556] finished.
[3557] finished.
[3558] finished.
[3559] finished.
[3560] finished.
[3561] finished.
[3562] finished.
[3563] finished.
[3564] finished.
[3565] finished.
[3566] finished.
[3567] finished.
[3568] finished.
[3569] finished.
[3570] finished.
[3571] finished.
[3572] finished.
[3573] finished.
[3574] finished.
[3575] finished.
[3576] finished.
[3577] finished.
[3578] finished.
[3579] finished.
[3580] finished.
[3581] finished.
[3582] finished.
[3583] finished.
[3584] finished.
[3585] finished.
[3586] finished.
[3587] finished.
[3588] finished.
[3589] finished.
[3590] finished.
[3591] finished.
[3592] finished.
[3593] finished.
[3594] finished.
[3595] finished.
[3596] finished.
[3597] finished.
[3598] finished.
[3599] finishe

[4025] finished.
[4026] finished.
[4027] finished.
[4028] finished.
[4029] finished.
[4030] finished.
[4031] finished.
[4032] finished.
[4033] finished.
[4034] finished.
[4035] finished.
[4036] finished.
[4037] finished.
[4038] finished.
[4039] finished.
[4040] finished.
[4041] finished.
[4042] finished.
[4043] finished.
[4044] finished.
[4045] finished.
[4046] finished.
[4047] finished.
[4048] finished.
[4049] finished.
[4050] finished.
[4051] finished.
[4052] finished.
[4053] finished.
[4054] finished.
[4055] finished.
[4056] finished.
[4057] finished.
[4058] finished.
[4059] finished.
[4060] finished.
[4061] finished.
[4062] finished.
[4063] finished.
[4064] finished.
[4065] finished.
[4066] finished.
[4067] finished.
[4068] finished.
[4069] finished.
[4070] finished.
[4071] finished.
[4072] finished.
[4073] finished.
[4074] finished.
[4075] finished.
[4076] finished.
[4077] finished.
[4078] finished.
[4079] finished.
[4080] finished.
[4081] finished.
[4082] finished.
[4083] finishe

[4510] finished.
[4511] finished.
[4512] finished.
[4513] finished.
[4514] finished.
[4515] finished.
[4516] finished.
[4517] finished.
[4518] finished.
[4519] finished.
[4520] finished.
[4521] finished.
[4522] finished.
[4523] finished.
[4524] finished.
[4525] finished.
[4526] finished.
[4527] finished.
[4528] finished.
[4529] finished.
[4530] finished.
[4531] finished.
[4532] finished.
[4533] finished.
[4534] finished.
[4535] finished.
[4536] finished.
[4537] finished.
[4538] finished.
[4539] finished.
[4540] finished.
[4541] finished.
[4542] finished.
[4543] finished.
[4544] finished.
[4545] finished.
[4546] finished.
[4547] finished.
[4548] finished.
[4549] finished.
[4550] finished.
[4551] finished.
[4552] finished.
[4553] finished.
[4554] finished.
[4555] finished.
[4556] finished.
[4557] finished.
[4558] finished.
[4559] finished.
[4560] finished.
[4561] finished.
[4562] finished.
[4563] finished.
[4564] finished.
[4565] finished.
[4566] finished.
[4567] finished.
[4568] finishe

[5018] finished.
[5019] finished.
[5020] finished.
[5021] finished.
[5022] finished.
[5023] finished.
[5024] finished.
[5025] finished.
[5026] finished.
[5027] finished.
[5028] finished.
[5029] finished.
[5030] finished.
[5031] finished.
[5032] finished.
[5033] finished.
[5034] finished.
[5035] finished.
[5036] finished.
[5037] finished.
[5038] finished.
[5039] finished.
[5040] finished.
[5041] finished.
[5042] finished.
[5043] finished.
[5044] finished.
[5045] finished.
[5046] finished.
[5047] finished.
[5048] finished.
[5049] finished.
[5050] finished.
[5051] finished.
[5052] finished.
[5053] finished.
[5054] finished.
[5055] finished.
[5056] finished.
[5057] finished.
[5058] finished.
[5059] finished.
[5060] finished.
[5061] finished.
[5062] finished.
[5063] finished.
[5064] finished.
[5065] finished.
[5066] finished.
[5067] finished.
[5068] finished.
[5069] finished.
[5070] finished.
[5071] finished.
[5072] finished.
[5073] finished.
[5074] finished.
[5075] finished.
[5076] finishe

[5510] finished.
[5511] finished.
[5512] finished.
[5513] finished.
[5514] finished.
[5515] finished.
[5516] finished.
[5517] finished.
[5518] finished.
[5519] finished.
[5520] finished.
[5521] finished.
[5522] finished.
[5523] finished.
[5524] finished.
[5525] finished.
[5526] finished.
[5527] finished.
[5528] finished.
[5529] finished.
[5530] finished.
[5531] finished.
[5532] finished.
[5533] finished.
[5534] finished.
[5535] finished.
[5536] finished.
[5537] finished.
[5538] finished.
[5539] finished.
[5540] finished.
[5541] finished.
[5542] finished.
[5543] finished.
[5544] finished.
[5545] finished.
[5546] finished.
[5547] finished.
[5548] finished.
[5549] finished.
[5550] finished.
[5551] finished.
[5552] finished.
[5553] finished.
[5554] finished.
[5555] finished.
[5556] finished.
[5557] finished.
[5558] finished.
[5559] finished.
[5560] finished.
[5561] finished.
[5562] finished.
[5563] finished.
[5564] finished.
[5565] finished.
[5566] finished.
[5567] finished.
[5568] finishe

[6021] finished.
[6022] finished.
[6023] finished.
[6024] finished.
[6025] finished.
[6026] finished.
[6027] finished.
[6028] finished.
[6029] finished.
[6030] finished.
[6031] finished.
[6032] finished.
[6033] finished.
[6034] finished.
[6035] finished.
[6036] finished.
[6037] finished.
[6038] finished.
[6039] finished.
[6040] finished.
[6041] finished.
[6042] finished.
[6043] finished.
[6044] finished.
[6045] finished.
[6046] finished.
[6047] finished.
[6048] finished.
[6049] finished.
[6050] finished.
[6051] finished.
[6052] finished.
[6053] finished.
[6054] finished.
[6055] finished.
[6056] finished.
[6057] finished.
[6058] finished.
[6059] finished.
[6060] finished.
[6061] finished.
[6062] finished.
[6063] finished.
[6064] finished.
[6065] finished.
[6066] finished.
[6067] finished.
[6068] finished.
[6069] finished.
[6070] finished.
[6071] finished.
[6072] finished.
[6073] finished.
[6074] finished.
[6075] finished.
[6076] finished.
[6077] finished.
[6078] finished.
[6079] finishe

[6506] finished.
[6507] finished.
[6508] finished.
[6509] finished.
[6510] finished.
[6511] finished.
[6512] finished.
[6513] finished.
[6514] finished.
[6515] finished.
[6516] finished.
[6517] finished.
[6518] finished.
[6519] finished.
[6520] finished.
[6521] finished.
[6522] finished.
[6523] finished.
[6524] finished.
[6525] finished.
[6526] finished.
[6527] finished.
[6528] finished.
[6529] finished.
[6530] finished.
[6531] finished.
[6532] finished.
[6533] finished.
[6534] finished.
[6535] finished.
[6536] finished.
[6537] finished.
[6538] finished.
[6539] finished.
[6540] finished.
[6541] finished.
[6542] finished.
[6543] finished.
[6544] finished.
[6545] finished.
[6546] finished.
[6547] finished.
[6548] finished.
[6549] finished.
[6550] finished.
[6551] finished.
[6552] finished.
[6553] finished.
[6554] finished.
[6555] finished.
[6556] finished.
[6557] finished.
[6558] finished.
[6559] finished.
[6560] finished.
[6561] finished.
[6562] finished.
[6563] finished.
[6564] finishe

[7016] finished.
[7017] finished.
[7018] finished.
[7019] finished.
[7020] finished.
[7021] finished.
[7022] finished.
[7023] finished.
[7024] finished.
[7025] finished.
[7026] finished.
[7027] finished.
[7028] finished.
[7029] finished.
[7030] finished.
[7031] finished.
[7032] finished.
[7033] finished.
[7034] finished.
[7035] finished.
[7036] finished.
[7037] finished.
[7038] finished.
[7039] finished.
[7040] finished.
[7041] finished.
[7042] finished.
[7043] finished.
[7044] finished.
[7045] finished.
[7046] finished.
[7047] finished.
[7048] finished.
[7049] finished.
[7050] finished.
[7051] finished.
[7052] finished.
[7053] finished.
[7054] finished.
[7055] finished.
[7056] finished.
[7057] finished.
[7058] finished.
[7059] finished.
[7060] finished.
[7061] finished.
[7062] finished.
[7063] finished.
[7064] finished.
[7065] finished.
[7066] finished.
[7067] finished.
[7068] finished.
[7069] finished.
[7070] finished.
[7071] finished.
[7072] finished.
[7073] finished.
[7074] finishe

[7502] finished.
[7503] finished.
[7504] finished.
[7505] finished.
[7506] finished.
[7507] finished.
[7508] finished.
[7509] finished.
[7510] finished.
[7511] finished.
[7512] finished.
[7513] finished.
[7514] finished.
[7515] finished.
[7516] finished.
[7517] finished.
[7518] finished.
[7519] finished.
[7520] finished.
[7521] finished.
[7522] finished.
[7523] finished.
[7524] finished.
[7525] finished.
[7526] finished.
[7527] finished.
[7528] finished.
[7529] finished.
[7530] finished.
[7531] finished.
[7532] finished.
[7533] finished.
[7534] finished.
[7535] finished.
[7536] finished.
[7537] finished.
[7538] finished.
[7539] finished.
[7540] finished.
[7541] finished.
[7542] finished.
[7543] finished.
[7544] finished.
[7545] finished.
[7546] finished.
[7547] finished.
[7548] finished.
[7549] finished.
[7550] finished.
[7551] finished.
[7552] finished.
[7553] finished.
[7554] finished.
[7555] finished.
[7556] finished.
[7557] finished.
[7558] finished.
[7559] finished.
[7560] finishe

[7997] finished.
[7998] finished.
[7999] finished.
[8000] finished.
[8001] finished.
[8002] finished.
[8003] finished.
[8004] finished.
[8005] finished.
[8006] finished.
[8007] finished.
[8008] finished.
[8009] finished.
[8010] finished.
[8011] finished.
[8012] finished.
[8013] finished.
[8014] finished.
[8015] finished.
[8016] finished.
[8017] finished.
[8018] finished.
[8019] finished.
[8020] finished.
[8021] finished.
[8022] finished.
[8023] finished.
[8024] finished.
[8025] finished.
[8026] finished.
[8027] finished.
[8028] finished.
[8029] finished.
[8030] finished.
[8031] finished.
[8032] finished.
[8033] finished.
[8034] finished.
[8035] finished.
[8036] finished.
[8037] finished.
[8038] finished.
[8039] finished.
[8040] finished.
[8041] finished.
[8042] finished.
[8043] finished.
[8044] finished.
[8045] finished.
[8046] finished.
[8047] finished.
[8048] finished.
[8049] finished.
[8050] finished.
[8051] finished.
[8052] finished.
[8053] finished.
[8054] finished.
[8055] finishe

[8488] finished.
[8489] finished.
[8490] finished.
[8491] finished.
[8492] finished.
[8493] finished.
[8494] finished.
[8495] finished.
[8496] finished.
[8497] finished.
[8498] finished.
[8499] finished.
[8500] finished.
[8501] finished.
[8502] finished.
[8503] finished.
[8504] finished.
[8505] finished.
[8506] finished.
[8507] finished.
[8508] finished.
[8509] finished.
[8510] finished.
[8511] finished.
[8512] finished.
[8513] finished.
[8514] finished.
[8515] finished.
[8516] finished.
[8517] finished.
[8518] finished.
[8519] finished.
[8520] finished.
[8521] finished.
[8522] finished.
[8523] finished.
[8524] finished.
[8525] finished.
[8526] finished.
[8527] finished.
[8528] finished.
[8529] finished.
[8530] finished.
[8531] finished.
[8532] finished.
[8533] finished.
[8534] finished.
[8535] finished.
[8536] finished.
[8537] finished.
[8538] finished.
[8539] finished.
[8540] finished.
[8541] finished.
[8542] finished.
[8543] finished.
[8544] finished.
[8545] finished.
[8546] finishe

[8977] finished.
[8978] finished.
[8979] finished.
[8980] finished.
[8981] finished.
[8982] finished.
[8983] finished.
[8984] finished.
[8985] finished.
[8986] finished.
[8987] finished.
[8988] finished.
[8989] finished.
[8990] finished.
[8991] finished.
[8992] finished.
[8993] finished.
[8994] finished.
[8995] finished.
[8996] finished.
[8997] finished.
[8998] finished.
[8999] finished.
[9000] finished.
[9001] finished.
[9002] finished.
[9003] finished.
[9004] finished.
[9005] finished.
[9006] finished.
[9007] finished.
[9008] finished.
[9009] finished.
[9010] finished.
[9011] finished.
[9012] finished.
[9013] finished.
[9014] finished.
[9015] finished.
[9016] finished.
[9017] finished.
[9018] finished.
[9019] finished.
[9020] finished.
[9021] finished.
[9022] finished.
[9023] finished.
[9024] finished.
[9025] finished.
[9026] finished.
[9027] finished.
[9028] finished.
[9029] finished.
[9030] finished.
[9031] finished.
[9032] finished.
[9033] finished.
[9034] finished.
[9035] finishe

[9490] finished.
[9491] finished.
[9492] finished.
[9493] finished.
[9494] finished.
[9495] finished.
[9496] finished.
[9497] finished.
[9498] finished.
[9499] finished.
[9500] finished.
[9501] finished.
[9502] finished.
[9503] finished.
[9504] finished.
[9505] finished.
[9506] finished.
[9507] finished.
[9508] finished.
[9509] finished.
[9510] finished.
[9511] finished.
[9512] finished.
[9513] finished.
[9514] finished.
[9515] finished.
[9516] finished.
[9517] finished.
[9518] finished.
[9519] finished.
[9520] finished.
[9521] finished.
[9522] finished.
[9523] finished.
[9524] finished.
[9525] finished.
[9526] finished.
[9527] finished.
[9528] finished.
[9529] finished.
[9530] finished.
[9531] finished.
[9532] finished.
[9533] finished.
[9534] finished.
[9535] finished.
[9536] finished.
[9537] finished.
[9538] finished.
[9539] finished.
[9540] finished.
[9541] finished.
[9542] finished.
[9543] finished.
[9544] finished.
[9545] finished.
[9546] finished.
[9547] finished.
[9548] finishe

[9990] finished.
[9991] finished.
[9992] finished.
[9993] finished.
[9994] finished.
[9995] finished.
[9996] finished.
[9997] finished.
[9998] finished.
[9999] finished.
[10000] finished.
[10001] finished.
[10002] finished.
[10003] finished.
[10004] finished.
[10005] finished.
[10006] finished.
[10007] finished.
[10008] finished.
[10009] finished.
[10010] finished.
[10011] finished.
[10012] finished.
[10013] finished.
[10014] finished.
[10015] finished.
[10016] finished.
[10017] finished.
[10018] finished.
[10019] finished.
[10020] finished.
[10021] finished.
[10022] finished.
[10023] finished.
[10024] finished.
[10025] finished.
[10026] finished.
[10027] finished.
[10028] finished.
[10029] finished.
[10030] finished.
[10031] finished.
[10032] finished.
[10033] finished.
[10034] finished.
[10035] finished.
[10036] finished.
[10037] finished.
[10038] finished.
[10039] finished.
[10040] finished.
[10041] finished.
[10042] finished.
[10043] finished.
[10044] finished.
[10045] finished.
[1

[10449] finished.
[10450] finished.
[10451] finished.
[10452] finished.
[10453] finished.
[10454] finished.
[10455] finished.
[10456] finished.
[10457] finished.
[10458] finished.
[10459] finished.
[10460] finished.
[10461] finished.
[10462] finished.
[10463] finished.
[10464] finished.
[10465] finished.
[10466] finished.
[10467] finished.
[10468] finished.
[10469] finished.
[10470] finished.
[10471] finished.
[10472] finished.
[10473] finished.
[10474] finished.
[10475] finished.
[10476] finished.
[10477] finished.
[10478] finished.
[10479] finished.
[10480] finished.
[10481] finished.
[10482] finished.
[10483] finished.
[10484] finished.
[10485] finished.
[10486] finished.
[10487] finished.
[10488] finished.
[10489] finished.
[10490] finished.
[10491] finished.
[10492] finished.
[10493] finished.
[10494] finished.
[10495] finished.
[10496] finished.
[10497] finished.
[10498] finished.
[10499] finished.
[10500] finished.
[10501] finished.
[10502] finished.
[10503] finished.
[10504] fi

[10910] finished.
[10911] finished.
[10912] finished.
[10913] finished.
[10914] finished.
[10915] finished.
[10916] finished.
[10917] finished.
[10918] finished.
[10919] finished.
[10920] finished.
[10921] finished.
[10922] finished.
[10923] finished.
[10924] finished.
[10925] finished.
[10926] finished.
[10927] finished.
[10928] finished.
[10929] finished.
[10930] finished.
[10931] finished.
[10932] finished.
[10933] finished.
[10934] finished.
[10935] finished.
[10936] finished.
[10937] finished.
[10938] finished.
[10939] finished.
[10940] finished.
[10941] finished.
[10942] finished.
[10943] finished.
[10944] finished.
[10945] finished.
[10946] finished.
[10947] finished.
[10948] finished.
[10949] finished.
[10950] finished.
[10951] finished.
[10952] finished.
[10953] finished.
[10954] finished.
[10955] finished.
[10956] finished.
[10957] finished.
[10958] finished.
[10959] finished.
[10960] finished.
[10961] finished.
[10962] finished.
[10963] finished.
[10964] finished.
[10965] fi

In [199]:
ref_data['content'] = contents
ref_data = ref_data.dropna().reset_index(drop=True)

In [212]:
Y = ref_data['dtd']

### Preprocessing

In [202]:
# 数据预处理所需要的所有方法

# clean the document, only Chinese characters, Numbers and Punctuations are left.
def clean(doc):
    chi = r'([\u4E00-\u9FA5]|[0-9]|[“”、。《》！，：；？\.%])'
    pa = re.compile(chi)
    return "".join(re.findall(pa, doc))

# sentence segmentation
def sent_seg(cleaned_doc):
    sent_pa = re.compile(r'.+?[？。！]')
    return re.findall(sent_pa, cleaned_doc)

def pure_sent(sent):
    cleaned_sent_pa = re.compile(r'([\u4E00-\u9FA5])')
    return ''.join(re.findall(cleaned_sent_pa, sent))
        
# Size of a doc is defined as the total number of valid Chinese characters
def raw_process(doc):
    cleaned_doc = clean(doc)
    sents = sent_seg(cleaned_doc)
    if not cleaned_doc or not len(sents):
        return {
            'sents': [],
            'size': 0,
            'avg_sent_len' : 0
        }
    else:
        total_length = sum([len(pure_sent(sent)) for sent in sents])
        avg_sent_length = total_length / len(sents)
        return {
            'sents': sents,
            'size' : total_length,
            'avg_sent_len' : avg_sent_length 
        }

# generate frequency distribution for each document, vital step for bag_of_words representation
def gen_freq_dist(doc):
    stat = raw_process(doc)
    sents = stat['sents']
    freq_dist = dict()
    pa = re.compile(r'([$0123456789?_“”、。《》！，：；？\.%])')
    for sent in sents:
        # calculate sent length after
        words = jieba.cut(sent, cut_all=False, HMM=True)
        for word in words:
            # ignore all the stop words
            if (not word in stop_words) and (not re.findall(pa, word)):
                freq_dist.setdefault(word, 0)
                freq_dist[word] += 1
    return { 'freq_dist' : freq_dist, 
             'size' : stat['size'],
             'avg_sent_len' : stat['avg_sent_len'],
             'n_sents' : len(sents)
           }

### Full frequencey distribution

1. 如果已经对当前的data完整运行过readability，请将readability文件夹里面的all_freq_dist.json复制到regression文件夹里，从而复用数据，并使用下数第二个cell进行读取。
2. 如果尚未生成当前data的完整freq_dist，请跑一次下数第一个cell（One-time-block）进行生成。

In [None]:
# One-time block 
# 建立一个完整的 frequency distribution，推荐只跑一次将数据储存以复用
def init_all_freq_dist():
    count = 0
    all_freq_dist = dict()
    for index, d in data.iterrows():
        print('[' + str(count) + '] Processing document ' + str(d['文档号码']) + '...')
        fd = gen_freq_dist(d[1])['freq_dist']
        for k in fd.keys():
            all_freq_dist.setdefault(k, 0)
            all_freq_dist[k] += fd[k]
        count += 1
    return all_freq_dist

all_freq_dist = init_all_freq_dist()
with open('all_freq_dist.json', 'w+', encoding='UTF-8-sig') as f:
    json.dump(all_freq_dist, f)

In [203]:
# 如果前一个cell已经完整跑完一次，只需要跑这个cell就能拿到完整的 frequency distribution
with open('all_freq_dist.json', 'r', encoding='UTF-8-sig') as f:
    all_freq_dist = json.load(f)
all_freq_dist_df = pd.DataFrame.from_dict(all_freq_dist, orient='index', columns=['freq'])
print('Most frequent word is: ' + str(np.argmax(all_freq_dist_df['freq'])))
all_freq_dist_df.describe()

Most frequent word is: 公司


The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return getattr(obj, method)(*args, **kwds)


Unnamed: 0,freq
count,128918.0
mean,137.010472
std,2841.91742
min,1.0
25%,1.0
50%,3.0
75%,13.0
max,770176.0


### Bag of words construction

有了完整的freq_dist以后就能把每个document都转换成bag_of_words形式。

In [204]:
# 先把frequency低于3的全部去掉
low_freq_words = [word for word in all_freq_dist.keys() if all_freq_dist[word] <= 3]
for lw in low_freq_words:
    del all_freq_dist[lw]
print('Remaining number of words:', len(all_freq_dist.keys()))

Remaining number of words: 59536


In [205]:
vec = DictVectorizer()
all_bow = vec.fit_transform(all_freq_dist).toarray()
print('e.g.', vec.get_feature_names()[12000:12010])

e.g. ['化已', '化建', '化强', '化成', '化整为零', '化新', '化是', '化机', '化机制', '化来']


In [238]:
def gen_bag_of_words(doc):
    global vec
    return vec.transform(gen_freq_dist(doc)['freq_dist']).toarray()

def all_bag_of_words(data, col_name_for_docs, limited=False):
    global vec
    dimension = len(vec.get_feature_names())
    count = 0
    init = True
    X = []
    for index, data_point in data.iterrows():
        # 文档号码如果不存在将以下print注释，或者替换成另外指明数据的列
        print('[' + str(count) + '] Transforming document...')
        # Initialise X with first document
        if init:
            X = gen_bag_of_words(data_point[col_name_for_docs])
            init = False
        else:
            X = np.vstack((X, gen_bag_of_words(data_point[col_name_for_docs])))
        count += 1
        
        if limited and count == 1000:
            # For test use, just use the first 1000 rows
            break
    
    return X

# returning the coeffcients of the linear regression model after fitting X and y
def lr_coeffs(X, y):
    global vec
    features = list(vec.get_feature_names())
    clf = Ridge(alpha=1.0).fit(X, y)
    coeffs = list(clf.coef_)
    result = pd.DataFrame(columns=['Feature', 'Coefficients'])
    result['Feature'] = features
    result['Coefficients'] = coeffs
    return result.sort_values(by=['Coefficients'], ascending=False)

In [None]:
# Simply for testing
test_X = all_bag_of_words('投资者关系活动主要内容介绍', limited=True)
test_y = np.dot(test_X, np.array([1, 2] * int(59536/2))) + 3
dummy = pd.DataFrame()
dummy['Y'] = list(test_y)
dummy['文档号码'] = list(data['文档号码'][:1000])
dummy.to_excel('dummy_Y.xlsx')
test_result = lr_coeffs(test_X, test_y)

In [214]:
X = all_bag_of_words(ref_data, 'content', limited=False)

Building prefix dict from the default dictionary ...


[0] Transforming document...


Dumping model to file cache C:\Users\lawhy\AppData\Local\Temp\jieba.cache
Loading model cost 0.784 seconds.
Prefix dict has been built successfully.


[1] Transforming document...
[2] Transforming document...
[3] Transforming document...
[4] Transforming document...
[5] Transforming document...
[6] Transforming document...
[7] Transforming document...
[8] Transforming document...
[9] Transforming document...
[10] Transforming document...
[11] Transforming document...
[12] Transforming document...
[13] Transforming document...
[14] Transforming document...
[15] Transforming document...
[16] Transforming document...
[17] Transforming document...
[18] Transforming document...
[19] Transforming document...
[20] Transforming document...
[21] Transforming document...
[22] Transforming document...
[23] Transforming document...
[24] Transforming document...
[25] Transforming document...
[26] Transforming document...
[27] Transforming document...
[28] Transforming document...
[29] Transforming document...
[30] Transforming document...
[31] Transforming document...
[32] Transforming document...
[33] Transforming document...
[34] Transforming d

[270] Transforming document...
[271] Transforming document...
[272] Transforming document...
[273] Transforming document...
[274] Transforming document...
[275] Transforming document...
[276] Transforming document...
[277] Transforming document...
[278] Transforming document...
[279] Transforming document...
[280] Transforming document...
[281] Transforming document...
[282] Transforming document...
[283] Transforming document...
[284] Transforming document...
[285] Transforming document...
[286] Transforming document...
[287] Transforming document...
[288] Transforming document...
[289] Transforming document...
[290] Transforming document...
[291] Transforming document...
[292] Transforming document...
[293] Transforming document...
[294] Transforming document...
[295] Transforming document...
[296] Transforming document...
[297] Transforming document...
[298] Transforming document...
[299] Transforming document...
[300] Transforming document...
[301] Transforming document...
[302] Tr

[536] Transforming document...
[537] Transforming document...
[538] Transforming document...
[539] Transforming document...
[540] Transforming document...
[541] Transforming document...
[542] Transforming document...
[543] Transforming document...
[544] Transforming document...
[545] Transforming document...
[546] Transforming document...
[547] Transforming document...
[548] Transforming document...
[549] Transforming document...
[550] Transforming document...
[551] Transforming document...
[552] Transforming document...
[553] Transforming document...
[554] Transforming document...
[555] Transforming document...
[556] Transforming document...
[557] Transforming document...
[558] Transforming document...
[559] Transforming document...
[560] Transforming document...
[561] Transforming document...
[562] Transforming document...
[563] Transforming document...
[564] Transforming document...
[565] Transforming document...
[566] Transforming document...
[567] Transforming document...
[568] Tr

[801] Transforming document...
[802] Transforming document...
[803] Transforming document...
[804] Transforming document...
[805] Transforming document...
[806] Transforming document...
[807] Transforming document...
[808] Transforming document...
[809] Transforming document...
[810] Transforming document...
[811] Transforming document...
[812] Transforming document...
[813] Transforming document...
[814] Transforming document...
[815] Transforming document...
[816] Transforming document...
[817] Transforming document...
[818] Transforming document...
[819] Transforming document...
[820] Transforming document...
[821] Transforming document...
[822] Transforming document...
[823] Transforming document...
[824] Transforming document...
[825] Transforming document...
[826] Transforming document...
[827] Transforming document...
[828] Transforming document...
[829] Transforming document...
[830] Transforming document...
[831] Transforming document...
[832] Transforming document...
[833] Tr

[1065] Transforming document...
[1066] Transforming document...
[1067] Transforming document...
[1068] Transforming document...
[1069] Transforming document...
[1070] Transforming document...
[1071] Transforming document...
[1072] Transforming document...
[1073] Transforming document...
[1074] Transforming document...
[1075] Transforming document...
[1076] Transforming document...
[1077] Transforming document...
[1078] Transforming document...
[1079] Transforming document...
[1080] Transforming document...
[1081] Transforming document...
[1082] Transforming document...
[1083] Transforming document...
[1084] Transforming document...
[1085] Transforming document...
[1086] Transforming document...
[1087] Transforming document...
[1088] Transforming document...
[1089] Transforming document...
[1090] Transforming document...
[1091] Transforming document...
[1092] Transforming document...
[1093] Transforming document...
[1094] Transforming document...
[1095] Transforming document...
[1096] T

[1321] Transforming document...
[1322] Transforming document...
[1323] Transforming document...
[1324] Transforming document...
[1325] Transforming document...
[1326] Transforming document...
[1327] Transforming document...
[1328] Transforming document...
[1329] Transforming document...
[1330] Transforming document...
[1331] Transforming document...
[1332] Transforming document...
[1333] Transforming document...
[1334] Transforming document...
[1335] Transforming document...
[1336] Transforming document...
[1337] Transforming document...
[1338] Transforming document...
[1339] Transforming document...
[1340] Transforming document...
[1341] Transforming document...
[1342] Transforming document...
[1343] Transforming document...
[1344] Transforming document...
[1345] Transforming document...
[1346] Transforming document...
[1347] Transforming document...
[1348] Transforming document...
[1349] Transforming document...
[1350] Transforming document...
[1351] Transforming document...
[1352] T

[1578] Transforming document...
[1579] Transforming document...
[1580] Transforming document...
[1581] Transforming document...
[1582] Transforming document...
[1583] Transforming document...
[1584] Transforming document...
[1585] Transforming document...
[1586] Transforming document...
[1587] Transforming document...
[1588] Transforming document...
[1589] Transforming document...
[1590] Transforming document...
[1591] Transforming document...
[1592] Transforming document...
[1593] Transforming document...
[1594] Transforming document...
[1595] Transforming document...
[1596] Transforming document...
[1597] Transforming document...
[1598] Transforming document...
[1599] Transforming document...
[1600] Transforming document...
[1601] Transforming document...
[1602] Transforming document...
[1603] Transforming document...
[1604] Transforming document...
[1605] Transforming document...
[1606] Transforming document...
[1607] Transforming document...
[1608] Transforming document...
[1609] T

[1835] Transforming document...
[1836] Transforming document...
[1837] Transforming document...
[1838] Transforming document...
[1839] Transforming document...
[1840] Transforming document...
[1841] Transforming document...
[1842] Transforming document...
[1843] Transforming document...
[1844] Transforming document...
[1845] Transforming document...
[1846] Transforming document...
[1847] Transforming document...
[1848] Transforming document...
[1849] Transforming document...
[1850] Transforming document...
[1851] Transforming document...
[1852] Transforming document...
[1853] Transforming document...
[1854] Transforming document...
[1855] Transforming document...
[1856] Transforming document...
[1857] Transforming document...
[1858] Transforming document...
[1859] Transforming document...
[1860] Transforming document...
[1861] Transforming document...
[1862] Transforming document...
[1863] Transforming document...
[1864] Transforming document...
[1865] Transforming document...
[1866] T

[2092] Transforming document...
[2093] Transforming document...
[2094] Transforming document...
[2095] Transforming document...
[2096] Transforming document...
[2097] Transforming document...
[2098] Transforming document...
[2099] Transforming document...
[2100] Transforming document...
[2101] Transforming document...
[2102] Transforming document...
[2103] Transforming document...
[2104] Transforming document...
[2105] Transforming document...
[2106] Transforming document...
[2107] Transforming document...
[2108] Transforming document...
[2109] Transforming document...
[2110] Transforming document...
[2111] Transforming document...
[2112] Transforming document...
[2113] Transforming document...
[2114] Transforming document...
[2115] Transforming document...
[2116] Transforming document...
[2117] Transforming document...
[2118] Transforming document...
[2119] Transforming document...
[2120] Transforming document...
[2121] Transforming document...
[2122] Transforming document...
[2123] T

### Finally...

这里我们需要读取真正的Y值，格式为excel文件，且仅有两列，一列是ID（比如文档号码），一列是Y值。

$\textbf{注意！}$ 读取Y值的文件里，ID的对应顺序要和提供训练数据的文档ID一致！一个简单的办法就是把Y值先按ID添加到原数据中，再进行分割即可。

In [None]:
def load_Y(file_path, col_name_ID, col_name_Y):
    global data
    df = pd.read_excel(file_path)
    if list(df[col_name_ID]) == list(data[col_name_ID]):
        print('训练数据与Y值的文档ID成功匹配！')
        return np.array(list(df[col_name_Y]))
    else:
        print('警告！训练数据与Y值的文档ID不匹配，请检查！')
        return None

In [None]:
y_path = '请替换成储存Y值文件的路径' # e.g. dummy_Y.xlsx
doc_ID_name = '请替换文档ID的名称' # e.g. 文档号码
y_name = '请替换Y值的名称' 
y = load_Y(Y_path, doc_ID_name, Y_name)

如果之前已经拿到Y 就直接使用

In [239]:
result = lr_coeffs(X, Y)
# 储存结果
result.to_excel('word_ranking.xlsx')