In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

from modules.info import *
from modules.save2csv import *
from modules.judge_type import *  # 导入企业规模判断函数


In [2]:
# Load the list of entities whose raw data has already been extracted.
# `path` is not defined in this notebook; presumably it comes from one of the
# star imports in the first cell — confirm.
entity = pd.read_csv(path + 'data/主体提数_230423.csv')

# The audit opinion is not available yet, so the column starts out as all-NaN.
# TODO: fill in the missing audit opinions.
entity['NOTES_AOO_AuditOpinion_idou'] = np.nan

# Load the entity classification lists (one workbook per category).
entity_A, entity_IPO, entity_Bond, entity_toBond, entity_cBond = (
    pd.read_excel(path + workbook)
    for workbook in (
        'rules/主体分类清单_230228/A股上市.xlsx',
        'rules/主体分类清单_230228/IPO.xlsx',
        'rules/主体分类清单_230228/债券存续主体.xlsx',
        'rules/主体分类清单_230228/待发行债券主体.xlsx',
        'rules/主体分类清单_230228/可转债.xlsx',
    )
)

# Load the industry mapping tables, level 1 through level 4.
ind1, ind2, ind3, ind4 = (
    pd.read_excel(path + workbook)
    for workbook in (
        'rules/行业映射表/一级.xlsx',
        'rules/行业映射表/二级.xlsx',
        'rules/行业映射表/三级.xlsx',
        'rules/行业映射表/四级.xlsx',
    )
)

# Load the per-industry size classification rules.
ind_rule = pd.read_excel(path + 'rules/行业规模判断_230418.xlsx')


In [3]:
# Start computing labels.
# Issuance status: membership in the IPO list is tested first, so it wins over
# the A-share list; anything in neither list is labelled '其他'.
conditions = [
    entity['证券代码'].isin(entity_IPO['证券代码']),
    entity['证券代码'].isin(entity_A['证券代码'])
]
choices = ['IPO', 'A股上市']
entity['ComFeature-AShare-Category'] = np.select(conditions, choices, default='其他')

# Listing board: keep only the four recognised boards, blank out everything else.
entity['ComFeature-AShare-Market'] = entity['上市板'].where(
    entity['上市板'].isin(['主板', '创业板', '科创板', '北证']))

# Listing venue: keep only Shanghai / Shenzhen / Beijing, blank out everything else.
entity['ComFeature-AShare-Local'] = entity['股票上市地点'].where(
    entity['股票上市地点'].isin(['上海', '深圳', '北京']))

# Mainland company: unified social credit code starts with '91'.
# na=False matters: without it a missing code yields NaN, which np.where treats
# as truthy, so entities with no code were being labelled as mainland.
entity['ComFeature-ListInfo-Mainland'] = entity['统一社会信用代码'].str.startswith('91', na=False)

# Fast revenue growth.
# NOTE(review): the ratio spans 2019 -> 2021 (two year-on-year periods) but the
# exponent is 1/3 — confirm the intended compound-growth definition.
# NOTE(review): confirm the 5,000,000,000 threshold and its currency unit
# against the rule text.
revenue_threshold = 5000000000
entity['Increase_rate-3Y'] = (entity['营业收入21']/entity['营业收入19'])**(1/3) - 1
is_large_revenue = entity['营业收入21'] >= revenue_threshold
is_small_revenue = entity['营业收入21'] < revenue_threshold
entity['ComFeature-ListPerform-Increase'] = (
    (is_large_revenue & (entity['Increase_rate-3Y'] > 0.1))
    | (is_small_revenue & (entity['Increase_rate-3Y'] > 0.2))
)

# Has differentiated voting rights (source column equals 1).
entity['ComFeature-ListEquity-Diff'] = entity['是否存在投票权差异'] == 1

# Red-chip company not yet listed overseas: defined here simply as "not mainland".
entity['ComFeature-ListInfo-RedChip'] = ~entity['ComFeature-ListInfo-Mainland']


In [4]:
# Bond status flags, matched on the unified social credit code:
# "Issuing" = present in the outstanding-bond list,
# "ToIssue" = present in the to-be-issued list.
credit_code = entity['统一社会信用代码']
entity['ComFeature-BondIssue-Issuing'] = credit_code.isin(entity_Bond['统一社会信用代码'])
entity['ComFeature-BondIssue-ToIssue'] = credit_code.isin(entity_toBond['统一社会信用代码'])

# One flag per Wind level-2 bond category:
# short-term financing bill / enterprise bond / corporate bond / medium-term note.
# NOTE(review): 'ComFeature-BondIssue-ShortTerm' is assigned again later in the
# notebook (short-term corporate bond), which overwrites the value set here.
wind_bond_type = entity['Wind债券二级分类']
for flag_col, bond_type in [
    ('ComFeature-BondIssue-ShortTerm', '一般短期融资券'),
    ('ComFeature-BondIssue-Enterprise', '一般企业债'),
    ('ComFeature-BondIssue-Company', '一般公司债'),
    ('ComFeature-BondIssue-Medium', '一般中期票据'),
]:
    entity[flag_col] = wind_bond_type == bond_type

In [5]:
# Financial-company flag.
# TRUE when the Wind level-2 industry is one of "银行", "金融", "多元金融", "保险II",
# or the Shenwan level-1 industry is one of "银行", "非银金融"; FALSE otherwise.
# Strip the literal text "(2021)" from the Shenwan industry name first.
# regex=False is required: read as a regex, '(2021)' is a capture group that
# removes only "2021" and leaves "()" behind, so the isin() below would never match.
entity['申万一级行业分类'] = entity['申万一级行业分类'].str.replace('(2021)', '', regex=False)

# NOTE(review): confirm the source spells the insurance industry "保险II"
# (two Latin letters) rather than with a Roman-numeral character.
is_wind_financial = entity['Wind二级行业分类'].isin(["银行", "金融", "多元金融", "保险II"])
is_sw_financial = entity['申万一级行业分类'].isin(["银行", "非银金融"])
entity['ComFeature-BondInd-IfFin'] = is_wind_financial | is_sw_financial

# Debt financing instrument of a non-financial enterprise.
# TRUE only when the issuer is NOT a financial company AND the NAFMII level-1
# bond category is one of: short-term financing bill / medium-term note /
# super-short-term financing bill / SME collective note.
# The two conditions are combined with AND as the rule states; the previous OR
# flagged every non-financial entity regardless of bond category.
is_nafmii_instrument = entity['NAFMII债券一级分类'].isin(['短期融资券', '中期票据', '超短期融资券', '中小企业集合票据'])
entity['ComFeature-BondIssue-NonFin'] = is_nafmii_instrument & ~entity['ComFeature-BondInd-IfFin']


  entity['申万一级行业分类'] = entity['申万一级行业分类'].str.replace('(2021)', '')


In [6]:
# State-owned enterprise flag.
# True when the ownership attribute is a central SOE, a local SOE, or a
# collective enterprise; False otherwise.
state_owned_types = ["中央国有企业", "地方国有企业", "集体企业"]
entity['ComFeature-BondProp-IfState'] = entity['公司属性'].isin(state_owned_types)


In [7]:
# Bond listing venue: one boolean flag each for Shanghai / Shenzhen / Beijing /
# the interbank market.
bond_venue_flags = {
    'ComFeature-BondIssue-LocalSH': '上海',
    'ComFeature-BondIssue-LocalSZ': '深圳',
    'ComFeature-BondIssue-LocalBJ': '北京',
    'ComFeature-BondIssue-LocalBank': '银行间',
}
for flag_col, venue in bond_venue_flags.items():
    entity[flag_col] = entity['债券上市地点'] == venue


In [8]:
# Sci-tech innovation corporate bond: flagged when the model exposure code is 'M001'.
entity['ComFeature-BondInd-ITBond'] = entity['模型敞口'] == 'M001'

# Short-term corporate bond: a corporate bond whose term is at most 1
# (presumably years — confirm the unit of '债券期限').
# NOTE(review): this reuses the column name already written for
# '一般短期融资券' earlier in the notebook, so that earlier flag is overwritten —
# confirm whether the two labels should live in separate columns.
is_corporate_bond = entity['ComFeature-BondIssue-Company'] == True
entity['ComFeature-BondIssue-ShortTerm'] = (entity['债券期限'] <= 1) & is_corporate_bond


In [9]:
# Convertible bond flag: the entity's unified social credit code appears in the
# convertible-bond list.  TODO: the source list may be incomplete.
convertible_issuers = entity_cBond['统一社会信用代码']
entity['ComFeature-BondIssue-Conv'] = entity['统一社会信用代码'].isin(convertible_issuers)

In [10]:
# Ownership category: national SOE / local SOE / other.
# Elsewhere in this notebook the ownership value for centrally-owned companies
# is spelled '中央国有企业', so that spelling is matched as well; testing only
# '全国国有企业' sent every central SOE into '其他'. The output label is unchanged.
ownership = entity['公司属性']
conditions = [
    ownership.isin(['中央国有企业', '全国国有企业']),
    ownership == '地方国有企业'
]
choices = ['全国国有企业', '地方国有企业']
entity['ComFeature-GeneralEquity-Feature'] = np.select(conditions, choices, default='其他')

# Local-government financing vehicle (城投) flag: either the Wind or the YY
# classification marks the entity as one.
# NOTE(review): assumes both source columns hold real booleans; if they are
# text flags the comparisons below are always False — confirm.
entity['ComFeature-Ind-CountryInv'] = (entity['Wind城投债判定'] == True) | (entity['YY城投债判定'] == True)

In [11]:
# Audit-opinion severity: 1 (standard unqualified) up to 5 (adverse);
# NaN when the opinion text matches none of the known values.
audit_opinion = entity['NOTES_AOO_AuditOpinion_idou']
severity_by_opinion = [
    ('标准无保留意见', 1),
    ('带强调事项段的无保留意见', 2),
    ('保留意见', 3),
    ('无法表示意见', 4),
    ('否定意见', 5),
]
conditions = [audit_opinion == opinion_text for opinion_text, _ in severity_by_opinion]
choices = [severity for _, severity in severity_by_opinion]

entity['ComFeature-GeneralPerform-Audit'] = np.select(conditions, choices, default=np.nan)

# Audit opinion is acceptable: only the two unqualified variants count.
acceptable_opinions = ['标准无保留意见', '带强调事项段的无保留意见']
entity['ComFeature-GeneralPerform-IfGoodAudit'] = audit_opinion.isin(acceptable_opinions)


In [12]:
# Industry label, hypothesis 1: treat every entity as a central SOE.
# The ownership override is applied to a copy (assign), so the real '公司属性'
# values on `entity` are preserved and the earlier cells stay re-runnable.
entity_ind = entity.assign(**{'公司属性': '中央国有企业'})

# Attach the evaluation industry at each classification level, finest first.
for mapping, level_key in [
    (ind4, '国民经济行业分类：四级'),
    (ind3, '国民经济行业分类：三级'),
    (ind2, '国民经济行业分类：二级'),
    (ind1, '国民经济行业分类：一级'),
]:
    entity_ind = entity_ind.merge(mapping, on=[level_key, '公司属性'], how='left')

# Take the most specific level that has a mapping (4 -> 3 -> 2 -> 1); NaN if none.
# Coalescing with fillna avoids filling the whole frame with a 99999 sentinel
# and replacing it back, which would also wipe any genuine 99999 in other columns.
entity_ind['评价行业_中央'] = (
    entity_ind['评价行业：四级']
    .fillna(entity_ind['评价行业：三级'])
    .fillna(entity_ind['评价行业：二级'])
    .fillna(entity_ind['评价行业：一级'])
)

In [13]:
# Industry label, hypothesis 2: treat every entity as a local SOE.
# Drop the per-level columns left over from the previous mapping pass, then
# switch the ownership key so the merges below pick the local-SOE mappings.
entity_ind = (
    entity_ind
    .drop(columns=['评价行业：四级', '评价行业：三级', '评价行业：二级', '评价行业：一级'])
    .assign(**{'公司属性': '地方国有企业'})
)

# Attach the evaluation industry at each classification level, finest first.
for mapping, level_key in [
    (ind4, '国民经济行业分类：四级'),
    (ind3, '国民经济行业分类：三级'),
    (ind2, '国民经济行业分类：二级'),
    (ind1, '国民经济行业分类：一级'),
]:
    entity_ind = entity_ind.merge(mapping, on=[level_key, '公司属性'], how='left')

# Take the most specific level that has a mapping (4 -> 3 -> 2 -> 1); NaN if none.
# Coalescing with fillna avoids filling the whole frame with a 99999 sentinel
# and replacing it back, which would also wipe any genuine 99999 in other columns.
entity_ind['评价行业_地方'] = (
    entity_ind['评价行业：四级']
    .fillna(entity_ind['评价行业：三级'])
    .fillna(entity_ind['评价行业：二级'])
    .fillna(entity_ind['评价行业：一级'])
)

In [14]:
# Size classification rules.
# Strip parentheses and the X / Y / Z placeholder letters from the indicator
# names so that only the bare indicator name remains.
str_remove_list = [r'(', r')', 'X', 'Y', 'Z']

for str_remove in str_remove_list:
    # regex=False: every entry is a literal character. '(' and ')' are not
    # valid regular expressions on their own, and relying on the implicit
    # default triggers a FutureWarning on older pandas.
    ind_rule['指标名称'] = ind_rule['指标名称'].str.replace(str_remove, '', regex=False)


def remove_big(string):
    """Delete every '≥' together with the single character right before it.

    For example 'X≥1000' becomes '1000'. A '≥' with no character in front of
    it (at the very start of the text) is left untouched.
    """
    return re.sub(r'.≥', '', string)

def remove_medium_and_small(string):
    """Cut the text at '≤', dropping the '≤' and the rest of that line.

    For example '300≤X<1000' becomes '300'. Text without '≤' is returned
    unchanged.
    """
    return re.sub(r'≤.*', '', string)


# Reduce each size column to its bare lower-bound text:
# '大型' loses the leading "<char>≥"; '中型' and '小型' keep only what precedes '≤'.
threshold_cleaners = {
    '大型': remove_big,
    '中型': remove_medium_and_small,
    '小型': remove_medium_and_small,
}
for size_col, cleaner in threshold_cleaners.items():
    ind_rule[size_col] = ind_rule[size_col].map(cleaner)

# The micro-size column is not used by the rules built below.
ind_rule.drop(columns=['微型'], inplace=True)

  ind_rule['指标名称'] = ind_rule['指标名称'].str.replace(str_remove, '')


In [15]:
# Convert the cleaned threshold text of all three size columns to numbers.
size_list = ['大型', '中型', '小型']
ind_rule[size_list] = ind_rule[size_list].apply(pd.to_numeric)

In [16]:
# Build the lookup used for size classification: level-1 industry -> nested thresholds.
# Step 1: pack each row's three thresholds into one list [large, medium, small].
ind_rule['size_rule'] = ind_rule.apply(lambda row: list(row[['大型', '中型', '小型']]), axis=1)
ind_rule.drop(columns=['大型', '中型', '小型'], inplace=True)
# Step 2: go from long to wide — one row per level-1 industry, one column per
# indicator name, each cell holding that indicator's threshold list.
ind_rule = pd.pivot(ind_rule, index='国民经济行业分类：一级', columns='指标名称', values='size_rule').reset_index()

# Step 3: combine the three indicator columns into one list of lists, ordered
# employee count, revenue, total assets.
# NOTE(review): the later judge_type call passes the entity values in the order
# revenue, employee count, total assets — confirm judge_type accounts for the
# different ordering.
ind_rule['size_rule'] = ind_rule.apply(lambda row: list(row[['员工总数', '营业收入', '资产总额']]), axis=1)
ind_rule.drop(columns=['员工总数', '营业收入', '资产总额'], inplace=True)

# Step 4: dict keyed by level-1 industry name, value = the nested threshold list.
ind_rule_dict = ind_rule.set_index('国民经济行业分类：一级')['size_rule'].to_dict()

In [17]:
# Classify company size row by row. judge_type (imported from modules.judge_type)
# receives [level-1 industry, revenue 21, employees 21, total assets 21] plus the
# per-industry threshold dict.
scale_inputs = entity_ind[['国民经济行业分类：一级', '营业收入21', '员工总数21', '资产总计21']]
entity_ind['企业规模22'] = scale_inputs.apply(
    lambda record: judge_type(list(record), ind_rule_dict),
    axis=1,
)

# The per-level evaluation-industry columns are no longer needed.
entity_ind.drop(columns=['评价行业：四级', '评价行业：三级', '评价行业：二级', '评价行业：一级'], inplace=True)

# Keep one row per unified social credit code.
# NOTE(review): drop_duplicates treats missing codes as equal to each other, so
# all entities without a code collapse into a single row — confirm this is intended.
entity_ind.drop_duplicates(subset=['统一社会信用代码'], inplace=True)

In [18]:
# Tidy up.  TODO: keep this list in sync whenever the raw extract changes.
# Raw inputs that were only needed to derive the labels.
raw_input_columns = [
    '国民经济行业分类：一级', '国民经济行业分类：二级',
    '国民经济行业分类：三级', '国民经济行业分类：四级',
    '资产总计21', '员工总数21', '营业收入21',
    '资产总计20', '员工总数20', '营业收入20',
    '资产总计19', '员工总数19', '营业收入19',
    '资产总计18', '员工总数18', '营业收入18',
]
entity_ind.drop(columns=raw_input_columns, inplace=True)

# Final label names: the two industry hypotheses (central / local) and the size class.
label_names = {
    '评价行业_中央': 'ComFeature-GeneralInd-Ind-1',
    '评价行业_地方': 'ComFeature-GeneralInd-Ind-2',
    '企业规模22': 'ComFeature-GeneralPerform-Scale',
}
entity_ind.rename(columns=label_names, inplace=True)

In [19]:
# Final table: drop the columns at positions 3..17 and the intermediate
# growth-rate column.
# NOTE(review): the positional slice depends on the column order of the raw
# extract — confirm it still targets the intended columns when the input changes.
positional_drop = entity_ind.columns[3:18]
entity22 = (
    entity_ind
    .drop(columns=positional_drop)
    .drop(columns='Increase_rate-3Y')
)

In [20]:
# Preview the first two rows of the final labelled table.
entity22.head(2)

Unnamed: 0,证券代码,证券简称,统一社会信用代码,NOTES_AOO_AuditOpinion_idou,ComFeature-AShare-Category,ComFeature-AShare-Market,ComFeature-AShare-Local,ComFeature-ListInfo-Mainland,ComFeature-ListPerform-Increase,ComFeature-ListEquity-Diff,...,ComFeature-BondIssue-LocalBank,ComFeature-BondInd-ITBond,ComFeature-BondIssue-Conv,ComFeature-GeneralEquity-Feature,ComFeature-Ind-CountryInv,ComFeature-GeneralPerform-Audit,ComFeature-GeneralPerform-IfGoodAudit,ComFeature-GeneralInd-Ind-1,ComFeature-GeneralInd-Ind-2,ComFeature-GeneralPerform-Scale
0,000002.SZ,万科A,91440300192181490G,,A股上市,主板,深圳,True,False,False,...,False,False,False,其他,False,,False,房地产开发业,地方房地产业,大型
1,000004.SZ,ST国华,91440300192441969E,,A股上市,主板,深圳,True,False,False,...,False,False,False,其他,False,,False,化学药品制造业,地方医药工业,大型


In [21]:
# Persist the labelled entity table via the project helper save2csv
# (presumably provided by modules.save2csv and writing a CSV named after the
# second argument — confirm against that module).
save2csv(entity22, '1_带标签的主体')