In [1]:
import os
import pandas as pd
from tqdm import tqdm

def process_excel_files(directory):
    """
    Process all .xlsx files in the given directory and its subdirectories.
    For each file, add a column with the file name (without extension) and combine all dataframes.

    Args:
    - directory (str): Path to the base directory containing .xlsx files.

    Returns:
    - pd.DataFrame: A concatenated dataframe with data from all .xlsx files.
    """
    dataframes = []
    
    # Traverse the directory structure
    for root, dirs, files in os.walk(directory):
        for file in tqdm(files):
            if file.endswith(".xlsx"):
                # Construct the full file path
                file_path = os.path.join(root, file)
                try:
                    # Read the Excel file into a dataframe
                    df = pd.read_excel(file_path)
                    # Add a new column with the file name (without extension)
                    # df['file_name'] = os.path.splitext(file)[0]
                    # Append the dataframe to the list
                    dataframes.append(df)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
    
    # Concatenate all dataframes into a single dataframe
    if dataframes:
        return pd.concat(dataframes, ignore_index=True)
    else:
        return pd.DataFrame()  # Return an empty dataframe if no valid files are found

# Input the directory path
base_dir = '../2014-2022 Credit Bond Data - Financial Indicators Reduced'

# Process the Excel files
final_dataframe = process_excel_files(base_dir)
final_dataframe.to_csv("../processed_data/bond_data_prenormalized.csv", index=False)

100%|██████████| 1/1 [00:00<00:00, 20068.44it/s]
100%|██████████| 407/407 [01:22<00:00,  4.92it/s]
100%|██████████| 47/47 [00:15<00:00,  2.97it/s]
100%|██████████| 6108/6108 [38:24<00:00,  2.65it/s]  


In [1]:
import os
import pandas as pd
from tqdm import tqdm

final_dataframe = pd.read_csv("../processed_data/bond_data_prenormalized.csv")

# 复制原始 DataFrame
standardized_final_dataframe = final_dataframe.copy()

# 选择需要标准化的列
columns_to_standardize = standardized_final_dataframe.columns[1:51]

# 计算标准化 (z 分数)
standardized_final_dataframe[columns_to_standardize] = (
    standardized_final_dataframe[columns_to_standardize] - standardized_final_dataframe[columns_to_standardize].mean()
) / standardized_final_dataframe[columns_to_standardize].std()

# 查看标准化后的 DataFrame
standardized_final_dataframe.head()


Unnamed: 0,日期,中间价:美元兑人民币,Shibor:3月,制造业PMI,宏观经济景气指数:先行指数,PPI:当月同比,CPI:当月同比,GDP:不变价:当季同比,社会融资规模存量:期末同比,所属申万一级行业指数,...,同期国债利率,成交量,剩余期限,到期收益率,风险价差,所属申万行业,所属申万行业代码,发行人所属地区,发行人中文简称,债券历史评级
0,2018-12-26,0.682962,0.566665,-0.130325,0.20026,0.001434,0.154216,0.218391,-0.992112,-0.723354,...,0.085622,-0.061238,0.460654,-0.055949,-0.056527,综合,51,北京市,紫光集团,
1,2018-12-27,0.702298,0.604109,-0.130325,0.20026,0.001434,0.154216,0.218391,-0.992112,-0.740503,...,0.085622,-0.05849,0.458862,0.009992,0.009412,综合,51,北京市,紫光集团,
2,2018-12-28,0.598906,0.629072,-0.130325,0.20026,0.001434,0.154216,0.218391,-0.992112,-0.737079,...,0.085622,-0.061238,0.457069,0.00999,0.00941,综合,51,北京市,紫光集团,
3,2019-01-02,0.539713,0.541702,-0.413353,0.167965,-0.396145,-0.114548,0.17201,-1.023215,-0.739526,...,0.085622,-0.061238,0.448107,0.00998,0.009401,综合,51,北京市,紫光集团,
4,2019-01-03,0.598512,0.491777,-0.413353,0.167965,-0.396145,-0.114548,0.17201,-1.023215,-0.748315,...,0.085622,-0.061238,0.446315,0.009978,0.009399,综合,51,北京市,紫光集团,


In [4]:
# start_data = '2020-01-01'
# end_date = '2020-12-31'
start_date = '2013-01-01'
end_date = '2023-12-31'

standardized_final_dataframe["日期"] = pd.to_datetime(standardized_final_dataframe["日期"])
filtered_df = standardized_final_dataframe[(standardized_final_dataframe["日期"] >= start_date) & (standardized_final_dataframe["日期"] <= end_date)]
print(len(filtered_df))
filtered_df.head()


4774574


Unnamed: 0,日期,中间价:美元兑人民币,Shibor:3月,制造业PMI,宏观经济景气指数:先行指数,PPI:当月同比,CPI:当月同比,GDP:不变价:当季同比,社会融资规模存量:期末同比,所属申万一级行业指数,...,同期国债利率,成交量,剩余期限,到期收益率,风险价差,所属申万行业,所属申万行业代码,发行人所属地区,发行人中文简称,债券历史评级
0,2018-12-26,0.682962,0.566665,-0.130325,0.20026,0.001434,0.154216,0.218391,-0.992112,-0.723354,...,0.085622,-0.061238,0.460654,-0.055949,-0.056527,综合,51,北京市,紫光集团,
1,2018-12-27,0.702298,0.604109,-0.130325,0.20026,0.001434,0.154216,0.218391,-0.992112,-0.740503,...,0.085622,-0.05849,0.458862,0.009992,0.009412,综合,51,北京市,紫光集团,
2,2018-12-28,0.598906,0.629072,-0.130325,0.20026,0.001434,0.154216,0.218391,-0.992112,-0.737079,...,0.085622,-0.061238,0.457069,0.00999,0.00941,综合,51,北京市,紫光集团,
3,2019-01-02,0.539713,0.541702,-0.413353,0.167965,-0.396145,-0.114548,0.17201,-1.023215,-0.739526,...,0.085622,-0.061238,0.448107,0.00998,0.009401,综合,51,北京市,紫光集团,
4,2019-01-03,0.598512,0.491777,-0.413353,0.167965,-0.396145,-0.114548,0.17201,-1.023215,-0.748315,...,0.085622,-0.061238,0.446315,0.009978,0.009399,综合,51,北京市,紫光集团,


In [5]:
all_bond_company_df = pd.read_excel('../processed_data/all_bond_company_data.xlsx', sheet_name='all')
company_short_list = []
for index, row in all_bond_company_df.iterrows():
    company_short_list.append(row['发行人中文简称'])
company_short_set = set(company_short_list)

filtered_df = filtered_df[filtered_df['发行人中文简称'].isin(company_short_set)]
filtered_df.count()

日期               4765594
中间价:美元兑人民币       4765594
Shibor:3月        4765594
制造业PMI           4765594
宏观经济景气指数:先行指数    4765594
PPI:当月同比         4765594
CPI:当月同比         4765594
GDP:不变价:当季同比     4765594
社会融资规模存量:期末同比    4765594
所属申万一级行业指数       4765594
债券分类违约概率         4765594
区域违约概率           4765594
营业收入             4765594
营业成本             4765594
利润总额             4765594
流动资产             4765594
非流动资产            4765594
资产总计             4765594
流动负债             4765594
非流动负债            4765594
负债合计             4765594
股东权益合计           4765594
经营活动现金流          4765594
投资活动现金流          4765594
筹资活动现金流          4765594
总现金流             4765594
流动比率             4765594
速动比率             4765594
超速动比率            4765594
资产负债率(%)         4765594
产权比率(%)          4765594
有形净值债务率(%)       4765594
销售毛利率(%)         4765594
销售净利率(%)         4765594
资产净利率(%)         4765594
营业利润率(%)         4765594
平均净资产收益率(%)      4765594
营运周期(天)          4765594
存货周转率            4765594
应收账款周转率          4765594


In [6]:
import json

with open('../processed_data/company_sentiment.json', 'r', encoding='utf-8') as file:
    company_final_senti_dict = json.load(file)

print(len(company_final_senti_dict))
company_final_senti_dict['紫光集团']['2020-01-01']

5018


1.4771602810364173

In [7]:
def compute_sentiment(row, company_final_senti_dict=company_final_senti_dict):
    date = row["日期"]
    company_short = row["发行人中文简称"]
    idx_date = date - pd.Timedelta(days=1)
    idx_date = idx_date.strftime("%Y-%m-%d")
    return company_final_senti_dict[company_short][idx_date]

filtered_df['sentiment'] = filtered_df.apply(compute_sentiment, axis=1)



In [8]:
filtered_df.to_csv("../processed_data/bond_data_normalized_w_senti.csv", index=False)
filtered_df.head()

Unnamed: 0,日期,中间价:美元兑人民币,Shibor:3月,制造业PMI,宏观经济景气指数:先行指数,PPI:当月同比,CPI:当月同比,GDP:不变价:当季同比,社会融资规模存量:期末同比,所属申万一级行业指数,...,成交量,剩余期限,到期收益率,风险价差,所属申万行业,所属申万行业代码,发行人所属地区,发行人中文简称,债券历史评级,sentiment
0,2018-12-26,0.682962,0.566665,-0.130325,0.20026,0.001434,0.154216,0.218391,-0.992112,-0.723354,...,-0.061238,0.460654,-0.055949,-0.056527,综合,51,北京市,紫光集团,,1.563678
1,2018-12-27,0.702298,0.604109,-0.130325,0.20026,0.001434,0.154216,0.218391,-0.992112,-0.740503,...,-0.05849,0.458862,0.009992,0.009412,综合,51,北京市,紫光集团,,0.523973
2,2018-12-28,0.598906,0.629072,-0.130325,0.20026,0.001434,0.154216,0.218391,-0.992112,-0.737079,...,-0.061238,0.457069,0.00999,0.00941,综合,51,北京市,紫光集团,,0.548396
3,2019-01-02,0.539713,0.541702,-0.413353,0.167965,-0.396145,-0.114548,0.17201,-1.023215,-0.739526,...,-0.061238,0.448107,0.00998,0.009401,综合,51,北京市,紫光集团,,0.431055
4,2019-01-03,0.598512,0.491777,-0.413353,0.167965,-0.396145,-0.114548,0.17201,-1.023215,-0.748315,...,-0.061238,0.446315,0.009978,0.009399,综合,51,北京市,紫光集团,,1.551993


In [9]:
column_with_index = {index: column for index, column in enumerate(filtered_df.columns)}
print(column_with_index)

{0: '日期', 1: '中间价:美元兑人民币', 2: 'Shibor:3月', 3: '制造业PMI', 4: '宏观经济景气指数:先行指数', 5: 'PPI:当月同比', 6: 'CPI:当月同比', 7: 'GDP:不变价:当季同比', 8: '社会融资规模存量:期末同比', 9: '所属申万一级行业指数', 10: '债券分类违约概率', 11: '区域违约概率', 12: '营业收入', 13: '营业成本', 14: '利润总额', 15: '流动资产', 16: '非流动资产', 17: '资产总计', 18: '流动负债', 19: '非流动负债', 20: '负债合计', 21: '股东权益合计', 22: '经营活动现金流', 23: '投资活动现金流', 24: '筹资活动现金流', 25: '总现金流', 26: '流动比率', 27: '速动比率', 28: '超速动比率', 29: '资产负债率(%)', 30: '产权比率(%)', 31: '有形净值债务率(%)', 32: '销售毛利率(%)', 33: '销售净利率(%)', 34: '资产净利率(%)', 35: '营业利润率(%)', 36: '平均净资产收益率(%)', 37: '营运周期(天)', 38: '存货周转率', 39: '应收账款周转率', 40: '流动资产周转率', 41: '股东权益周转率', 42: '总资产周转率', 43: '授信剩余率', 44: '授信环比变动', 45: '担保授信比', 46: '同期国债利率', 47: '成交量', 48: '剩余期限', 49: '到期收益率', 50: '风险价差', 51: '所属申万行业', 52: '所属申万行业代码', 53: '发行人所属地区', 54: '发行人中文简称', 55: '债券历史评级', 56: 'sentiment'}
