In [12]:
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 

from sklearn import preprocessing
from sklearn import feature_selection
from sklearn import linear_model
from sklearn import model_selection
from sklearn import metrics

# Plot labels come from the Chinese column names in the data, so select a font
# that can draw CJK glyphs. NOTE(review): presumably requires 'Microsoft Yahei'
# to be installed on the machine running the notebook -- confirm on non-Windows hosts.
sns.set(font='Microsoft Yahei')

In [4]:
# CSV file names for every dataset split, keyed by split and then by table.
# Each split exposes the same four tables in the same order:
# base, finance, patent, report.
datasetConfig = dict(
  train=dict(
    base='base_train_sum.csv',
    finance='money_report_train_sum.csv',
    patent='patent_train_sum.csv',
    report='year_report_train_sum.csv',
  ),
  validate=dict(
    base='base_verify1.csv',
    finance='money_information_verify1.csv',
    patent='patent_information_verify1.csv',
    report='year_report_verify1.csv',
  ),
  test=dict(
    base='base_test_sum.csv',
    finance='money_report_test_sum.csv',
    patent='knowledge_test_sum.csv',
    report='year_report_test_sum.csv',
  ),
)

# Split used by the cells below; must be one of the keys of datasetConfig.
mode = 'validate'


In [5]:
def pairplot(df, flag, dots):
  """Draw a seaborn PairGrid of standardized features, coloured by `flag`.

  Parameters
  ----------
  df : pandas.DataFrame
    Feature table containing an 'ID' column; every other column is passed to
    `preprocessing.scale`, so it must be numeric.
  flag : pandas.DataFrame
    Table with an 'ID' column and a 'flag' column used as the hue.
  dots : int
    Maximum number of rows to plot. When fewer rows are available after the
    join, all of them are plotted.

  The caller's `df` is not modified.
  """
  features = df.set_index('ID')
  # Build a new frame instead of writing floats back through `.iloc[:, :]`,
  # which depends on the original column dtypes being able to hold floats.
  # `scale` was previously referenced as a bare name that is not defined in
  # this notebook; the imported `sklearn.preprocessing` module provides it.
  scaled = pd.DataFrame(
    preprocessing.scale(features.to_numpy()),
    index=features.index,
    columns=features.columns,
  )
  merged = pd.merge(scaled, flag, left_index=True, right_on='ID').drop('ID', axis=1)
  # DataFrame.sample raises when asked for more rows than exist; clamp instead.
  merged = merged.sample(min(dots, len(merged)))
  g = sns.PairGrid(merged, hue='flag')
  # NOTE(review): `distplot`, `shade` and `shade_lowest` are deprecated in
  # recent seaborn releases; kept as-is to stay compatible with the seaborn
  # version this notebook was written against -- confirm before upgrading.
  g.map_diag(sns.distplot)
  g.map_upper(plt.scatter)
  g.map_lower(sns.kdeplot, shade=True, shade_lowest=False)
  g.add_legend()



In [6]:
# Load the four tables of the selected split from the 'dataset' directory.
# The table keys are listed explicitly so that the positional unpacking below
# does not depend on the key order of a different split's config entry.
tableKeys = ('base', 'finance', 'patent', 'report')
basedf, financedf, patentdf, reportdf = (
  pd.read_csv(os.path.join('dataset', datasetConfig[mode][key])) for key in tableKeys)

# The label lives in the base table: keep it aside keyed by ID and remove it
# from the feature table.
flagdf = basedf[['ID', 'flag']]
basedf = basedf.drop('flag', axis=1)

# Untouched copies of the loaded tables.
basedf0, financedf0, patentdf0, reportdf0 = (
  basedf.copy(), financedf.copy(), patentdf.copy(), reportdf.copy())

In [7]:
# Inspect the basedf columns: count the missing values in each one.
basedf.isna().sum()

ID           0
注册时间       317
注册资本       323
行业         277
区域         285
企业类型       317
控制人类型      339
控制人ID      282
控制人持股比例    315
dtype: int64

In [8]:
# Build the model-ready base table: mean-imputed numeric features plus one-hot
# encoded categoricals (missing categories become their own 'None' level).
#
# The inputs are read from basedf0, the copy taken right after loading, and
# nothing is modified in place. The previous version filled the categorical
# columns of `basedf` in place and then rebound `basedf`, so running the cell a
# second time raised KeyError because those columns no longer existed.
numericCols = ['注册时间', '注册资本', '控制人持股比例']
categoricalCols = ['行业', '区域', '企业类型', '控制人类型']

basedfNumeric = basedf0[['ID'] + numericCols]
# Impute with the column means of the numeric features only; 'ID' is an
# identifier and is deliberately left out of the fill values.
basedfNumeric = basedfNumeric.fillna(basedfNumeric[numericCols].mean())

basedfCategorical = basedf0[categoricalCols].fillna('None')
basedf = pd.concat(
  [basedfNumeric]
  + [pd.get_dummies(basedfCategorical[col], prefix=col) for col in categoricalCols],
  axis=1)

In [9]:
# Attach the label to the encoded base features and drop incomplete rows.
# After the merge 'flag' is the last column.
testdf = pd.merge(basedf, flagdf, on='ID').drop('ID', axis=1).dropna()

# Min-max scale every feature column. The scaled values go into a new float
# frame rather than being written back with `.iloc[:, :-1] = ...`: the one-hot
# columns are not float typed, and pandas 2.1+ deprecates assigning values of
# an incompatible dtype through setitem.
featureCols = testdf.columns.drop('flag')
scaledFeatures = pd.DataFrame(
  preprocessing.minmax_scale(testdf[featureCols]),
  index=testdf.index,
  columns=featureCols,
)
# Keep 'flag' as the last column; the following cell slices with columns[:-1].
testdf = pd.concat([scaledFeatures, testdf[['flag']]], axis=1)

# chi2 test of each feature against the flag
chi2, pv = feature_selection.chi2(testdf[featureCols], testdf.flag)

In [10]:
# One row per feature with its chi2 statistic and p-value, shown with the
# largest chi2 statistic first.
sig = pd.DataFrame(dict(feature=testdf.columns[:-1], chi2=chi2, p=pv))
sig.sort_values(by='chi2', ascending=False)

Unnamed: 0,feature,chi2,p
4,行业_交通运输业,4.261965,0.038975
20,企业类型_合伙企业,3.228094,0.072385
21,企业类型_有限责任公司,2.146435,0.142901
15,区域_湖北,1.055135,0.304327
25,控制人类型_企业法人,0.944734,0.331063
10,区域_None,0.923369,0.336592
26,控制人类型_自然人,0.868,0.35151
17,区域_福建,0.685689,0.407635
8,行业_社区服务,0.597273,0.43962
7,行业_服务业,0.445198,0.504624
