In [None]:
import os
import math            
import joblib 
from copy import deepcopy
import numpy as np
import pandas as pd 
import seaborn as sbn
import matplotlib as mpl
from xgboost import plot_importance
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from multiprocessing import cpu_count
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline  

# Output options: show every DataFrame row and never truncate printed NumPy arrays.
pd.set_option('display.max_rows', None)
np.set_printoptions(threshold=np.inf)

# Apply the seaborn theme first: it rewrites rcParams, so the overrides below must follow it.
sbn.set(style="whitegrid", color_codes=True)

# plt.rcParams and mpl.rcParams are the same object, so a single update covers both.
mpl.rcParams.update({
    'font.family': ['sans-serif'],
    'font.sans-serif': ['SimHei'],      # font with CJK glyphs so Chinese labels render
    'axes.unicode_minus': False,        # use an ASCII hyphen for minus signs (CJK fonts often lack U+2212)
    'figure.figsize': (12.0, 10.0),     # default figure size in inches
    'savefig.dpi': 500.,                # resolution of saved figures
    'figure.dpi': 500.,                 # resolution of rendered figures
})
# print(plt.rcParams.keys())            # uncomment to list every available rcParams key

# 读取数据

In [None]:
# Load the modelling dataset from its saved location (relative to the working directory).
df = pd.read_csv("./data/model_data.csv")

In [None]:
# Work on an independent copy so the frame held in `df` is left untouched.
model_data = deepcopy(df)
print(f'model_data.shape: {model_data.shape}')

In [None]:
# Name of the target column; every other column of `df` is treated as a feature.
label_col = 'waybill_label'

# Start from all column names, then drop the label.
# list.remove raises ValueError if the label column is missing from `df`.
analysis_cols = df.columns.tolist()
analysis_cols.remove(label_col)

In [None]:
# Sanity-check the data. XGBoost rejects inf/nan input with:
#   xgboost.core.XGBoostError: Check failed: valid: Input data contains `inf` or `nan`
# so the classifier should be told explicitly to handle missing=np.nan.
# Optional checks, left disabled:
# print(np.isnan(model_data).any())
# print(np.isinf(model_data).any())

# Row count per label value, followed by the dtype of every column.
print(
    model_data.groupby(label_col)[label_col].count(),
    model_data.dtypes,
    sep='\n',
)

# toad评分

## EDA

In [None]:
import toad

# Smoke test: keep only the first 100000 rows to check that the pipeline runs end to end.
# NOTE(review): this rebinds `model_data`, so every later cell works on the truncated frame.
model_data = model_data.iloc[:100000]

# Feature matrix and target vector taken from the (truncated) frame.
X, y = model_data[analysis_cols], model_data[label_col]

In [None]:
# Run toad's data-detection report on the features; reset_index turns its index into a regular column.
x_detect = toad.detect(X).reset_index()

# Report size first, then a two-row preview.
print(x_detect.shape, x_detect.head(2), sep='\n')

In [None]:
# x_detect.to_csv('./output_data/x_detect.csv', index=False)   

In [None]:
# Feature-quality report against the label; iv_only=False asks toad for more than just the IV column.
quality_analysis = toad.quality(model_data, target=label_col, iv_only=False)

# Preview the first 20 rows of the report.
print(quality_analysis.head(20))

In [None]:
# Persist the feature-quality report, keeping the index in the file (index=True).
# Create the output directory first: to_csv raises OSError when the parent
# directory does not exist, which breaks a run from a fresh checkout.
os.makedirs('./output_data', exist_ok=True)
quality_analysis.to_csv('./output_data/quality_analysis.csv', index=True)

## 特征筛选

In [None]:
# Feature selection with toad, using thresholds on missing rate (empty),
# information value (iv) and pairwise correlation (corr).
# return_drop must be True here: with return_drop=False, select() returns only the
# filtered DataFrame, so unpacking into two names fails (or silently binds column names).
train_selected, dropped = toad.selection.select(
    model_data,
    target=label_col,
    empty=0.8,
    iv=0.02,
    corr=0.7,
    return_drop=True,
    exclude=None,
)
print(dropped)                 # the dropped-feature report returned alongside the frame
print(train_selected.shape)    # size of the frame after selection

## 分箱

In [None]:
# The Combiner bins both numeric and categorical features; its default method is chi-square binning.
c = toad.transform.Combiner()

# Chi-square binning with at least 5% of the samples in every bin; missing values
# are merged into the best-fitting bin (i.e. empty_separate = False).
# NOTE(review): the original comment says the fit uses the feature-selected data,
# but the full `model_data` frame is what is passed here -- confirm which is intended.
c.fit(
    model_data,
    y=label_col,
    method='chi',
    min_samples=0.05,
)

In [None]:
# Show the fitted binning rule for one feature.
# export is a method: it must be called to get the {feature: bins} mapping;
# subscripting the bound method itself raises TypeError.
print('down_sign_time:\n', c.export()['down_sign_time'])

# LR模型

In [None]:
## Train/test split
# Rebuild the feature matrix and target from the current `model_data`.
# The target is addressed through `label_col` rather than a repeated string literal,
# so it stays consistent with the EDA cell if the label name changes.
X = model_data[analysis_cols]
y = model_data[label_col]

# 70% of the rows go to training; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=2023,
)
print(f"x_train.shape: {x_train.shape}")
print(f"x_test.shape: {x_test.shape}")