In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import klib as kl
import os
import warnings

os.environ['KERAS_BACKEND']='tensorflow'
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = 'Kaiti'
plt.rcParams['axes.unicode_minus'] = False
PIC_PATH = "../../models/image/image5"
DATA_PATH = '../../data'
RESULT_PATH = '../../data/summary'

In [2]:
test_data = pd.read_csv(DATA_PATH + '/long-customer-test.csv')
test_data

Unnamed: 0,CustomerId,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15647311,608,1,41,1,83807.86,1,0,1,112542.58
1,15737452,653,0,58,1,132602.88,1,1,0,5097.67
2,15577657,732,0,41,8,0.00,2,1,1,170886.17
3,15589475,591,1,39,3,0.00,3,1,0,140469.38
4,15687946,556,1,61,2,117419.35,1,1,1,94153.83
...,...,...,...,...,...,...,...,...,...,...
995,15732202,615,0,34,1,83503.11,2,1,1,73124.53
996,15735078,724,1,53,1,139687.66,2,1,1,12913.92
997,15707861,520,1,46,10,85216.61,1,1,0,117369.52
998,15594612,702,0,44,9,0.00,1,0,0,59207.41


In [3]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       1000 non-null   int64  
 1   CreditScore      1000 non-null   int64  
 2   Gender           1000 non-null   int64  
 3   Age              1000 non-null   int64  
 4   Tenure           1000 non-null   int64  
 5   Balance          1000 non-null   float64
 6   NumOfProducts    1000 non-null   int64  
 7   HasCrCard        1000 non-null   int64  
 8   IsActiveMember   1000 non-null   int64  
 9   EstimatedSalary  1000 non-null   float64
dtypes: float64(2), int64(8)
memory usage: 78.2 KB


In [4]:
test_data.rename(columns={
        'Tenure': 'Status',
        'Balance': 'AssetStage'
    }, inplace=True)
test_data

Unnamed: 0,CustomerId,CreditScore,Gender,Age,Status,AssetStage,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15647311,608,1,41,1,83807.86,1,0,1,112542.58
1,15737452,653,0,58,1,132602.88,1,1,0,5097.67
2,15577657,732,0,41,8,0.00,2,1,1,170886.17
3,15589475,591,1,39,3,0.00,3,1,0,140469.38
4,15687946,556,1,61,2,117419.35,1,1,1,94153.83
...,...,...,...,...,...,...,...,...,...,...
995,15732202,615,0,34,1,83503.11,2,1,1,73124.53
996,15735078,724,1,53,1,139687.66,2,1,1,12913.92
997,15707861,520,1,46,10,85216.61,1,1,0,117369.52
998,15594612,702,0,44,9,0.00,1,0,0,59207.41


In [5]:
bin_tenure = [0, 3, 6, 11]
labels = ['新客户', '稳定客户', '老客户']
box1 = pd.cut(test_data['Status'], bins=bin_tenure, labels=labels, include_lowest=True, right=True)
test_data['Status'] = box1

bin_balance = [0, 50000, 90000, 120000, 260000]
labels = ['低资产', '中下资产', '中上资产', '高资产']
box2 = pd.cut(test_data['AssetStage'], bins=bin_balance, labels=labels, include_lowest=True, right=True)
test_data['AssetStage'] = box2

test_data

Unnamed: 0,CustomerId,CreditScore,Gender,Age,Status,AssetStage,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15647311,608,1,41,新客户,中下资产,1,0,1,112542.58
1,15737452,653,0,58,新客户,高资产,1,1,0,5097.67
2,15577657,732,0,41,老客户,低资产,2,1,1,170886.17
3,15589475,591,1,39,新客户,低资产,3,1,0,140469.38
4,15687946,556,1,61,新客户,中上资产,1,1,1,94153.83
...,...,...,...,...,...,...,...,...,...,...
995,15732202,615,0,34,新客户,中下资产,2,1,1,73124.53
996,15735078,724,1,53,新客户,高资产,2,1,1,12913.92
997,15707861,520,1,46,老客户,中下资产,1,1,0,117369.52
998,15594612,702,0,44,老客户,低资产,1,0,0,59207.41


In [6]:
MAP = {
    'is_active': {
        'Status': {
            '新客户': 3,
            '稳定客户': 4,
            '老客户': 5,
        },
        'Asset': {
            '低资产': 6,
            '中下资产': 7,
            '中上资产': 8,
            '高资产': 9
        },
        'Card': {
            '低资产': 6,
            '中下资产': 7,
            '中上资产': 9,
            '高资产': 9
        },
    },
    'not_active': {
        'Status': {
            '新客户': 0,
            '稳定客户': 1,
            '老客户': 2,
        },
        'Asset': {
            '低资产': 0,
            '中下资产': 1,
            '中上资产': 2,
            '高资产': 3
        },
        'Card': {
            '低资产': 0,
            '中下资产': 2,
            '中上资产': 5,
            '高资产': 5
        },
    }
}

In [7]:
active_group = test_data.groupby('IsActiveMember')
test_data['IsActiveStatus'] = np.nan

In [8]:
test_data

Unnamed: 0,CustomerId,CreditScore,Gender,Age,Status,AssetStage,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,IsActiveStatus
0,15647311,608,1,41,新客户,中下资产,1,0,1,112542.58,
1,15737452,653,0,58,新客户,高资产,1,1,0,5097.67,
2,15577657,732,0,41,老客户,低资产,2,1,1,170886.17,
3,15589475,591,1,39,新客户,低资产,3,1,0,140469.38,
4,15687946,556,1,61,新客户,中上资产,1,1,1,94153.83,
...,...,...,...,...,...,...,...,...,...,...,...
995,15732202,615,0,34,新客户,中下资产,2,1,1,73124.53,
996,15735078,724,1,53,新客户,高资产,2,1,1,12913.92,
997,15707861,520,1,46,老客户,中下资产,1,1,0,117369.52,
998,15594612,702,0,44,老客户,低资产,1,0,0,59207.41,


In [9]:
active = test_data[test_data['IsActiveMember'] == 1]
# test_data['IsActiveStatus'].loc[active.index] = active['Status'].map(MAP['is_active']['Status'])
#
# test_data
active

Unnamed: 0,CustomerId,CreditScore,Gender,Age,Status,AssetStage,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,IsActiveStatus
0,15647311,608,1,41,新客户,中下资产,1,0,1,112542.58,
2,15577657,732,0,41,老客户,低资产,2,1,1,170886.17,
4,15687946,556,1,61,新客户,中上资产,1,1,1,94153.83,
7,15701164,506,1,34,稳定客户,中上资产,1,1,1,159235.29,
8,15738721,773,0,41,老客户,中上资产,1,0,1,64595.25,
...,...,...,...,...,...,...,...,...,...,...,...
990,15726179,757,1,43,稳定客户,高资产,2,1,1,3497.43,
994,15784042,624,0,55,老客户,中上资产,1,1,1,95022.02,
995,15732202,615,0,34,新客户,中下资产,2,1,1,73124.53,
996,15735078,724,1,53,新客户,高资产,2,1,1,12913.92,


In [15]:
active['IsActiveStatus'] = active['Status'].map(MAP['is_active']['Status'])
active['IsActiveAssetStage'] = active['AssetStage'].map(MAP['is_active']['Asset'])
active['CrCardAssetStage'] = active['AssetStage'].map(MAP['not_active']['Card'])
active

Unnamed: 0,CustomerId,CreditScore,Gender,Age,Status,AssetStage,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,IsActiveStatus,IsActiveAssetStage,CrCardAssetStage
0,15647311,608,1,41,新客户,中下资产,1,0,1,112542.58,3,7,2
2,15577657,732,0,41,老客户,低资产,2,1,1,170886.17,5,6,0
4,15687946,556,1,61,新客户,中上资产,1,1,1,94153.83,3,8,5
7,15701164,506,1,34,稳定客户,中上资产,1,1,1,159235.29,4,8,5
8,15738721,773,0,41,老客户,中上资产,1,0,1,64595.25,5,8,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,15726179,757,1,43,稳定客户,高资产,2,1,1,3497.43,4,9,5
994,15784042,624,0,55,老客户,中上资产,1,1,1,95022.02,5,8,5
995,15732202,615,0,34,新客户,中下资产,2,1,1,73124.53,3,7,2
996,15735078,724,1,53,新客户,高资产,2,1,1,12913.92,3,9,5


In [18]:
not_active = test_data[test_data['IsActiveMember'] == 0]
not_active['IsActiveStatus'] = not_active['Status'].map(MAP['is_active']['Status'])
not_active['IsActiveAssetStage'] = not_active['AssetStage'].map(MAP['not_active']['Asset'])
not_active['CrCardAssetStage'] = not_active['AssetStage'].map(MAP['not_active']['Card'])
# test_data['IsActiveStatus'].loc[active.index] = active['Status'].map(MAP['is_active']['Status'])
#
# test_data
active_ = pd.concat([active, not_active])
active_

Unnamed: 0,CustomerId,CreditScore,Gender,Age,Status,AssetStage,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,IsActiveStatus,IsActiveAssetStage,CrCardAssetStage
0,15647311,608,1,41,新客户,中下资产,1,0,1,112542.58,3,7,2
2,15577657,732,0,41,老客户,低资产,2,1,1,170886.17,5,6,0
4,15687946,556,1,61,新客户,中上资产,1,1,1,94153.83,3,8,5
7,15701164,506,1,34,稳定客户,中上资产,1,1,1,159235.29,4,8,5
8,15738721,773,0,41,老客户,中上资产,1,0,1,64595.25,5,8,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,15632446,667,0,24,稳定客户,低资产,2,0,0,180329.83,4,0,0
992,15669414,486,0,62,老客户,中上资产,2,1,0,168034.83,5,2,5
993,15746569,589,0,38,稳定客户,低资产,1,1,0,95483.48,4,0,0
997,15707861,520,1,46,老客户,中下资产,1,1,0,117369.52,5,1,2


In [19]:
active_.sort_index(inplace=True)
active_

Unnamed: 0,CustomerId,CreditScore,Gender,Age,Status,AssetStage,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,IsActiveStatus,IsActiveAssetStage,CrCardAssetStage
0,15647311,608,1,41,新客户,中下资产,1,0,1,112542.58,3,7,2
1,15737452,653,0,58,新客户,高资产,1,1,0,5097.67,3,3,5
2,15577657,732,0,41,老客户,低资产,2,1,1,170886.17,5,6,0
3,15589475,591,1,39,新客户,低资产,3,1,0,140469.38,3,0,0
4,15687946,556,1,61,新客户,中上资产,1,1,1,94153.83,3,8,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,15732202,615,0,34,新客户,中下资产,2,1,1,73124.53,3,7,2
996,15735078,724,1,53,新客户,高资产,2,1,1,12913.92,3,9,5
997,15707861,520,1,46,老客户,中下资产,1,1,0,117369.52,5,1,2
998,15594612,702,0,44,老客户,低资产,1,0,0,59207.41,5,0,0


In [25]:
import joblib

model = joblib.load('./bestmodel.model')
data = active_[['Age', 'AssetStage', 'NumOfProducts',  'EstimatedSalary', 'IsActiveStatus',
                'CrCardAssetStage']]

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Age               1000 non-null   int64   
 1   AssetStage        1000 non-null   category
 2   NumOfProducts     1000 non-null   int64   
 3   EstimatedSalary   1000 non-null   float64 
 4   IsActiveStatus    1000 non-null   category
 5   CrCardAssetStage  1000 non-null   int64   
dtypes: category(2), float64(1), int64(3)
memory usage: 41.3 KB


In [27]:
model.predict(data)


ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`. Invalid columns:AssetStage, IsActiveStatus