# 研究意义和目的
探索现在活跃用户中的有流失风险的用户，使用机器学习的方法进行模型预测，提供给公司对用户流失行为的预判。

# 流失用户数据判定 churn
## 规则1
fixed_month > 3
version < 8.21.0

half: age_section == '90后' 
half: age_section == '80后' 

## 规则2 
fixed_month > 3
source_channel: except carpool 

- 数据处理[https://blog.csdn.net/erinapple/article/details/81174918]
- stacking[https://blog.csdn.net/song430/article/details/90232554]

In [3]:
#常用工具库
import re
import numpy as np
import pandas as pd
import matplotlib as mlp
import matplotlib.pyplot as plt
import time

#算法辅助 & 数据
import sklearn
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_squared_error


#算法（单一学习器）
from sklearn.neighbors import KNeighborsClassifier as KNNC
from sklearn.neighbors import KNeighborsRegressor as KNNR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import LogisticRegression as LogiR
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import lightgbm as lgb

#融合模型
from sklearn.ensemble import StackingClassifier

In [7]:
import codecs

features = ["user_id","cid","device_type","fix_month","gender","current_role","last_lat","last_lon","hometown","profession","source_channel","work_city_id","home_city_id","work_lon","home_lat","work_lat","home_lon","age_section","age","version","model","label"]
print(len(features))
print(features)
label_file = "data/0403_churn_2.csv"
with codecs.open(label_file) as f:
    for line in f.readlines():
        data = line.strip().split("\t")
        print(data)
        break

# create a DataFrame from the dictionary
data = pd.read_csv(label_file, delimiter="\t", names=features).drop("cid", axis=1)

# set the column data types
dtype = {
    'user_id': 'int64',
    'device_type': 'int8',
    'fix_month': 'int8',
    'gender': 'int8',
    'current_role': 'int8',
    'last_lat': 'float32',
    'last_lon': 'float32',
    'hometown': 'category',
    'profession': 'category',
    'source_channel': 'category',
    'work_city_id': 'int8',
    'home_city_id': 'int8',
    'work_lon': 'float32',
    'home_lat': 'float32',
    'work_lat': 'float32',
    'home_lon': 'float32',
    'age_section': 'category',
    'age': 'int8',
    'version': 'category',
    'model': 'category',
    'label': 'int8'
}

# apply the column data types to the DataFrame
df = data.astype(dtype)
print(len(df))

selected_rows_0 = df[df['label'] == 0]
print(len(selected_rows_0))

selected_rows_1 = df[df['label'] == 1]
print(len(selected_rows_1))



22
['user_id', 'cid', 'device_type', 'fix_month', 'gender', 'current_role', 'last_lat', 'last_lon', 'hometown', 'profession', 'source_channel', 'work_city_id', 'home_city_id', 'work_lon', 'home_lat', 'work_lat', 'home_lon', 'age_section', 'age', 'version', 'model', 'label']
['23662216', 'fe0f10d1-2481-47a9-937d-e1e6a367e526', '2', '13', '1', '1', '0', '0', '', '总务部', 'carpool', '352', '352', '108.64134', '34.37028', '34.30623', '108.77005', '90后', '24', '8.23.0', 'Xiaomi Mi 10', '0']
27801
23183
4618


In [9]:
# split the data into training and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# display the sizes of the two subsets
print("Training set size: ", len(train_df))
print("Test set size: ", len(test_df))

train_df.head(2)

feature_columns = ["user_id","device_type","fix_month","gender","current_role","last_lat","last_lon","hometown","profession","source_channel","work_city_id","home_city_id","work_lon","home_lat","work_lat","home_lon","age_section","age","version","model"]
X_train = train_df[feature_columns]
y_train = train_df['label']

X_train.head(2)

X_test = test_df[feature_columns]
y_test = test_df['label']


Training set size:  22240
Test set size:  5561


In [None]:
##### xgb

xgb_params = {'eta': 0.005, 
              'max_depth': 10, 
              'subsample': 0.8, 
              'colsample_bytree': 0.8, 
              'objective': 'reg:squarederror', 
              'eval_metric': 'rmse', 
              'silent': True, 
              'nthread': 8}#xgb的参数，可以自己改

kf = KFold(n_splits=5, shuffle=True, random_state=1412)#5折交叉验证
oof_xgb = np.zeros(len(train_df))#用于存放训练集的预测
predictions_xgb = np.zeros(len(test_df))#用于存放测试集的预测

for fold_, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
    print(f'Fold {fold_+1}:')
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    trn_data = xgb.DMatrix(X_train_fold, label=y_train_fold, enable_categorical=True)
    val_data = xgb.DMatrix(X_val_fold, label=y_val_fold, enable_categorical=True)

    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=trn_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=100, params=xgb_params)#80%用于训练过程

    oof_xgb[val_index] = clf.predict(xgb.DMatrix(X_val_fold, enable_categorical=True), ntree_limit=clf.best_ntree_limit)#预测20%的验证集
    predictions_xgb += clf.predict(xgb.DMatrix(X_test, enable_categorical=True), ntree_limit=clf.best_ntree_limit) / kf.n_splits#预测测试集，并且取平均

print("CV score train: {:<8.8f}".format(mean_squared_error(oof_xgb, y_train)))




In [None]:
print("CV score test: {:<8.8f}".format(mean_squared_error(predictions_xgb, y_test)))

In [12]:
##### lgb
param = {'num_leaves': 120,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 30,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'mse',
         "lambda_l1": 0.1,
         "verbosity": -1}#模型参数，可以修改
kf = KFold(n_splits=5, shuffle=True, random_state=2018)#5折交叉验证
oof_lgb = np.zeros(len(train_df))#存放训练集的预测结果
predictions_lgb = np.zeros(len(test_df))#存放测试集的预测结果

for fold_, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
    print(f'Fold {fold_+1}:')
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    trn_data = lgb.Dataset(X_train_fold, y_train_fold)#80%的训练集用于训练
    val_data = lgb.Dataset(X_val_fold, y_val_fold)#20%的训练集做验证集

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=200, early_stopping_rounds = 100)#训练过程
    
    result = clf.predict(X_val_fold, num_iteration=clf.best_iteration)#对验证集得到预测结果
    print(result.shape)
    oof_lgb[val_index] = result
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / kf.n_splits#对测试集5次取平均值

print("CV score train: {:<8.8f}".format(mean_squared_error(oof_lgb, y_train)))
print("CV score test: {:<8.8f}".format(mean_squared_error(predictions_lgb, y_test)))


Fold 1:
Training until validation scores don't improve for 100 rounds




[200]	training's l2: 0.0459139	valid_1's l2: 0.0713996
Early stopping, best iteration is:
[234]	training's l2: 0.0418333	valid_1's l2: 0.0707173
(4448,)
Fold 2:
Training until validation scores don't improve for 100 rounds




[200]	training's l2: 0.0459427	valid_1's l2: 0.070493
Early stopping, best iteration is:
[243]	training's l2: 0.0409975	valid_1's l2: 0.0697725
(4448,)
Fold 3:
Training until validation scores don't improve for 100 rounds




[200]	training's l2: 0.0463828	valid_1's l2: 0.0679861
Early stopping, best iteration is:
[262]	training's l2: 0.0395737	valid_1's l2: 0.0673181
(4448,)
Fold 4:




Training until validation scores don't improve for 100 rounds
[200]	training's l2: 0.0456038	valid_1's l2: 0.0709915
Early stopping, best iteration is:
[255]	training's l2: 0.0394415	valid_1's l2: 0.0699471
(4448,)
Fold 5:
Training until validation scores don't improve for 100 rounds




[200]	training's l2: 0.0462596	valid_1's l2: 0.0704751
Early stopping, best iteration is:
[293]	training's l2: 0.0367502	valid_1's l2: 0.0690607
(4448,)
CV score train: 0.06936316
CV score test: 0.06875750


In [None]:
# 将lgb和xgb的结果进行stacking（叠加）
train_stack = np.vstack([oof_lgb,oof_xgb]).transpose()#训练集2列特征
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()#测试集2列特征
#贝叶斯分类器也使用交叉验证的方法，5折，重复2次，主要是避免过拟合
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2018)
oof_stack = np.zeros(train_stack.shape[0])#存放训练集中验证集的预测结果
predictions = np.zeros(test_stack.shape[0])#存放测试集的预测结果

#enumerate() 函数用于将一个可遍历的数据对象(如列表、元组或字符串)组合为一个索引序列，同时列出数据和数据下标，一般用在 for 循环当中。
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack,y_train)):#target就是每一行样本的标签值
    print("fold {}".format(fold_))
    
    trn_data, trn_y = train_stack[trn_idx], y_train.iloc[trn_idx].values#划分训练集的80%
    val_data, val_y = train_stack[val_idx], y_train.iloc[val_idx].values#划分训练集的20%做验证集
    
    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)#贝叶斯训练过程，sklearn中的。
    
    oof_stack[val_idx] = clf_3.predict(val_data)#对验证集有一个预测，用于后面计算模型的偏差
    predictions += clf_3.predict(test_stack) / 10 #对测试集的预测，除以10是因为5折交叉验证重复了2次
    
mean_squared_error(y_train.values, oof_stack)#计算出模型在训练集上的均方误差
print("CV score: {:<8.8f}".format(mean_squared_error(y_train.values, oof_stack)))


In [4]:


label_file = "data/0403_churn_2.csv"
# create a DataFrame from the dictionary
data = pd.read_csv(label_file, delimiter="\t", names=["user_id","cid","device_type","fix_month","gender","current_role","last_lat","last_lon","hometown","profession","source_channel","work_city_id","home_city_id","work_lon","home_lat","work_lat","home_lon","age_section","age","version","model","label"]).drop("cid", axis=1)

# one-hot encode the categorical features
cat_features = ['hometown', 'profession', 'source_channel', 'age_section', 'version', 'model']
df = pd.get_dummies(data, columns=cat_features)

# set the column data types
dtype = {
    'user_id': 'int64',
    'device_type': 'int8',
    'fix_month': 'int8',
    'gender': 'int8',
    'current_role': 'int8',
    'last_lat': 'float32',
    'last_lon': 'float32',
    'work_city_id': 'int32',
    'home_city_id': 'int32',
    'work_lon': 'float32',
    'home_lat': 'float32',
    'work_lat': 'float32',
    'home_lon': 'float32',
    'age': 'int8',
    'label': 'int8'
}

# apply the column data types to the DataFrame
df = df.astype(dtype)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']
data_message = 'X_train.shape={}, X_test.shape={}'.format(X_train.shape, X_test.shape)

In [6]:
print(df.shape)
print(len(df))

(27801, 7729)
27801


In [38]:
label_file = "data/0403_churn_2.csv"
data = pd.read_csv(label_file, delimiter="\t", names=["user_id","cid","device_type","fix_month","gender","current_role","last_lat","last_lon","hometown","profession","source_channel","work_city_id","home_city_id","work_lon","home_lat","work_lat","home_lon","age_section","age","version","model","label"]).drop("cid", axis=1)

# set the column data types
dtype = {
    'user_id': 'int64',
    'device_type': 'int8',
    'fix_month': 'int8',
    'gender': 'int8',
    'current_role': 'int8',
    'last_lat': 'float32',
    'last_lon': 'float32',
    'hometown': 'category',
    'profession': 'category',
    'source_channel': 'category',
    'work_city_id': 'int8',
    'home_city_id': 'int8',
    'work_lon': 'float32',
    'home_lat': 'float32',
    'work_lat': 'float32',
    'home_lon': 'float32',
    'age_section': 'category',
    'age': 'int8',
    'version': 'category',
    'model': 'category',
    'label': 'int8'
}

# apply the column data types to the DataFrame
df = data.astype(dtype)
df['hometown'] = df['hometown'].cat.add_categories('NA')
df['hometown'] = df['hometown'].fillna('NA')

if type(df) == pd.DataFrame:
    df = df.values
print(type(df))

<class 'numpy.ndarray'>


In [None]:
import numpy as np 
import catboost as cgb

params = {
            'learning_rate': 0.05,
            'eval_metric': 'AUC',
            'depth': 8,
            'logging_level': 'Info',
            'loss_function': 'Logloss',
            'train_dir': 'model/cgb_record/',
            'thread_count': 6
        }
max_round = 1
cv_folds = 2
seed = 3
save_model_path = 'model/cgb.model'

dtrain = cgb.Pool(X_train, label=y_train)
cv_result = cgb.cv(dtrain, params, num_boost_round=max_round, nfold=cv_folds, seed=seed, logging_level='Verbose')

print(cv_result.keys())
for key, value in cv_result.items():
    print(key, value)

auc_test_mean = cv_result['test-AUC-mean']
best_round = np.argmax(auc_test_mean)
best_auc = np.max(auc_test_mean)  # 最好的 auc 值
