In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import klib as kl
import missingno as mns
import os
import warnings
import tqdm
import numba


# Runtime environment switches; they are set before any deep-learning backend is imported.
os.environ['KERAS_BACKEND']='tensorflow'
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
# NOTE(review): this silences every warning for the whole notebook, including
# pandas SettingWithCopy / deprecation warnings raised by the cells below.
warnings.filterwarnings('ignore')
# Use a CJK-capable font so Chinese column names render in figures, and keep
# the minus sign printable with that font.
plt.rcParams['font.sans-serif'] = ['Kaiti']
plt.rcParams['axes.unicode_minus'] = False
# Project-relative locations for figures, input data, result files and models.
PIC_PATH = "../../models/image/image1"
DATA_PATH = '../../data'
RESULT_PATH = '../../data/summary/'
MODEL_PATH = '../../models/model1'

In [None]:
import pathlib2 as pl2
import shutil

def creat_dir():
    """Prepare the output directories used by the notebook.

    The figure directory ``PIC_PATH`` is wiped and recreated so every run
    starts with an empty figure folder. ``RESULT_PATH`` and ``MODEL_PATH``
    are created when missing and left untouched otherwise.
    """
    pic_path = pl2.Path(PIC_PATH)
    if os.path.exists(PIC_PATH):
        shutil.rmtree(PIC_PATH)
    pic_path.mkdir(parents=True, exist_ok=True)
    # os.makedirs also creates missing parent directories, whereas os.mkdir
    # raises FileNotFoundError when the parent does not exist yet.
    for directory in (RESULT_PATH, MODEL_PATH):
        os.makedirs(directory, exist_ok=True)

creat_dir()

In [None]:
figure_count = 0

def create_figure(figure_name, dpi=800):
    """Save the current matplotlib figure under ``PIC_PATH`` with a running number.

    :param figure_name: suffix used in the file name
    :param dpi: resolution passed to ``plt.savefig``
    """
    global figure_count
    figure_count = figure_count + 1
    file_suffix = f'/figure{figure_count}_{figure_name}.png'
    plt.savefig(PIC_PATH + file_suffix, dpi=dpi)

In [None]:
from joblib import dump, load

def save_model(model, model_name: str) -> None:
    """Persist ``model`` with joblib at ``MODEL_PATH + model_name``.

    ``model_name`` is concatenated verbatim, so callers pass it with a
    leading path separator (e.g. ``'/best_model.model'``).
    """
    destination = MODEL_PATH + model_name
    dump(model, destination)

def load_model(model_name: str):
    """Load a joblib-serialised object from ``MODEL_PATH + model_name``.

    ``model_name`` is concatenated verbatim, so callers pass it with a
    leading path separator.
    """
    source = MODEL_PATH + model_name
    return load(source)

In [None]:
def read_excel(base_path: str=DATA_PATH, file_name: str=None, index_col: int=0):
    """Read an Excel workbook located at ``base_path + file_name``.

    :param base_path: directory prefix, concatenated verbatim with ``file_name``
    :param file_name: file name including its leading path separator
    :param index_col: column used as the DataFrame index
    :return: the loaded DataFrame
    """
    workbook_path = base_path + file_name
    return pd.read_excel(workbook_path, index_col=index_col)

def save_excel(data: pd.DataFrame, base_path=RESULT_PATH, file_name: str=None, index=True):
    """Write ``data`` to ``base_path + file_name``.

    Despite the function name, the file is written with ``DataFrame.to_csv``.

    :param data: frame to store
    :param base_path: directory prefix, concatenated verbatim with ``file_name``
    :param file_name: target file name
    :param index: whether the index is written as well
    """
    target_path = base_path + file_name
    data.to_csv(target_path, index=index)

In [None]:
sound = read_excel(file_name='/附件1语音业务用户满意度数据.xlsx')
test = read_excel(file_name='/附件3语音业务用户满意度预测数据.xlsx')

sound

In [None]:
# Columns that exist only in the training sheet; the first four of them are
# skipped by the slice and the two complaint columns are kept for later use.
col = [name for name in sound.columns if name not in test.columns][4:]
for kept_column in ['家宽投诉', '资费投诉']:
    col.remove(kept_column)
sound.drop(columns=col, inplace=True)
col

In [None]:
col = [i for i in test.columns if i not in sound.columns][1:]
test.drop(col, axis=1, inplace=True)
col

In [None]:
kl.missingval_plot(sound)
create_figure('sound_missing_plot', dpi=800)

In [None]:
mns.heatmap(sound)
create_figure('sound_missing_heatmap')

In [None]:
sound_columns = sound.columns
sound_columns

In [None]:
sound_score = sound_columns[: 4]
sound_dummies = sound_columns[5: 22].drop('用户描述').drop('用户描述.1')
sound_dummies

In [None]:
place = sound_dummies[:7]
question = sound_dummies[8:-1]
question

In [None]:
sound[['家宽投诉', '资费投诉']].sum(axis=1)

In [None]:
def complain(data: pd.DataFrame):
    """Collapse the two complaint columns into a single capped indicator.

    The row-wise sum of ``家宽投诉`` and ``资费投诉`` is capped at 1 and stored
    in ``是否投诉``; the two source columns are then removed in place.

    :param data: frame containing both complaint columns (modified in place)
    :return: the same frame object
    """
    def _cap_at_one(total):
        return min(1, total)

    complaint_total = data[['家宽投诉', '资费投诉']].sum(axis=1)
    data['是否投诉'] = complaint_total.apply(_cap_at_one)
    data.drop(columns=['家宽投诉', '资费投诉'], inplace=True)
    return data

complain(sound)

In [None]:
def encoder(data):
    """Map a cell to a binary flag: -1 stays as is, anything else becomes 1."""
    if data == -1:
        return data
    return 1

sound[sound_dummies] = sound[sound_dummies].fillna(-1).applymap(encoder)
sound

In [None]:
import toad

missing_rate: pd.DataFrame = toad.detect(sound)[['missing']].applymap(lambda x: x[:-1]).astype(np.float64)
missing_rate = missing_rate.query('missing > 0')
missing_rate

In [None]:
def drop_treat(data: pd.DataFrame):
    """Remove self-contradictory rows from ``data`` in place.

    A row is dropped when its ``用户描述.1`` text equals "没有" while the
    ``其他，请注明.1`` flag is 1.

    :param data: frame to clean (modified in place)
    :return: the rows that were removed
    """
    contradictory = data.query(
        '`用户描述.1` == "没有"'
    )
    flagged = contradictory['其他，请注明.1'] == 1
    contradictory = contradictory.loc[flagged]
    data.drop(index=contradictory.index, inplace=True)
    return contradictory

sound_bad_data = drop_treat(sound)

In [None]:
sound['其他，请注明'].value_counts()

In [None]:
sound_bad_data = sound[sound['其他，请注明'] == 1]
sound_bad_data

In [None]:
def replace_place(data: pd.DataFrame, bad_data:pd.DataFrame, col_name: str) -> pd.DataFrame:
    """
    Correct the free-text locations that users specially noted as having poor signal.

    Each keyword group below selects rows of ``bad_data`` whose ``用户描述`` text
    matches, relabels ``col_name`` on that selection and writes the selection back
    into ``data`` with ``DataFrame.update`` (aligned on index and columns).
    The updates are applied in order, so a row matching several groups keeps the
    label of the last matching group.

    :param data: original data (updated in place)
    :param bad_data: rows that need to be corrected
    :param col_name: column to be corrected
    :return: the original data after replacement
    """

    # Labels that count as "already normalised" for the final catch-all step.
    replace_value = ['地下', '电梯', '<NA>', '道路', '山区', '医院', '信号', np.nan, '无']

    # Underground / subway / garage.
    replace1 = bad_data.query('用户描述.str.contains("地下|地铁|地库|车库")')
    replace1[col_name] = '地下'

    # Elevator.
    replace2 = bad_data.query('用户描述.str.contains("电梯")')
    replace2[col_name] = '电梯'

    # Home / residential area: clear the text and set the existing 居民小区 flag instead.
    replace3 = bad_data.query('用户描述.str.contains("家|小区|公寓|住|屋")')
    replace3[col_name] = '<NA>'
    replace3['居民小区'] = 1

    # Village / township: clear the text and set the existing 农村 flag instead.
    replace4 = bad_data.query('用户描述.str.contains("村|乡")')
    replace4[col_name] = '<NA>'
    replace4['农村'] = 1

    # Roads and vehicles.
    replace5 = bad_data.query('用户描述.str.contains("路|环|途|车|街")')
    replace5[col_name] = '道路'

    # Mountain area.
    replace6 = bad_data.query('用户描述.str.contains("山")')
    replace6[col_name] = '山区'

    # Hospital.
    replace7 = bad_data.query('用户描述.str.contains("医院")')
    replace7[col_name] = '医院'

    # Generic signal / network complaints.
    replace8 = bad_data.query('用户描述.str.contains("信号|网络|中断|接|打")')
    replace8[col_name] = '信号'

    data.update(replace1)
    data.update(replace2)
    data.update(replace3)
    data.update(replace4)
    data.update(replace5)
    data.update(replace6)
    data.update(replace7)
    data.update(replace8)

    # NOTE(review): this selection is evaluated on the unmodified ``bad_data``, not on
    # the relabelled ``data``. Every row whose original text is not literally one of
    # ``replace_value`` is set to '其他' here, which also overwrites labels assigned by
    # the updates above — confirm this is intended.
    replace9 = bad_data.query('not 用户描述 in @replace_value')
    replace9[col_name] = '其他'
    data.update(replace9)
    return data

sound_new = replace_place(sound, sound_bad_data, '用户描述')
sound_new

In [None]:
"""
独热编码及后续相关处理
"""

def get_dummies(data: pd.DataFrame, process_col='用户描述'):
    """One-hot encode ``process_col`` after normalising its missing markers.

    Missing values are filled with the placeholder ``'<NA>'``, the column is
    cast to ``str`` and every occurrence of ``'无'`` is replaced by the same
    placeholder. The normalised column is written back to ``data``.

    :param data: frame holding ``process_col`` (that column is overwritten)
    :param process_col: name of the column to encode
    :return: a new frame in which ``process_col`` is replaced by dummy columns
        named ``f'{process_col}_{value}'``
    """
    # Assign the result instead of calling fillna(inplace=True) on the selected
    # column: the chained in-place form operates on a temporary object and is
    # not guaranteed to update ``data`` (it does not under Copy-on-Write).
    normalised = data[process_col].fillna('<NA>').astype(str)
    # '无' is a plain literal, so use a non-regex replacement explicitly.
    data[process_col] = normalised.str.replace('无', '<NA>', regex=False)
    return pd.get_dummies(data, columns=[process_col])


def dummies_process(data: pd.DataFrame, process_col='用户描述', about_col='其他，请注明', save_name=None) -> pd.DataFrame:
    """One-hot encode ``process_col`` and tidy up the resulting columns.

    Steps: encode via ``get_dummies``; drop ``about_col``; drop dummy columns
    whose name contains ``'NA'``; recode the remaining dummy columns to 1 / -1;
    rename every column containing ``'_'`` to the part between its first and
    second underscore; suffix repeated names with ``'.1'``; optionally save.

    :param data: input frame
    :param process_col: column to one-hot encode
    :param about_col: companion flag column removed after encoding
    :param save_name: when truthy, file name passed to ``save_excel``
    :return: the processed frame
    """
    def _strip_prefix(name):
        pieces = name.split('_')
        return pieces[1] if len(pieces) > 1 else name

    encoded = get_dummies(data, process_col)
    encoded.drop([about_col], axis=1, inplace=True)

    # Any column whose name contains an underscore is treated as a dummy column.
    underscored = [name for name in encoded.columns if '_' in name]
    placeholder_cols = [name for name in underscored if 'NA' in name]
    indicator_cols = [name for name in underscored if name not in placeholder_cols]

    encoded = encoded.drop(placeholder_cols, axis=1)
    encoded[indicator_cols] = encoded[indicator_cols].applymap(lambda cell: cell if cell == 1 else -1)

    renamed = pd.Series(encoded.columns).apply(_strip_prefix)
    repeated = renamed.duplicated()
    if repeated.any():
        # Second and later occurrences of a name get the '.1' suffix.
        positions = renamed[repeated].index
        renamed[positions] = renamed[positions] + '.1'
    encoded.columns = renamed

    if save_name:
        save_excel(encoded, RESULT_PATH, save_name, True)
    return encoded

In [None]:
dummies = dummies_process(sound_new)
dummies

In [None]:
sound_bad_data = sound[sound['其他，请注明.1'] == 1]

def replace_question(data: pd.DataFrame, bad_data: pd.DataFrame, col: str='用户描述.1'):
    """
    Correct the free-text problem descriptions stored in ``用户描述.1``.

    Rows of ``bad_data`` are matched against keyword groups; for each match the text
    column ``col`` is cleared to '<NA>' and the corresponding predefined problem flag
    (taken by position from the module-level ``question`` index) is set to 1. Each
    selection is written back into ``data`` with ``DataFrame.update``.

    :param data: frame to correct (updated in place)
    :param bad_data: rows that need to be corrected
    :param col: text column to overwrite
    :return: the updated ``data``
    """

    # Values regarded as already normalised: the predefined problem names plus '<NA>'.
    replace_value = question.copy().tolist()
    replace_value.append('<NA>')

    # Signal / network-generation / base-station keywords -> first problem flag.
    replace1 = bad_data.query('`用户描述.1`.str.contains("信号|2G|3G|4G|5G|基站")')
    replace1[col] = '<NA>'
    replace1[question[0]] = 1
    # Calling / dialling / connecting keywords -> second problem flag.
    replace2 = bad_data.query('`用户描述.1`.str.contains("通话|拨打|接通")')
    replace2[col] = '<NA>'
    replace2[question[1]] = 1
    # Disconnection keyword -> third problem flag.
    replace3 = bad_data.query('`用户描述.1`.str.contains("断")')
    replace3[col] = '<NA>'
    replace3[question[2]] = 1
    # Noise / clarity / intermittent keywords -> fourth problem flag.
    replace4 = bad_data.query('`用户描述.1`.str.contains("杂|清|断续")')
    replace4[col] = '<NA>'
    replace4[question[3]] = 1

    data.update(replace1)
    data.update(replace2)
    data.update(replace3)
    data.update(replace4)

    # NOTE(review): this selection is evaluated on the unmodified ``bad_data``, so rows
    # cleared to '<NA>' above are set to '其他' again unless their original text is
    # literally one of ``replace_value`` — confirm this is intended.
    replace9 = bad_data.query('not `用户描述.1` in @replace_value')
    replace9[col] = '其他'
    data.update(replace9)
    return data

dummies_new = replace_question(dummies, sound_bad_data)
dummies_new = dummies_process(dummies_new, '用户描述.1', '其他，请注明.1', 'one_hot.csv')
dummies_new

In [None]:
def boolean_replace(x):
    """Translate the yes/no labels '是' / '否' to 1 / -1; return other values unchanged."""
    if x == '是':
        return 1
    if x == '否':
        return -1
    return x

def missing_clean(data: pd.DataFrame):
    """Normalise a few categorical columns and recode yes/no cells.

    ``4\\5G用户`` is reduced to its first character, missing ``是否关怀用户``
    values become '否', and every cell is passed through ``boolean_replace``.
    The two column assignments modify the input frame; the returned frame is a
    new object produced by ``applymap``.

    :param data: frame to clean
    :return: cleaned frame
    """
    def _first_char(value):
        return value[0]

    # Zero-filling of the redirect count / dwell-time columns is intentionally disabled.
    data['4\\5G用户'] = data['4\\5G用户'].apply(_first_char)
    data['是否关怀用户'] = data['是否关怀用户'].fillna('否')
    recoded = data.applymap(boolean_replace)
    return recoded


In [None]:
def sample_clean(data: pd.DataFrame):
    """Run ``missing_clean`` and drop every row that still has a missing value.

    :param data: frame to clean
    :return: cleaned frame without missing values
    """
    cleaned = missing_clean(data)
    # The original note says this removes 7 samples from the training data.
    cleaned.dropna(axis=0, inplace=True)
    # One-hot encoding of 语音方式 is intentionally disabled.
    return cleaned

dummies_new = sample_clean(dummies_new)
dummies_new

In [None]:
shell = dummies_new['终端品牌'].unique()
shell_type = dummies_new['终端品牌类型'].unique()

In [None]:
def data_clean(data: pd.DataFrame):
    """Bucket terminal brands and brand types into coarse categories, in place.

    Brands and brand types not present in the module-level ``shell`` /
    ``shell_type`` arrays become '其他'. The brand type is then shortened to at
    most its first two space-separated words, cut at the first '-' and '_',
    and finally classified by ``check``.

    :param data: frame with ``终端品牌`` and ``终端品牌类型`` (modified in place)
    :return: array of the distinct brand-type categories after cleaning
    """
    def _shorten(raw):
        text = ' '.join(raw.split(' ')[:2]) if isinstance(raw, str) else '其他'
        return text.split('-')[0].split('_')[0]

    data['终端品牌'] = data['终端品牌'].apply(
        lambda brand: brand if brand in shell else '其他'
    )
    data['终端品牌类型'] = data['终端品牌类型'].apply(
        lambda model: model if model in shell_type else '其他'
    )
    shortened = data['终端品牌类型'].apply(_shorten)
    data['终端品牌类型'] = shortened.apply(check)

    return data['终端品牌类型'].unique()

def check(string: str):
    """Classify a terminal model string into a coarse category label.

    Rules are evaluated in order and the first match wins:

    * contains a space              -> the text before the first space
    * starts with a digit           -> 'Num'
    * starts with A + 4 digits      -> 'A_Num' (case-insensitive)
    * contains no digit at all      -> 'word'
    * starts with HM                -> 'HM' (case-insensitive)
    * starts with RMX + digits      -> 'type_rmx'
    * starts with M, ends A-Z       -> 'type_m'
    * starts with P, ends with 0    -> 'type_p'
    * starts with V + digits+letter -> 'type_v'
    * letter ... digits             -> 'num_object'
    * anything else                 -> '其他'

    :param string: model name
    :return: category label
    """
    import re
    if ' ' in string:
        return string.split(' ')[0]
    if re.match(r'^\d', string):
        return 'Num'
    re_str = r'A\d{4}'
    if re.match(re_str, string, re.I):
        return 'A_Num'
    # fullmatch: only strings made up entirely of non-digits are plain words.
    # A prefix match on \D+ accepts every string that starts with a non-digit,
    # which made all of the rules below unreachable.
    if re.fullmatch(r'\D+', string, re.I):
        return 'word'
    if re.match(r'^HM', string, re.I):
        return 'HM'
    if re.match(r'^RMX\d+', string):
        return 'type_rmx'
    if re.match(r'^M.+[A-Z]$', string):
        return 'type_m'
    if re.match(r'^P.+0$', string):
        return 'type_p'
    if re.match(r'V\d+[A-Za-z]', string):
        return 'type_v'
    if re.match(r'^[A-Za-z].+\d+', string):
        return 'num_object'
    return '其他'

data_clean(dummies_new)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the terminal brand as integer codes and keep the fitted encoder so the
# prediction data can be encoded the same way later.
ll1 = LabelEncoder()
ll1.fit(dummies_new['终端品牌'].astype(str))
dummies_new['终端品牌'] = ll1.transform(dummies_new['终端品牌'].astype(str))
save_model(ll1, '/LabelEncoder_1.model')

# Same treatment for the bucketed terminal brand type.
ll2 = LabelEncoder()
ll2.fit(dummies_new['终端品牌类型'].astype(str))
dummies_new['终端品牌类型'] = ll2.transform(dummies_new['终端品牌类型'].astype(str))
save_model(ll2, '/LabelEncoder_2.model')

def star_map(data: pd.DataFrame):
    """Replace the customer star-level labels by ordinal integers, in place.

    The levels run from '未评级' (-1) up to '钻石卡' (7); a label that is not
    in the table maps to ``None``.

    :param data: frame with a ``客户星级标识`` column (modified in place)
    :return: the same frame object
    """
    ordered_levels = ['未评级', '准星', '一星', '二星', '三星', '银卡', '金卡', '白金卡', '钻石卡']
    rank_of = {label: rank for rank, label in enumerate(ordered_levels, start=-1)}
    data['客户星级标识'] = data['客户星级标识'].apply(rank_of.get)
    return data

dummies = star_map(dummies_new)
dummies

In [None]:
float_feature = ['套外流量（MB）', '套外流量费（元）', '外省语音占比', '语音通话-时长（分钟）', '省际漫游-时长（分钟）',
                 '当月ARPU', '当月MOU', '前3月ARPU', '前3月MOU', '外省流量占比', 'GPRS总流量（KB）', 'GPRS-国内漫游-流量（KB）',]
class_feature = dummies.columns.drop(float_feature)

dummies_n = dummies.copy()
dummies_n[float_feature] = dummies[float_feature].astype(np.float64)
dummies_n[class_feature] = dummies[class_feature].astype(np.int64)

dummies_n

In [None]:
place_ = place.tolist()
place_.extend(['其他', '医院', '地下', '山区', '电梯', '道路'])
question_ = question.tolist()
question_.append('其他.1')
question_

In [None]:
def get_count(data:pd.DataFrame, drop: list, new_col: str=None):
    """Count, per row, how many of the ``drop`` columns equal 1, then drop them.

    :param data: frame to process (the ``drop`` columns are removed in place)
    :param drop: names of the flag columns to count and remove
    :param new_col: when given, the counts are stored in this column
    :return: the count Series when ``new_col`` is None, otherwise ``data``
    """
    is_set = data[drop].eq(1).astype(int)
    series = is_set.sum(axis=1)
    data.drop(columns=drop, inplace=True)
    if new_col is not None:
        data[new_col] = series
        return data
    return series


dummies_n['出现问题地点数'] = get_count(dummies_n, place_)
dummies_n['出现问题数'] = get_count(dummies_n, question_)
dummies_n

In [None]:
target = dummies_n.iloc[:, :4]
target

In [None]:
train = dummies_n.iloc[:, 4:]
train

In [None]:
train.dtypes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import  classification_report, f1_score, precision_score, recall_score

In [None]:
from imblearn.over_sampling import SMOTE

def split_data(train_data, predict):
    """Split features and targets 70 / 30 with a fixed random seed.

    :param train_data: feature frame
    :param predict: target frame
    :return: ``X_train, X_test, y_train, y_test`` as returned by ``train_test_split``
    """
    features, targets = train_data, predict
    return train_test_split(features, targets, random_state=42, test_size=.3)

def train_model(train_data, predict):
    """Benchmark several classifiers on every target column.

    Each model is fitted once per target column on the training split and
    evaluated on the held-out split. The per-column classification reports are
    written to ``RESULT_PATH + <ModelClass>.txt``.

    :param train_data: feature frame
    :param predict: frame whose columns are the individual targets
    :return: DataFrame with one column per model; the four rows are the means
        over all targets of accuracy, micro F1, weighted precision and
        weighted recall (in that order)
    """
    # Local import: accuracy_score is not among the metrics imported before this cell.
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = split_data(train_data, predict)
    models = [DecisionTreeClassifier(), KNeighborsClassifier(), RandomForestClassifier(),
              GradientBoostingClassifier(), XGBClassifier(), LGBMClassifier()]
    scores = pd.DataFrame()
    y_test = y_test.astype(np.int64)

    for model in models:
        model_name = model.__class__.__name__
        print(model_name)
        reports = ''
        acc = []
        f1 = []
        precision = []
        recall = []
        for col in y_test.columns:
            print(col)
            try:
                model.fit(X_train, y_train[col])
                pre = model.predict(X_test)
            except ValueError:
                # Fallback for estimators that reject the raw label values:
                # train on encoded labels and decode the predictions again.
                ll = LabelEncoder().fit(y_train[col])
                model.fit(X_train, ll.transform(y_train[col]))
                pre = ll.inverse_transform(model.predict(X_test))
            # Score the decoded predictions. model.score() would compare the
            # model's encoded output with the raw labels in the fallback case.
            acc.append(accuracy_score(y_test[col], pre))
            f1.append(f1_score(y_test[col], pre, average='micro'))
            precision.append(precision_score(y_test[col], pre, average='weighted'))
            recall.append(recall_score(y_test[col], pre, average='weighted'))
            reports += classification_report(y_test[col], pre)
        with open(RESULT_PATH + model_name + '.txt', 'w') as f:
            f.write(reports)

        scores[f'{model_name}'] = [np.mean(np.array(acc)), np.mean(f1), np.mean(precision), np.mean(recall)]

    return scores

score = train_model(train, target)

In [None]:
score

In [None]:
for i in target.columns:
    plt.figure(figsize=(8, 8))
    counts = target[i].value_counts()
    plt.pie(counts, labels=counts.index, autopct="%1.1f%%")
    create_figure(f'{i}_describe')

In [None]:
from dataprep.eda.create_report import create_report

create_report(dummies_n).show_browser()

In [None]:
from autoviz.AutoViz_Class import AutoViz_Class

av = AutoViz_Class()
av.AutoViz(filename=None, depVar="语音通话整体满意度", dfte=dummies_n, verbose=0, chart_format='png', save_plot_dir=PIC_PATH)

In [None]:
# Kendall correlation heatmap showing only strong positive correlations
# (>= 0.7) in the lower triangle.
corr = dummies_n.corr(method='kendall')
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.sans-serif'] = 'Kaiti'
plt.figure(figsize=(18, 18))
# The builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
mask = np.zeros(corr.shape, dtype=bool)
# Hide the upper triangle including the diagonal.
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr[corr>=.7],annot=True,mask=mask,cbar=True, linewidths=.5)

In [None]:
kl.corr_plot(dummies_n, target='语音通话整体满意度', method='kendall')

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SVMSMOTE
from sklearn.impute import KNNImputer

def get_data(data: pd.DataFrame, random_state=None):
    """Draw a class-stratified random sample by overall satisfaction score.

    For every score 1..10 the rows with that score are sampled without
    replacement. The requested sample size depends on the class size:
    fewer than 60 rows -> 30, 60 to 119 rows -> 60, otherwise 100. It is
    capped at the number of rows available, and empty classes are skipped.

    :param data: frame with a ``语音通话整体满意度`` column
    :param random_state: optional seed forwarded to ``DataFrame.sample``
    :return: the concatenated per-class samples (empty frame when no score matches)
    """
    pieces = []
    for i in range(1, 11):
        group = data[data['语音通话整体满意度'] == i]
        size = len(group)
        if size == 0:
            continue
        if size < 60:
            sample = 30
        elif size < 120:
            sample = 60
        else:
            sample = 100
        # Sample from the class subset itself. Drawing from the whole frame
        # would ignore the class and reproduce the original imbalance.
        pieces.append(group.sample(min(sample, size), random_state=random_state))
    if not pieces:
        return data.iloc[0:0]
    return pd.concat(pieces, axis=0)

def oversampling(data: pd.DataFrame):
    """Oversample ``data`` with SVMSMOTE using the overall satisfaction score as class.

    The first four columns are treated as targets and the remaining columns as
    features. Features are resampled against ``语音通话整体满意度``; the other three
    target columns of the added rows are filled in with a KNN imputer.

    :param data: frame whose first four columns are the targets
    :return: frame with the four (imputed, int64) target columns followed by the
        resampled features
    """
    value_counts = data.iloc[:, 0].value_counts()
    # NOTE(review): `_index` is never used below, so `value_counts` is effectively unused too.
    _index = value_counts[value_counts<value_counts.max()].index.tolist()
    sm = SVMSMOTE()
    # print(data.iloc[:, 4:].shape, data[['语音通话整体满意度']].iloc[:, 0].shape)
    X, y = sm.fit_resample(data.iloc[:, 4:], data[['语音通话整体满意度']].values.reshape((-1, 1)))
    frame = data.iloc[:, :4]
    # Keep only the labels beyond the original row count, i.e. those of the added rows.
    # Assumes fit_resample returns the original rows first, followed by the synthetic
    # ones — TODO confirm.
    y = pd.DataFrame(y[frame.shape[0]:], columns=[data.columns[0]])
    # Appended rows carry only the first target column; the other three are NaN here.
    frame = pd.concat([frame, y], axis=0)
    # Impute the missing targets from the 4 nearest rows and truncate to integers.
    # The rebuilt frame gets a fresh default index.
    frame = pd.DataFrame(KNNImputer(n_neighbors=4).fit_transform(frame).astype(np.int64), columns=frame.columns)
    # NOTE(review): the column-wise concat aligns on index, so it presumably relies on
    # X also having a default 0..n-1 index — verify.
    return pd.concat([frame, X], axis=1)

def split_train_test(init_data):
    """Split a frame into its target block and its feature block.

    :param init_data: frame whose first four columns are the targets
    :return: ``(targets, features)`` — columns ``[:4]`` and columns ``[4:]``
    """
    target_part = init_data.iloc[:, :4]
    feature_part = init_data.iloc[:, 4:]
    return target_part, feature_part

def false_target(data: pd.DataFrame, _model=None):
    """Iteratively grow a training pool from confidently and correctly predicted samples.

    The pool starts as an oversampled stratified sample. In each of 40 rounds
    the model is refitted on the de-duplicated pool and evaluated on a fresh
    sample from ``get_data``. Rows of that sample are appended to the pool
    when the first target was predicted correctly and its highest class
    probability exceeds 0.5. The model is saved as ``/best_model.model``
    whenever the round's weighted F1 is above 0.6 and within 0.001 of the
    best F1 seen so far.

    :param data: cleaned frame whose first four columns are the targets
    :param _model: multi-output capable classifier; a new
        ``RandomForestClassifier`` is created when omitted
    :return: ``(_model, best weighted F1 on the first target)``
    """
    # Build the default estimator per call. An instance used as default
    # argument is created once and shared by every call.
    if _model is None:
        _model = RandomForestClassifier()

    init_data = pd.DataFrame()
    _best_score = 0
    for i in range(40):
        if i == 0:
            init_data = oversampling(get_data(data))
        init_data.drop_duplicates(inplace=True)
        _target, _train = split_train_test(init_data)
        _model.fit(_train, _target)

        new_data = get_data(data)
        _test_target, _test_train = split_train_test(new_data)
        _truth = _test_target.iloc[:, 0]

        # Only the first target column (overall satisfaction) is scored.
        _pre = pd.Series(_model.predict(_test_train)[:, 0])
        # scikit-learn metrics expect (y_true, y_pred). The order matters for
        # the weighted F1 because the weights are the true class supports.
        _score = accuracy_score(_truth, _pre)
        _f1 = f1_score(_truth, _pre, average='weighted')
        _best_score = max(_best_score, _f1)
        print(_score, _f1)

        # predict_proba(...)[0] is the probability matrix of the first target.
        _proba_max = pd.DataFrame(_model.predict_proba(_test_train)[0], index=_test_train.index).max(axis=1)
        _proba_max = _proba_max[_proba_max > .5]

        _pre.index = _test_target.index
        _pre = _pre[_pre == _truth]
        # Correctly predicted rows that are also confident predictions.
        _index = _pre.index[_pre.index.isin(_proba_max.index)]
        init_data = pd.concat([init_data, new_data.loc[_index]], axis=0)

        if _f1 > .6 and _best_score - _f1 < .001:
            save_model(_model, '/best_model.model')
    return _model, _best_score

In [None]:
rf, best_score = false_target(dummies_n, RandomForestClassifier())
best_score

In [None]:
best_model: RandomForestClassifier = load_model('/best_model.model')
accuracy_score(best_model.predict(train)[:, 0], target.语音通话整体满意度)

In [None]:
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(target.语音通话整体满意度, best_model.predict(train)[:, 0])
matrix

In [None]:
def transform(_dummies):
    """Encode the terminal brand columns in place with the saved label encoders.

    Brands that the brand encoder has not seen are replaced by '其他' before
    encoding. ``LabelEncoder.transform`` raises ``ValueError`` for labels it
    was not fitted on, so '其他' itself and every brand-type value must be
    among the fitted classes.

    :param _dummies: frame with ``终端品牌`` and ``终端品牌类型`` (modified in place)
    """
    model_1: LabelEncoder = load_model('/LabelEncoder_1.model')
    model_2: LabelEncoder = load_model('/LabelEncoder_2.model')
    _dummies['终端品牌'] = _dummies['终端品牌'].apply(lambda x: x if x in model_1.classes_ else '其他')
    # No second membership test after encoding: the encoded values are integer
    # codes and never match the string classes, so it reset every brand to 0.
    _dummies['终端品牌'] = model_1.transform(_dummies['终端品牌'].astype(str))
    _dummies['终端品牌类型'] = model_2.transform(_dummies['终端品牌类型'].astype(str))

def invert_type(data):
    """Cast the known continuous columns to float64 and the known class columns to int64.

    Only columns present in ``data`` are touched; the column lists come from
    the module-level ``float_feature`` and ``class_feature``. Works in place.
    """
    continuous_cols = [name for name in data.columns if name in float_feature]
    data[continuous_cols] = data[continuous_cols].astype(np.float64)
    categorical_cols = [name for name in data.columns if name in class_feature]
    data[categorical_cols] = data[categorical_cols].astype(np.int64)

def create_feature(data):
    """Replace the place and problem flag columns by two count features, in place.

    ``出现问题地点数`` counts the flagged places and ``出现问题数`` the flagged
    problems. The counted flag columns are removed by ``get_count``.
    """
    place_cols = [name for name in data.columns if name in place_]
    data['出现问题地点数'] = get_count(data, place_cols)
    # Evaluated after the first count so it sees the already reduced column set.
    question_cols = [name for name in data.columns if name in question_]
    data['出现问题数'] = get_count(data, question_cols)

def replace_all(data):
    """Run both free-text corrections and one-hot encodings on ``data``.

    The place descriptions are corrected and encoded first, then the problem
    descriptions. The final encoded frame is also saved as 'one_hot_test.csv'.

    :param data: frame after flag encoding
    :return: the fully encoded frame
    """
    place_rows = data[data['其他，请注明'] == 1]
    place_fixed = replace_place(data, place_rows, '用户描述')
    place_encoded = dummies_process(place_fixed)

    question_rows = place_encoded[place_encoded['其他，请注明.1'] == 1]
    question_fixed = replace_question(place_encoded, question_rows)
    return dummies_process(question_fixed, '用户描述.1', '其他，请注明.1', 'one_hot_test.csv')

def pipeline(data: pd.DataFrame):
    """Apply the complete preprocessing chain to a raw frame.

    Order: encode flag columns, drop contradictory rows, correct and one-hot
    encode the free-text columns, clean samples, bucket terminal brands, label
    encode them, map star levels, cast dtypes and build the count features.
    The helpers called after ``sample_clean`` work in place on the frame.

    :param data: raw frame (its flag columns are overwritten and rows may be dropped)
    :return: the model-ready frame
    """
    flag_values = data[sound_dummies].fillna(-1)
    data[sound_dummies] = flag_values.applymap(encoder)
    drop_treat(data)

    prepared = sample_clean(replace_all(data))
    data_clean(prepared)
    transform(prepared)
    star_map(prepared)
    invert_type(prepared)
    create_feature(prepared)
    return prepared


In [None]:
test = pipeline(test)
test

In [None]:
test['是否投诉'] = test['是否投诉'].apply(lambda x: max(0, x))
test

In [None]:
result = best_model.predict(test[train.columns])
result

In [None]:
to_csv = read_excel(file_name='/result.xlsx')
to_csv = pd.DataFrame(result, index=to_csv.index, columns=to_csv.columns)
to_csv

In [None]:
to_csv.to_excel(RESULT_PATH + '/BMCB2202989 结果文档.xlsx', index=True)

In [None]:
from sklearn.preprocessing import StandardScaler
matrix = pd.DataFrame(matrix, index=[i for i in range(1, 11)], columns=[i for i in range(1, 11)])
sns.heatmap(StandardScaler().fit_transform(matrix))
plt.xlim((1, 10))
plt.ylim((1, 10))
create_figure('result')

In [None]:
_importance_ = pd.Series(rf.feature_importances_, index=train.columns)
_importance_.sort_values(inplace=True, ascending=False)
plt.figure(figsize=(20, 9))
plt.bar(_importance_.index[:10], _importance_.values[:10])
create_figure('importance')

In [None]:
_importance_.sort_values()

In [None]:
save_excel(dummies_n, file_name='clean_data.csv')

In [None]:
from shap import TreeExplainer

explainer = TreeExplainer(best_model)
shap_values = explainer.shap_values(train)
shap_values2 = explainer(train)