In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import klib as kl
import missingno as mns
import os
import warnings
import tqdm
import numba

# Process-level environment switches (Keras backend name, visible CUDA device,
# duplicate-OpenMP-runtime override).
# NOTE(review): these presumably must be set before the libraries that read
# them are imported — confirm nothing above already initialised them.
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# Silences every warning for the rest of the session (including pandas
# SettingWithCopy / deprecation warnings raised by the cells below).
warnings.filterwarnings('ignore')
# Font used for figure text; presumably chosen so Chinese labels render — confirm
# that 'Kaiti' is installed on the machine running the notebook.
plt.rcParams['font.sans-serif'] = ['Kaiti']
plt.rcParams['axes.unicode_minus'] = False
# Relative locations (resolved against the notebook's working directory).
PIC_PATH = "../../models/image/image1/internet"   # saved figures
DATA_PATH = '../../data'                          # input workbooks and result.xlsx
RESULT_PATH = '../../data/summary/'               # text reports / CSV export
MODEL_PATH = '../../models/model1'                # joblib-serialised models
import pathlib2 as pl2
import shutil


def creat_dir():
    """Create the output directories used by this notebook.

    The figure directory ``PIC_PATH`` is deleted first when it already exists,
    so every run starts with an empty figure folder. ``RESULT_PATH`` and
    ``MODEL_PATH`` are created only when missing; existing content is kept.

    ``os.makedirs(..., exist_ok=True)`` is used instead of an
    ``exists()``/``mkdir()`` pair so that missing parent directories are
    created as well and an already-existing directory is never an error.
    """
    if os.path.exists(PIC_PATH):
        # Destructive on purpose: figures from a previous run are discarded.
        shutil.rmtree(PIC_PATH)
    pl2.Path(PIC_PATH).mkdir(parents=True, exist_ok=True)
    for directory in (RESULT_PATH, MODEL_PATH):
        os.makedirs(directory, exist_ok=True)


# Prepare the output directories now (this removes any existing figure directory).
creat_dir()
# Running counter that create_figure increments to number the saved figures.
figure_count = 0


def create_figure(figure_name, dpi=800):
    """Save the current matplotlib figure as a numbered PNG under ``PIC_PATH``.

    :param figure_name: suffix placed after the running figure number
    :param dpi: resolution passed to ``plt.savefig``
    """
    global figure_count
    figure_count = figure_count + 1
    # File name pattern: figure<N>_<figure_name>.png
    target = f'{PIC_PATH}/figure{figure_count}_{figure_name}.png'
    plt.savefig(target, dpi=dpi)


from joblib import dump, load


def save_model(model, model_name: str) -> None:
    """Serialise ``model`` with joblib into ``MODEL_PATH``.

    :param model: any joblib-serialisable object (estimator, encoder, ...)
    :param model_name: file name inside ``MODEL_PATH``; a leading ``/`` or ``\\``
        (as used by the existing callers) is accepted and ignored
    """
    # Join instead of concatenating: with plain ``MODEL_PATH + model_name`` a
    # name without a leading separator would be written next to the model
    # directory (e.g. ``model1foo.model``) rather than inside it.
    dump(model, os.path.join(MODEL_PATH, model_name.lstrip('/\\')))


def load_model(model_name: str):
    """Load a joblib-serialised object from ``MODEL_PATH``.

    :param model_name: file name inside ``MODEL_PATH``; a leading ``/`` or ``\\``
        (as used by the existing callers) is accepted and ignored
    :return: the deserialised object
    """
    # SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
    # code — only load model files produced by this project.
    # Join instead of concatenating so a name without a leading separator still
    # resolves inside MODEL_PATH.
    return load(os.path.join(MODEL_PATH, model_name.lstrip('/\\')))


In [None]:
# Read the two workbooks from DATA_PATH, using the first column of each sheet
# as the row index. `internet` is the training survey and `test` the set to
# predict (going by how the later cells use them).
internet = pd.read_excel(DATA_PATH + '/附件2上网业务用户满意度数据.xlsx', index_col=0)
test = pd.read_excel(DATA_PATH + '/附件4上网业务用户满意度预测数据.xlsx', index_col=0)

internet

In [None]:
# Copy the free-text note columns of the prediction sheet into new columns
# (presumably the names the training sheet uses for the same notes — confirm).
test['场景备注数据'] = test['注明内容']
test['现象备注数据'] = test['注明内容.1']
# NOTE(review): this looks up a column whose label is NaN; it raises KeyError
# unless the sheet really has such a header — confirm against the workbook.
test['APP大类备注'] = test[np.nan]
test['APP小类视频备注'] = test['注明内容.2']
test['APP小类游戏备注'] = test['注明内容.3']
test['APP小类上网备注'] = test['注明内容.4']

In [None]:
# Columns that exist only in the training sheet, skipping the first four of
# them ([4:]) — presumably the four satisfaction targets, TODO confirm.
col = [i for i in internet.columns if i not in test.columns][4:]
# Keep a copy of the removed columns, then drop them from `internet` in place
# (re-running this cell therefore yields a different `col`).
inter_drop = internet[col]
internet.drop(col, axis=1, inplace=True)
col

In [None]:
# Columns that exist only in the prediction sheet; keep a copy and drop them
# from `test` in place so both frames share the same feature columns.
col = [i for i in test.columns if i not in internet.columns]
test_drop = test[col]
test.drop(col, axis=1, inplace=True)
col

In [None]:
# Display the training frame after the column alignment above.
internet

In [None]:
# Select the columns between position 4 and the last 12, excluding any whose
# name contains '备注' (the free-text note columns). The next cell binarises
# exactly these columns; `pipeline` reuses this list for the prediction set.
columns = internet.columns[4:-12]
columns = [column for column in columns if '备注' not in column]
columns

In [None]:
# Binarise the selected columns: -1 becomes 0, every other value becomes 1.
# NOTE(review): "every other value" includes NaN (NaN != -1), so missing
# answers are encoded as 1 — confirm that this is intended.
internet[columns] = internet[columns].applymap(lambda x: 0 if x == -1 else 1)
internet

In [None]:
# Rows whose '其他，请注明' indicator is non-zero; their scene notes are inspected
# here and later passed to get_feature as the rows to correct.
unique = internet[internet['其他，请注明'] != 0]
unique['场景备注数据'].unique()

In [None]:
def replace_place(data: pd.DataFrame, bad_data:pd.DataFrame, col_name: str) -> pd.DataFrame:
    """
    Build a scene-problem counter from the free-text scene notes and the
    scene indicator columns, then drop the columns that were consumed.

    The counter ``col_name`` is derived in three steps:

    1. For the rows in ``bad_data``, every keyword group found in the note
       column '场景备注数据' adds 1 to the counter; two groups instead set the
       existing indicator columns '居民小区' / '农村' to 1; a note that says
       "everywhere" (哪|都|所有|任何) sets the counter to 10.
    2. The corrected rows are written back into ``data`` (aligned on index).
    3. The sum of the indicator columns '居民小区' .. '高铁' is added to the
       counter and the columns '居民小区' .. '场景备注数据' are dropped.

    The previous implementation built each correction from a fresh, zeroed copy
    of ``bad_data`` and applied them one after another with ``data.update``, so
    each update overwrote the counter written by the previous one: a note
    matching several groups never counted more than once, and a match of the
    '居民小区' / '农村' groups reset an earlier count to 0. All corrections are
    now accumulated on a single copy and written back once.

    :param data: full frame; modified in place and returned
    :param bad_data: rows of ``data`` whose notes need correcting (not modified)
    :param col_name: name of the counter column to create
    :return: ``data`` with the counter added and the scene columns removed
    """
    note_col = '场景备注数据'
    # Keyword groups that each add one occurrence to the counter.
    # NOTE: a single note can match several groups (e.g. "车库" matches both
    # the first group and the "车" of the road/vehicle group).
    counting_patterns = (
        '地下|地铁|地库|车库',
        '电梯',
        '路|环|途|车|街',
        '山',
        '医院',
        '超市|市场|集市',
    )
    # Keyword groups that switch on an existing scene indicator instead.
    indicator_patterns = {
        '居民小区': '家|小区|公寓|住|屋',
        '农村': '村|乡',
    }
    everywhere_pattern = '哪|都|所有|任何'

    data[col_name] = 0
    corrected = bad_data.copy()
    corrected[col_name] = 0
    # astype(str) + na=False: missing or non-text notes simply match nothing.
    notes = corrected[note_col].astype(str)

    for pattern in counting_patterns:
        corrected.loc[notes.str.contains(pattern, na=False), col_name] += 1
    for indicator, pattern in indicator_patterns.items():
        corrected.loc[notes.str.contains(pattern, na=False), indicator] = 1
    # "Everywhere" overrides whatever was counted so far.
    corrected.loc[notes.str.contains(everywhere_pattern, na=False), col_name] = 10

    # Single write-back, aligned on the row index.
    data.update(corrected)

    data[col_name] += data.loc[:, '居民小区': '高铁'].astype(int).sum(axis=1)
    data.drop(data.loc[:, '居民小区': '场景备注数据'].columns, axis=1, inplace=True)
    return data

def entire(data: pd.DataFrame=None, col_name: str='全部都卡顿', target_col: str=None):
    """
    Fold an "everything is affected" flag column into a problem counter.

    Rows whose ``col_name`` value is truthy get ``target_col`` set to 10; all
    other rows keep the count already stored in ``target_col``. The flag column
    is then dropped.

    The previous implementation assigned ``10 if flag else flag`` to
    ``target_col``, which overwrote the count computed just before the call
    with 0 for every unflagged row. When ``target_col`` does not exist yet the
    old behaviour is kept (the column is created from the flag values).

    :param data: frame to modify in place; when omitted, the module-level
        ``internet`` frame is looked up at call time (the old default captured
        whatever ``internet`` was when the function was defined)
    :param col_name: name of the flag column (removed from ``data``)
    :param target_col: name of the counter column to update
    :return: ``data``
    """
    if data is None:
        data = internet
    flagged = data[col_name].astype(bool)
    # Start from the existing count when there is one, otherwise from the flag itself.
    base = data[target_col] if target_col in data.columns else data[col_name]
    data[target_col] = base.where(~flagged, 10)
    data.drop(col_name, axis=1, inplace=True)
    return data


def get_feature(data: pd.DataFrame, bad_col: pd.DataFrame):
    """
    Collapse the blocks of indicator columns into one problem counter each.

    The scene block is handled by ``replace_place``. Every other block is
    described by one row of ``groups`` and processed the same way: sum the
    indicator columns ``first .. last`` into ``count_col``, drop the columns
    ``first .. drop_to`` (indicators plus the note column), and, when the block
    has an "everything is affected" flag, hand it to ``entire``.

    All five blocks now drop by ``.columns``; the last block used to pass the
    sliced DataFrame itself to ``drop``, which only worked because iterating a
    DataFrame yields its column labels.

    :param data: frame to transform (modified in place)
    :param bad_col: rows of ``data`` whose scene notes need correcting
    :return: the transformed frame
    """
    # data = df.copy()
    data = replace_place(data, bad_col, '场景问题次数')

    # (counter column, first indicator, last indicator, last column to drop, "all" flag)
    groups = (
        ('现象问题次数', '网络信号差/没有信号', '手机上网速度慢', '现象备注数据', None),
        ('大类问题次数', '看视频卡顿', '手机支付较慢', 'APP大类备注', None),
        ('小类视频问题次数', '爱奇艺', '咪咕视频', 'APP小类视频备注', '全部都卡顿'),
        ('小类游戏问题次数', '和平精英', '阴阳师', 'APP小类游戏备注', '全部游戏都卡顿'),
        ('全部应用问题次数', '微信', '拼多多', 'APP小类上网备注', '全部网页或APP都慢'),
    )
    for count_col, first, last, drop_to, all_flag in groups:
        data[count_col] = data.loc[:, first: last].astype(int).sum(axis=1)
        data.drop(data.loc[:, first: drop_to].columns, axis=1, inplace=True)
        if all_flag is not None:
            data = entire(data, all_flag, target_col=count_col)
    return data

# Remember the raw feature columns (everything after the first four columns)
# BEFORE feature engineering; a later cell uses `col` to select the same
# columns from the prediction set.
col = internet.columns[4:]
# NOTE: get_feature modifies `internet` in place, so this cell cannot be
# re-run without reloading the data.
internet = get_feature(internet, unique)
internet

In [None]:
# Plot the missing values of the engineered frame with klib and save the
# figure through create_figure.
kl.missingval_plot(internet)
create_figure('missing_plot')

In [None]:
def boolean_replace(x):
    """Map the answers '是' / '否' to 1 / 0 and return any other value unchanged."""
    if x == '是':
        return 1
    if x == '否':
        return 0
    return x

def missing_clean(data: pd.DataFrame):
    """
    Fill and drop missing values, then encode '是' / '否' answers.

    The three count columns get 0 for missing values, every row that still
    contains a missing value is removed from ``data`` in place, and the
    remaining cells are passed through ``boolean_replace``.

    :param data: frame to clean (rows are dropped in place)
    :return: a new frame with the encoded values
    """
    count_columns = ['上网质差次数', '脱网次数', '微信质差次数']
    data[count_columns] = data[count_columns].fillna(0)
    data.dropna(axis=0, inplace=True)
    return data.applymap(boolean_replace)

# Clean missing values; rows containing any remaining NaN are dropped.
internet = missing_clean(internet)
internet

In [None]:
# Brand / brand-type values present in the training data at this point.
# data_clean reads these two globals and maps every value outside them to '其他'.
shell = internet['终端品牌'].unique()
shell_type = internet['终端品牌类型'].unique()

def data_clean(data: pd.DataFrame):
    """
    Normalise the '终端品牌' and '终端品牌类型' columns of ``data`` in place.

    Relies on the module-level ``shell`` / ``shell_type`` arrays (values seen
    in the training data) and on ``check``.

    :param data: frame to clean; both columns are overwritten in place
    :return: the unique values of the cleaned '终端品牌类型' column
        (NOT the frame — callers use the in-place side effect)
    """
    data['终端品牌'] = data['终端品牌'].astype(str)
    # Brands whose name contains a carrier name (移动|联通|电信) become '其他'.
    replace = data.query('终端品牌.str.contains("移动|联通|电信")')
    replace['终端品牌'] = '其他'
    # update() writes every column of the matched rows back, aligned on index.
    data.update(replace)
    data['终端品牌'] = data['终端品牌'].replace([0, '0'], '其他')
    # Anything not seen in the training data is folded into '其他'.
    data['终端品牌'] = data['终端品牌'].apply(lambda x: '其他' if x not in shell else x)
    data['终端品牌类型'] = data['终端品牌类型'].apply(lambda x: '其他' if x not in shell_type else x)
    # Keep at most the first two space-separated tokens (non-strings become
    # '其他'), then cut at the first '-' and at the first '_'.
    _ = data['终端品牌类型'].apply(lambda x: ' '.join(x.split(' ')[:2]) if isinstance(x, str) else '其他').apply(
        lambda x: x.split('-')[0]
    ).apply(
        lambda x: x.split('_')[0]
    )
    # Reduce each remaining model string to a coarse category.
    data['终端品牌类型'] = _.apply(check)

    return data['终端品牌类型'].unique()

def check(string: str):
    """
    Reduce a handset model string to a coarse category label.

    Rules are tried in order and the first match wins:

    * contains a space           -> the first space-separated token
    * starts with a digit        -> 'Num'
    * starts with A + 4 digits   -> 'A_Num' (case-insensitive)
    * contains no digit at all   -> 'word'
    * starts with HM             -> 'HM' (case-insensitive)
    * starts with RMX + digits   -> 'type_rmx'
    * M ... ending in A-Z        -> 'type_m'
    * P ... ending in 0          -> 'type_p'
    * V + digits + letter        -> 'type_v'
    * letter ... digits          -> 'num_object'
    * anything else              -> '其他'

    The 'word' rule used ``re.match(r'\\D+')``, which succeeds for every
    string that merely *starts* with a non-digit. Because digit-leading
    strings are handled earlier, that rule caught everything and all rules
    after it were unreachable. ``re.fullmatch`` restricts it to strings
    without any digit.

    :param string: model string (already trimmed by the caller)
    :return: category label
    """
    import re
    if ' ' in string:
        return string.split(' ')[0]
    if re.match(r'^\d', string):
        return 'Num'
    re_str = r'A\d{4}'
    if re.match(re_str, string, re.I):
        return 'A_Num'
    # fullmatch: only strings made entirely of non-digit characters are 'word'.
    if re.fullmatch(r'\D+', string):
        return 'word'
    if re.match(r'^HM', string, re.I):
        return 'HM'
    if re.match(r'^RMX\d+', string):
        return 'type_rmx'
    if re.match(r'^M.+[A-Z]$', string):
        return 'type_m'
    if re.match(r'^P.+0$', string):
        return 'type_p'
    if re.match(r'V\d+[A-Za-z]', string):
        return 'type_v'
    if re.match(r'^[A-Za-z].+\d+', string):
        return 'num_object'
    return '其他'

# Cleans `internet` in place; the cell output is the array of resulting
# '终端品牌类型' categories returned by data_clean.
data_clean(internet)

In [None]:
from sklearn.preprocessing import LabelEncoder

def label_encoder(data: pd.DataFrame):
    """
    Integer-encode the two handset columns, persist the fitted encoders and
    finish with ``star_map``.

    Each column is cast to ``str``, encoded in place with its own
    ``LabelEncoder``, and the encoder is stored through ``save_model`` so the
    prediction set can be transformed with the same mapping later.

    :param data: frame whose '终端品牌' and '终端品牌类型' columns are overwritten
    :return: the result of ``star_map(data)``
    """
    encoder_files = (
        ('终端品牌', '/LabelEncoder_3.model'),
        ('终端品牌类型', '/LabelEncoder_4.model'),
    )
    for column, file_name in encoder_files:
        encoder = LabelEncoder().fit(data[column].astype(str))
        data[column] = encoder.transform(data[column].astype(str))
        save_model(encoder, file_name)

    return star_map(data)

def star_map(data: pd.DataFrame):
    """
    Encode the customer star level as an ordinal number and one-hot encode '性别'.

    '未评级' maps to -1 and the remaining levels to 0..7 in ascending order;
    a level that is not listed yields a missing value.

    :param data: frame whose '客户星级标识' column is overwritten in place
    :return: a new frame in which '性别' is replaced by its dummy columns
    """
    ranked_levels = ('准星', '一星', '二星', '三星', '银卡', '金卡', '白金卡', '钻石卡')
    level_codes = {'未评级': -1}
    for code, level in enumerate(ranked_levels):
        level_codes[level] = code
    data['客户星级标识'] = data['客户星级标识'].apply(lambda level: level_codes.get(level))
    return pd.get_dummies(data, columns=['性别'])

# Encode the categorical columns; this also writes the two fitted
# LabelEncoders to MODEL_PATH for later use on the prediction set.
internet = label_encoder(internet)
internet

In [None]:
# The first four columns are used as the prediction targets (cast to int64);
# every remaining column is a feature.
train = internet.iloc[:, 4:]
target = internet.iloc[:, :4].astype(np.int64)

train

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import  classification_report, f1_score, precision_score, recall_score

def split_data(train_data, predict):
    """
    Split features and targets into train and test parts.

    30 % of the rows go to the test part; the split is seeded with 42.

    :return: ``[X_train, X_test, y_train, y_test]`` as produced by ``train_test_split``
    """
    split_options = {'test_size': .3, 'random_state': 42}
    return train_test_split(train_data, predict, **split_options)

def train_model(train_data, predict):
    """
    Fit several baseline classifiers on every target column and collect scores.

    For each model and each target column the model is fitted on the training
    split and evaluated on the test split. If fitting on the raw labels raises
    ``ValueError``, the labels are label-encoded, the model is refitted and its
    predictions are decoded back to the original labels. The per-column
    classification reports of a model are written to
    ``RESULT_PATH + <ModelName>_internet.txt``.

    Accuracy is computed from the (decoded) predictions. The former
    ``model.score(X_test, y_test[col])`` compared *encoded* predictions with
    the raw labels whenever the fallback path was taken, which reported a
    wrong accuracy for those models.

    :param train_data: feature frame
    :param predict: frame with one column per target
    :return: DataFrame with one column per model holding, in row order, the
        mean accuracy, micro F1, weighted precision and weighted recall over
        all target columns
    """
    X_train, X_test, y_train, y_test = split_data(train_data, predict)
    models = [DecisionTreeClassifier(), KNeighborsClassifier(), RandomForestClassifier(),
              GradientBoostingClassifier(), XGBClassifier(), LGBMClassifier()]
    scores = pd.DataFrame()
    y_test = y_test.astype(np.int64)

    for model in models:
        model_name = model.__class__.__name__
        print(model_name)
        reports = ''
        acc = []
        f1 = []
        precision = []
        recall = []
        for col in y_test.columns:
            print(col)
            try:
                model.fit(X_train, y_train[col])
                pre = model.predict(X_test)
            except ValueError:
                # Some estimators reject labels that are not 0..n-1: encode,
                # refit, and decode the predictions back to the raw labels.
                ll = LabelEncoder().fit(y_train[col])
                model.fit(X_train, ll.transform(y_train[col]))
                pre = ll.inverse_transform(model.predict(X_test))
            # Accuracy on the same label scale as y_test (see docstring).
            acc.append(float(np.mean(np.asarray(pre) == y_test[col].to_numpy())))
            f1.append(f1_score(y_test[col], pre, average='micro'))
            precision.append(precision_score(y_test[col], pre, average='weighted'))
            recall.append(recall_score(y_test[col], pre, average='weighted'))
            report = classification_report(y_test[col], pre)
            reports += report
        with open(RESULT_PATH + model_name + '_internet.txt', 'w') as f:
            f.write(reports)

        scores[f'{model_name}'] = [np.mean(np.array(acc)), np.mean(f1), np.mean(precision), np.mean(recall)]

    return scores

# Baseline comparison; the cell output is the score table returned by train_model.
train_model(train, target)

In [None]:
# Save one pie chart per target column showing the share of each score value.
for i in target.columns:
    plt.figure(figsize=(8, 8))
    counts = target[i].value_counts()
    plt.pie(counts, labels=counts.index, autopct="%1.1f%%")
    create_figure(f'{i}_describe')

In [None]:
# Class distribution of the first column (the first target).
internet.iloc[:, 0].value_counts()

In [None]:
# Cast the four target columns to int64 and inspect the resulting dtypes.
internet.iloc[:, :4] = internet.iloc[:, :4].astype(np.int64)
internet.dtypes

In [None]:
# Three columns are kept as float64; every other column (targets included)
# is cast to int64. Both lists are reused by `pipeline` for the prediction set.
regress_feature = ['套外流量（MB）', '套外流量费（元）', '当月MOU']
classic_feature = [col for col in internet.columns if col not in regress_feature]
internet[regress_feature] = internet[regress_feature].astype(np.float64)
internet[classic_feature] = internet[classic_feature].astype(np.int64)

internet

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SVMSMOTE,SMOTEN
from sklearn.impute import KNNImputer

def get_data(data: pd.DataFrame, random_state=None):
    """
    Draw a per-level sample of ``data`` based on '手机上网整体满意度'.

    For each satisfaction level 1..10 the quota depends on how many rows have
    that level: 60 rows for 60-119 available, 100 rows for 120-179, otherwise
    180. The quota is capped at the number of rows available, so small
    levels are returned completely and empty levels are skipped.

    The previous implementation only used the per-level subset to pick the
    quota and then sampled from the *whole* frame, so the result was not
    stratified at all and could contain the same row several times.

    :param data: frame containing the column '手机上网整体满意度'
    :param random_state: optional seed forwarded to ``DataFrame.sample``
    :return: the sampled rows, keeping their original index
    """
    level_col = '手机上网整体满意度'
    parts = []
    for level in range(1, 11):
        level_rows = data[data[level_col] == level]
        available = len(level_rows)
        if available == 0:
            continue
        if 60 <= available < 120:
            quota = 60
        elif 120 <= available < 180:
            quota = 100
        else:
            quota = 180
        # Sampling without replacement cannot return more rows than exist.
        parts.append(level_rows.sample(min(quota, available), random_state=random_state))
    if not parts:
        # No row carries a level between 1 and 10: return an empty frame with the same columns.
        return data.iloc[0:0]
    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(parts, axis=0)

def oversampling(data: pd.DataFrame):
    """
    Oversample ``data`` with SVMSMOTE and rebuild the four leading target columns.

    The features (columns from position 4 on) are resampled against the
    '手机上网整体满意度' column. The first four columns of the original rows are
    kept, the labels of the additional rows are appended to them, and the
    target values still missing for those rows are filled with a KNN imputer
    before everything is cast to int64.

    NOTE(review): the code appends the extra labels under ``data.columns[0]``,
    so it assumes '手机上网整体满意度' is the first column — confirm.
    NOTE(review): SVMSMOTE is created without ``random_state``, so results
    differ between runs.

    :param data: frame with four target columns followed by the features
    :return: frame with the imputed targets next to the resampled features
    """
    value_counts = data.iloc[:, 0].value_counts()
    # NOTE(review): `_index` (and therefore `value_counts`) is never used below.
    _index = value_counts[value_counts<value_counts.max()].index.tolist()
    sm = SVMSMOTE()
    # print(data)
    X, y = sm.fit_resample(data.iloc[:, 4:], data[['手机上网整体满意度']].values.reshape((-1, 1)))
    frame = data.iloc[:, :4]
    # Labels beyond the original row count — presumably the synthetic rows,
    # which the resampler appends after the original ones (TODO confirm).
    y = pd.DataFrame(y[frame.shape[0]:], columns=[data.columns[0]])
    # The appended rows only carry the first target; the other three are NaN here.
    frame = pd.concat([frame, y], axis=0)
    # Fill the missing targets from the 4 nearest rows and rebuild the frame
    # (this also replaces the index with a fresh 0..n-1 range).
    frame = pd.DataFrame(KNNImputer(n_neighbors=4).fit_transform(frame).astype(np.int64), columns=frame.columns)
    # Column-wise concat aligns on index; assumes X also has a 0..n-1 index — TODO confirm.
    return pd.concat([frame, X], axis=1)

def split_train_test(init_data):
    """
    Separate the four leading target columns from the feature columns.

    :param init_data: frame with the targets in the first four columns
    :return: ``(targets, features)`` — note the order
    """
    n_targets = 4
    return init_data.iloc[:, :n_targets], init_data.iloc[:, n_targets:]

def false_target(data: pd.DataFrame, _model=None):
    """
    Iteratively grow a training set from confidently and correctly predicted samples.

    The loop runs 40 times. The training set starts as an oversampled draw from
    ``data``. In every round the model is refitted on the de-duplicated
    training set and evaluated on a fresh draw; rows of that draw whose first
    target was predicted correctly with a maximum class probability above 0.5
    are appended to the training set. Whenever the weighted F1 of a round is
    above 0.5 and within 0.01 of the best F1 seen so far, the model is saved as
    '/best_model_internet.model'.

    Changes compared with the previous version:

    * ``accuracy_score`` / ``f1_score`` are called as ``(y_true, y_pred)``.
      The arguments were swapped, which changes the weighted F1 because the
      per-class weights are taken from the first argument.
    * The default model is created per call instead of once at definition
      time, so separate calls no longer share (and refit) one estimator.

    :param data: prepared frame (four targets followed by the features)
    :param _model: multi-output classifier; a new ``RandomForestClassifier``
        is used when omitted
    :return: ``(_model, _best_score)`` — the model as fitted in the LAST round
        (not necessarily the saved one) and the best weighted F1
    """
    if _model is None:
        _model = RandomForestClassifier()
    init_data = pd.DataFrame()
    _best_score = 0
    for i in range(40):
        if i == 0:
            init_data = oversampling(get_data(data))
        init_data.drop_duplicates(inplace=True)
        _target, _train = split_train_test(init_data)
        _model.fit(_train, _target)

        new_data =get_data(data)
        _test_target, _test_train = split_train_test(new_data)
        _y_true = _test_target.iloc[:, 0]

        # Only the first target column is evaluated.
        _pre = pd.Series(_model.predict(_test_train)[:, 0])
        _score = accuracy_score(_y_true, _pre)
        _f1 = f1_score(_y_true, _pre, average='weighted')
        _best_score = max(_best_score, _f1)
        print(_score, _f1)

        # predict_proba(...)[0] is taken as the probabilities of the first target
        # (the model is fitted on all four target columns); keep rows whose
        # most likely class exceeds 0.5.
        _proba_max = pd.DataFrame(_model.predict_proba(_test_train)[0], index=_test_train.index).max(axis=1)
        _proba_max = _proba_max[_proba_max > .5]


        # Re-index the predictions so they can be compared row by row.
        _pre.index = _test_target.index
        _pre = _pre[_pre==_y_true]
        _index = [i for i in _pre.index if i in _proba_max.index]
        init_data = pd.concat([init_data, new_data.loc[_index]], axis=0)

        if _f1 > .5 and _best_score - _f1 < .01:
            save_model(_model, '/best_model_internet.model')
    return _model, _best_score

In [None]:
# Run the iterative training; `rf` is the model from the last round and
# `best_score` the best weighted F1 observed during the loop.
rf, best_score = false_target(internet, RandomForestClassifier())
best_score

In [None]:
# Weighted F1 of the last-round model on the full feature frame (first target
# only). NOTE(review): `train` / `target` were sliced before the dtype cells
# above and include rows the model was fitted on, so this is not a hold-out score.
f1_score(target.手机上网整体满意度, rf.predict(train)[:, 0], average='weighted')

In [None]:
# Load the model saved by false_target. NOTE(review): that file is only
# written when some round reaches a weighted F1 above 0.5; otherwise this
# fails or loads a file left over from an earlier run.
bm = load_model('/best_model_internet.model')
accuracy_score(target.手机上网整体满意度, bm.predict(train)[:, 0])

In [None]:
# Read the '上网' sheet of the result workbook; its index and columns are
# reused below as the layout for the predictions.
result = pd.read_excel(DATA_PATH + '/result.xlsx', sheet_name='上网', index_col=0)
result

In [None]:
def label_transform(data: pd.DataFrame):
    """
    Encode the two handset columns with the encoders saved during training,
    then finish with ``star_map``.

    :param data: frame whose '终端品牌' and '终端品牌类型' columns are overwritten
    :return: the result of ``star_map(data)``
    """
    brand_encoder = load_model('/LabelEncoder_3.model')
    type_encoder = load_model('/LabelEncoder_4.model')
    for column, encoder in (('终端品牌', brand_encoder), ('终端品牌类型', type_encoder)):
        data[column] = encoder.transform(data[column].astype(str))
    return star_map(data)

def pipeline(data: pd.DataFrame):
    """
    Apply the training-time preprocessing steps to a new frame.

    Steps, in order: binarise the indicator columns listed in the module-level
    ``columns``, build the problem counters (``get_feature``), clean missing
    values, normalise and encode the handset columns, and cast the dtypes
    using the module-level ``regress_feature`` / ``classic_feature`` lists.

    :param data: raw frame with the same columns as the training features
    :return: the preprocessed frame
    """
    data[columns] = data[columns].applymap(lambda value: 0 if value == -1 else 1)
    bad_col = data[data['其他，请注明'] != 0]
    # print(bad_col)
    data = get_feature(data, bad_col)
    data = missing_clean(data)
    # data_clean works in place; its return value (the unique categories) is not needed here.
    data_clean(data)
    data = label_transform(data)
    data[regress_feature] = data[regress_feature].astype(np.float64)
    # Only the integer features that actually exist in this frame (the targets do not).
    present_int_features = [i for i in classic_feature if i in data.columns]
    data[present_int_features] = data[present_int_features].astype(np.int64)
    return data

In [None]:
# When the cells are run in order, `col` holds the raw feature columns captured
# just before get_feature was applied to the training data.
test = test[col]
# Work on a copy so the raw prediction frame stays available.
test_ = test.copy()

# NOTE(review): pipeline drops rows with missing values (missing_clean), so
# `test_` can end up with fewer rows than `test` — confirm this cannot happen
# for the prediction set, because the row count must match `result` below.
test_ = pipeline(test_)
test_

In [None]:
# Put the prediction features in the same column order as the training features.
# Raises KeyError if a training column (e.g. a '性别' dummy) is absent from `test_`.
test_ = test_[internet.columns[4:]]
test_

In [None]:
# Predict all targets and lay them out like the result sheet; this requires
# `res` to have exactly as many rows/columns as `result`.
res = bm.predict(test_)
result = pd.DataFrame(res, index=result.index, columns=result.columns)
result

In [None]:
# NOTE(review): to_excel with a file path rewrites the whole workbook, so any
# other sheet in result.xlsx is lost — confirm the file holds only '上网',
# or write through pd.ExcelWriter in append mode instead.
result.to_excel(DATA_PATH + '/result.xlsx', index=True, sheet_name='上网')

In [None]:
# Export the fully preprocessed training frame (RESULT_PATH already ends with
# '/', so the path contains a harmless double slash).
internet.to_csv(RESULT_PATH + '/internet.csv', index=True)