In [4]:
from sklearn.svm import SVC
import pandas as pd
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.externals import joblib

# Set the random seed so that model results stay stable between runs
np.random.seed(50)
def train():
    """Train the SVM classifier on ``train_set.csv`` and persist every fitted artifact.

    Steps:
        1. Read at most the first 25318 rows of ``.//train_set.csv``.
        2. Use every column except the first and the last as features and the
           last column as the target.
        3. Label-encode the categorical feature columns and standardize all
           features.
        4. Fit an RBF-kernel ``SVC`` on 80% of the rows and print the accuracy
           on both splits plus the predicted probabilities for the test split.
        5. Dump the encoders, the scaler and the model into ``./model/``.

    Returns:
        None. Results are printed and the artifacts are written to disk.
    """
    import os  # local import: only needed to create the artifact directory

    # joblib.dump writes to an existing directory only, so create it up front
    # instead of failing after the (slow) encoder fitting has already started.
    os.makedirs('./model', exist_ok=True)

    # 1. Load the data. ``nrows`` reads the same leading rows that
    #    ``iterator=True`` + ``get_chunk(25318)`` did, without leaving an
    #    open reader behind.
    df = pd.read_csv('.//train_set.csv', sep=',', nrows=25318, engine='python')

    # 2. Split into features and target.
    #    The first column is skipped (presumably a row ID -- verify against the
    #    CSV) and the last column is the label. ``.copy()`` makes X an
    #    independent frame so the column assignments below neither touch ``df``
    #    nor trigger pandas' chained-assignment warning.
    X = df.iloc[:, 1:-1].copy()
    Y = df.iloc[:, -1]

    # 3. Feature engineering: turn string categories into integer codes.
    #    The position of each column in this list becomes the number in its
    #    encoder file name, so the order must stay in sync with the loader.
    categorical_columns = ['job', 'marital', 'education', 'default', 'housing',
                           'loan', 'contact', 'poutcome', 'month']
    for index, column in enumerate(categorical_columns):
        # A fresh encoder per column keeps each saved object independent.
        encoder = LabelEncoder()
        X[column] = encoder.fit_transform(X[column])
        joblib.dump(encoder, './model/encoder' + str(index) + '.pkl')

    # Standardize every feature column.
    # NOTE(review): the scaler is fitted on all rows before the train/test
    # split, so the held-out score below has seen the test rows' statistics.
    # Left unchanged to keep the persisted scaler identical.
    ss = StandardScaler()
    X = ss.fit_transform(X)

    # Hold out 20% of the rows for evaluation.
    # NOTE(review): no random_state is passed here or to SVC; reproducibility
    # presumably relies on the module-level np.random.seed(50) -- confirm.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

    # Build the model; the hyper-parameters were found by manual tuning
    # according to the original author's comment.
    svm = SVC(kernel='rbf', gamma=0.16, C=1.68, probability=True)
    svm.fit(x_train, y_train)

    # Report model quality.
    print('测试集分类效果:{}'.format(svm.score(x_test, y_test)))
    print('训练集分类效果:{}'.format(svm.score(x_train, y_train)))

    print("测试集每个客户的概率{}".format(svm.predict_proba(x_test)))

    # Persist the fitted scaler and classifier next to the encoders.
    joblib.dump(ss, './model/ss.pkl')
    joblib.dump(svm, './model/svm.pkl')


# Loader that restores the persisted artifacts and scores new data
class Model_Loader(object):
    """Restore the encoders, scaler and classifier from disk and score new rows."""

    # Categorical columns in the exact order their encoders are numbered on
    # disk (encoder0.pkl ... encoder8.pkl). Changing the order would pair
    # columns with the wrong encoder.
    CATEGORICAL_COLUMNS = ('job', 'marital', 'education', 'default', 'housing',
                           'loan', 'contact', 'poutcome', 'month')

    def __init__(self, model_dir='./model'):
        """Load every persisted artifact from ``model_dir``.

        Args:
            model_dir: Directory holding ``encoder<i>.pkl``, ``ss.pkl`` and
                ``svm.pkl``. Defaults to ``./model``, the original location.
        """
        import os  # local import: only used to build the artifact paths

        # NOTE(review): joblib.load unpickles arbitrary objects; only point
        # model_dir at artifacts you produced yourself.
        # One encoder per categorical column, in CATEGORICAL_COLUMNS order.
        self.encoder_list = [
            joblib.load(os.path.join(model_dir, 'encoder' + str(i) + '.pkl'))
            for i in range(len(self.CATEGORICAL_COLUMNS))
        ]
        # Fitted scaler applied to the full feature matrix.
        self.ss = joblib.load(os.path.join(model_dir, 'ss.pkl'))
        # Fitted classifier exposing predict_proba.
        self.algo = joblib.load(os.path.join(model_dir, 'svm.pkl'))

    def fetch_predict_value(self, path='.//test_set.csv', nrows=10000):
        """Score the rows of a CSV file and return them next to their first column.

        Args:
            path: CSV file to score. Defaults to ``.//test_set.csv``.
            nrows: Maximum number of leading rows to read. Defaults to 10000.

        Returns:
            A DataFrame with two columns: the first column of the CSV
            (unchanged) and a column named ``0`` holding column 1 of
            ``predict_proba`` for each row (the "success" probability
            according to the original comment -- this presumes the positive
            class is the second class of the fitted model).
        """
        # ``nrows`` reads the same leading rows as an iterator + get_chunk
        # would, without leaving an open reader behind.
        df = pd.read_csv(path, sep=',', nrows=nrows, engine='python')

        # Everything after the first column is a feature. Copy so that the
        # encoding below does not write into (a view of) ``df``.
        x_test = df.iloc[:, 1:].copy()

        # Replace each categorical column by its integer codes.
        for encoder, column in zip(self.encoder_list, self.CATEGORICAL_COLUMNS):
            x_test[column] = encoder.transform(x_test[column])

        # Standardize with the persisted scaler, then predict.
        x_test = self.ss.transform(x_test)
        positive_proba = np.asarray(self.algo.predict_proba(x_test))[:, 1]

        # Join the first CSV column with the probabilities; sharing df's index
        # guarantees the rows line up in the concat.
        proba_frame = pd.DataFrame(positive_proba, index=df.index)
        x_predict = pd.concat([df.iloc[:, 0], proba_frame], axis=1)
        return x_predict




if __name__ == '__main__':
    # Training is disabled for this run; re-enable the call below when the
    # saved artifacts under ./model/ need to be (re)generated.
    # train()

    # Restore the persisted artifacts and score the prediction set.
    predict = Model_Loader().fetch_predict_value()

    # Write the predictions to disk, then echo them to the console.
    pd.DataFrame(data=predict).to_csv('F:/test2.csv')
    print(predict)




         ID         0
0     25318  0.061109
1     25319  0.019177
2     25320  0.023453
3     25321  0.879477
4     25322  0.016781
5     25323  0.047828
6     25324  0.063172
7     25325  0.043172
8     25326  0.035015
9     25327  0.089415
10    25328  0.487182
11    25329  0.068956
12    25330  0.038161
13    25331  0.065449
14    25332  0.041160
15    25333  0.047666
16    25334  0.053109
17    25335  0.048952
18    25336  0.060418
19    25337  0.093674
20    25338  0.065179
21    25339  0.025625
22    25340  0.045683
23    25341  0.049470
24    25342  0.176068
25    25343  0.052533
26    25344  0.052442
27    25345  0.080568
28    25346  0.048052
29    25347  0.061623
...     ...       ...
9970  35288  0.033058
9971  35289  0.080863
9972  35290  0.025525
9973  35291  0.060089
9974  35292  0.257538
9975  35293  0.091136
9976  35294  0.094182
9977  35295  0.041421
9978  35296  0.087353
9979  35297  0.089964
9980  35298  0.120470
9981  35299  0.050428
9982  35300  0.233197
9983  3530