In [21]:
# Load libraries
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn import datasets
# Import train_test_split function
from sklearn.model_selection import train_test_split
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from sklearn.metrics import accuracy_score
import operator
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [22]:
# Load data
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [23]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier()
parameters = {"n_neighbors":[2,3,4,5,6]}
cv=GridSearchCV(neigh, param_grid=parameters, cv=4)
cv_result = cv.fit(X,y)
cv_result.best_params_
neigh = KNeighborsClassifier(n_neighbors=cv_result.best_params_['n_neighbors'])
neigh.fit(X,y)
prey=neigh.predict(X)
resi=y-prey
for i in range(int(len(X)*0.05)):
    max_index, max_number = max(enumerate(abs(resi)), key=operator.itemgetter(1))
    if max_number!=0:
        resi=np.delete(resi,max_index,axis=0)
        X=np.delete(X,max_index,axis=0)
        y=np.delete(y,max_index,axis=0)



In [24]:
#调sklearn库测试
log=LogisticRegression(solver='liblinear',multi_class='auto')
abc = AdaBoostClassifier(n_estimators=50,base_estimator=log,algorithm="SAMME",learning_rate=1)
abc.fit(X,y)
y_pred=abc.predict(X)
metrics.accuracy_score(y, y_pred)

0.6758620689655173

In [25]:
def cv_optimize(X, y, weight, n_folds=4):
    est = LogisticRegression(solver='liblinear',multi_class='auto')
    parameters = {"C": [1e-4, 5e-4, 1e-3, 1e-2, 1e-1, 1.0, 10, 100]}
    gridclassifier=GridSearchCV(est, param_grid=parameters, cv=4)
    grid_result = gridclassifier.fit(X,y,sample_weight=weight)
    return grid_result

In [26]:
#训练弱分类器
def trainfxi(reg, fx, i, x, y, weight):
    error = 0
    tempWeight=[]
    min_index, min_number = min(enumerate(weight), key=operator.itemgetter(1))
    for n in range(len(weight)):
        tempWeight.append(weight[n]*len(weight))
    reg.fit(x,y,sample_weight=tempWeight)
    
    error=calcErrorNum(reg, x, y, weight)
    alpha = calcAlpha(error)
    fx[i]['reg'] = reg
    fx[i]['alpha'] = alpha
    if alpha<0.001:
        cv=cv_optimize(x,y,tempWeight)
        c=cv.best_params_['C']
        reg2=LogisticRegression(C=c,solver='liblinear',multi_class='auto')
        reg2.fit(x,y,sample_weight=tempWeight)
        error2=calcErrorNum(reg2, x, y, weight)
        alpha2= calcAlpha(error2)
        fx[i]['reg'] = reg2
        fx[i]['alpha'] = alpha2
        print(c)
        fx[i]['cv']=c
    newWeight = calcNewWeight(fx[i]['alpha'], weight,fx[i]['reg'], x, y)
    return newWeight

In [27]:
#计算弱分类器错误率
def calcErrorNum(reg, x, y, weight):
    error=0
    for i in range(len(x)):
        if reg.predict(x[i].reshape(1,x.shape[1]))!=y[i]:
            error=error+weight[i]
    return error

In [28]:
#计算弱分类器权重
def calcAlpha(error):
    if error!=0:
        alpha = 1/2 * math.log((1-error)/error)
    else:
        alpha=1000000000
    return alpha

In [29]:
#计算权重
def calcNewWeight(alpha, weight, reg, x, y):
    newWeight = np.zeros(len(weight))
    sumWeight = 0
    for i in range(len(weight)):
        flag = 1
        if reg.predict(x[i].reshape(1,x.shape[1]))!=y[i]: 
            flag=-1
        weighti = weight[i]*math.exp(-alpha*flag)
        newWeight[i]=weighti
        sumWeight += weighti
    if sumWeight!=0:
        newWeight = newWeight / sumWeight
    newWeight=list(newWeight)
    return newWeight

In [30]:
#训练adaboost模型
def trainAdaBoost(x, y, errorThreshold, maxIterNum):
    fx = {}
    weight = []
    xNum = len(x)
    times=0
    for i in range(xNum):
        w = float(1/xNum)
        weight.append(w)
 
    for i in range(maxIterNum):
        fx[i] = {}
        newWeight = trainfxi(LogisticRegression(C=0.002,solver='liblinear',multi_class='auto'), fx, i, x, y, weight)
        fx[i]['weight']=newWeight
        weight = newWeight
        fxError = calcFxError(fx, x, y)
        fx[i]['error']=fxError
        times+=1
        if fxError<errorThreshold: break
    
    return fx,times

In [31]:
#计算模型错误率
def calcFxError(fx, x, y):
    errorNum = 0
    for i in range(len(x)):
        fi = 0
        prey=np.zeros(len(np.unique(y)))
        for j in range(len(fx)):
            fxiAlpha = fx[j]['alpha']
            preyi=fx[j]['reg'].predict_proba(x[i].reshape(1,x.shape[1]))
            prey=prey+preyi[0]*fxiAlpha
        max_index, max_number = max(enumerate(prey), key=operator.itemgetter(1))
        if(max_index!=y[i]):errorNum += 1
    return errorNum/len(x)

In [32]:
#训练后预测
def pre(fx,x):
    resulty=np.zeros(x.shape[0])
    for i in range(len(x)):
        fi = 0
        prey=np.zeros(len(fx[0]['reg'].coef_))
        for j in range(len(fx)):
            fxiAlpha = fx[j]['alpha']
            preyi=fx[j]['reg'].predict_proba(x[i].reshape(1,x.shape[1]))
            prey=prey+preyi[0]*fxiAlpha
        max_index, max_number = max(enumerate(prey), key=operator.itemgetter(1))
        resulty[i]=max_index
    return resulty

In [33]:
#测试
fx,times=trainAdaBoost(X,y,0.01,50)
fx,times



10




100


({0: {'reg': LogisticRegression(C=0.002, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                      warm_start=False),
   'alpha': 0.3674099384802557,
   'weight': [0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.0051020408163265415,
    0.

In [34]:
tools=pd.read_csv("cleaned data.csv")
tools.loc[tools['Quality_label']==1]=3
tools.loc[tools['Quality_label']==1/3]=1
tools.loc[tools['Quality_label']==2/3]=2
i_train, i_test= train_test_split(range(len(tools)), train_size=0.8, test_size=0.2)
x=tools[['Parameter1', 'Parameter2', 'Parameter3', 'Parameter4', 'Parameter5',
       'Parameter6', 'Parameter7', 'Parameter8', 'Parameter9', 'Parameter10',
       'Attribute1', 'Attribute2', 'Attribute3', 'Attribute4', 'Attribute5',
       'Attribute6', 'Attribute7', 'Attribute8', 'Attribute9', 'Attribute10']].values
y=tools['Quality_label'].values
trainx=x[i_train]
trainy=y[i_train]
testx=x[i_test]
testy=y[i_test]
reg=LogisticRegression(C=0.002,solver='liblinear',multi_class='auto')
reg.fit(trainx,trainy)
reg.score(trainx,trainy)

0.696875

In [40]:
trainx

array([[3., 3., 3., ..., 3., 3., 3.],
       [2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [3., 3., 3., ..., 3., 3., 3.],
       [3., 3., 3., ..., 3., 3., 3.]])

In [15]:
#测试
fx,times=trainAdaBoost(trainx,trainy,0.015,10)
fx,times

({0: {'reg': LogisticRegression(C=0.002, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                      warm_start=False),
   'alpha': 2.534452101110116,
   'weight': [0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0001048218029350082,
    0.0

In [16]:
prey=pre(fx,testx)
accuracy_score(testy, prey)

0.9941666666666666