In [1]:
# Load libraries
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn import datasets
# Import train_test_split function
from sklearn.model_selection import train_test_split
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from sklearn.metrics import accuracy_score
import operator
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load data
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [3]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier()
parameters = {"n_neighbors":[2,3,4,5,6]}
cv=GridSearchCV(neigh, param_grid=parameters, cv=4)
cv_result = cv.fit(X,y)
cv_result.best_params_
neigh = KNeighborsClassifier(n_neighbors=cv_result.best_params_['n_neighbors'])
neigh.fit(X,y)
prey=neigh.predict(X)
resi=y-prey
for i in range(int(len(X)*0.05)):
    max_index, max_number = max(enumerate(abs(resi)), key=operator.itemgetter(1))
    if max_number!=0:
        resi=np.delete(resi,max_index,axis=0)
        X=np.delete(X,max_index,axis=0)
        y=np.delete(y,max_index,axis=0)



In [12]:
#调sklearn库测试
log=LogisticRegression(solver='liblinear',multi_class='auto')
abc = AdaBoostClassifier(n_estimators=50, base_estimator=log,learning_rate=1)
abc.fit(X,y)
y_pred=abc.predict(X)
metrics.accuracy_score(y, y_pred)

0.6689655172413793

In [6]:
#训练弱分类器
def trainfxi(reg, fx, i, x, y, weight):
    error = 0
    tempWeight=[]
    min_index, min_number = min(enumerate(weight), key=operator.itemgetter(1))
    for n in range(len(weight)):
        tempWeight.append(weight[n]/min_number)
    reg.fit(x,y,sample_weight=tempWeight)
    
    error=calcErrorNum(reg, x, y, weight)
    alpha = calcAlpha(error)
    fx[i]['reg'] = reg
    fx[i]['alpha'] = alpha
    if alpha<0.001:
        est = LogisticRegression(solver='liblinear',multi_class='auto')
        parameters = {"C": [1e-4, 5e-4, 1e-3, 1e-2, 1e-1, 1.0, 10, 100]}
        gridclassifier=GridSearchCV(est, param_grid=parameters, cv=4)
        grid_result = gridclassifier.fit(x,y,sample_weight=weight)
        c=grid_result.best_params_['C']
        reg2=LogisticRegression(C=c,solver='liblinear',multi_class='auto')
        reg2.fit(x,y,sample_weight=tempWeight)
        error2=calcErrorNum(reg2, x, y, weight)
        alpha2= calcAlpha(error2)
        max_depth=math.ceil(math.log(len(np.unique(y))))
        if alpha2<0.001:
            depth={"max_depth":range(2,10)}
            tree=DecisionTreeClassifier()
            CVsearch=GridSearchCV(estimator=tree,param_grid=depth,cv=10,n_jobs=-1)
            CVsearch.fit(x,y,sample_weight=tempWeight)
            best_depth=CVsearch.best_params_["max_depth"]
            reg2=DecisionTreeClassifier(random_state=0,max_depth=best_depth)
            reg2.fit(x,y,sample_weight=tempWeight)
            error2=calcErrorNum(reg2, x, y, weight)
            alpha2= calcAlpha(error2)
        fx[i]['reg'] = reg2
        fx[i]['alpha'] = alpha2
    newWeight = calcNewWeight(fx[i]['alpha'], weight,fx[i]['reg'], x, y)
    return newWeight

In [7]:
#计算弱分类器错误率
def calcErrorNum(reg, x, y, weight):
    error=0
    for i in range(len(x)):
        if reg.predict(x[i].reshape(1,x.shape[1]))!=y[i]:
            error=error+weight[i]
    return error

In [8]:
#计算弱分类器权重
def calcAlpha(error):
    if error!=0:
        alpha = 1/2 * math.log((1-error)/error)
    else:
        alpha=1000000000
    return alpha

In [9]:
#计算权重
def calcNewWeight(alpha, weight, reg, x, y):
    newWeight = np.zeros(len(weight))
    sumWeight = 0
    for i in range(len(weight)):
        flag = 1
        if reg.predict(x[i].reshape(1,x.shape[1]))!=y[i]: flag=-1
        weighti = weight[i]*math.exp(-alpha*flag)
        newWeight[i]=weighti
        sumWeight += weighti
    if sumWeight!=0:
        newWeight = newWeight / sumWeight
    newWeight=list(newWeight)
    return newWeight

In [10]:
#训练adaboost模型
def trainAdaBoost(x, y, errorThreshold, maxIterNum):
    fx = {}
    weight = []
    xNum = len(x)
    times=0
    for i in range(xNum):
        w = float(1/xNum)
        weight.append(w)
 
    for i in range(maxIterNum):
        fx[i] = {}
        newWeight = trainfxi(LogisticRegression(C=0.002,solver='liblinear',multi_class='auto'), fx, i, x, y, weight)
        fx[i]['weight']=newWeight
        weight = newWeight
        fxError = calcFxError(fx, x, y)
        fx[i]['error']=fxError
        times+=1
        if fxError<errorThreshold: break
    
    return fx,times

In [11]:
#计算模型错误率
def calcFxError(fx, x, y):
    errorNum = 0
    for i in range(len(x)):
        fi = 0
        prey=np.zeros(len(np.unique(y)))
        for j in range(len(fx)):
            fxiAlpha = fx[j]['alpha']
            preyi=fx[j]['reg'].predict_proba(x[i].reshape(1,x.shape[1]))
            prey=prey+preyi[0]*fxiAlpha
        max_index, max_number = max(enumerate(prey), key=operator.itemgetter(1))
        if(max_index!=y[i]):errorNum += 1
    return errorNum/len(x)

In [12]:
#训练后预测
def pre(fx,x,classnum):
    resulty=np.zeros(x.shape[0])
    for i in range(len(x)):
        fi = 0
        prey=np.zeros(classnum)
        for j in range(len(fx)):
            fxiAlpha = fx[j]['alpha']
            preyi=fx[j]['reg'].predict_proba(x[i].reshape(1,x.shape[1]))
            prey=prey+preyi[0]*fxiAlpha
        max_index, max_number = max(enumerate(prey), key=operator.itemgetter(1))
        resulty[i]=max_index
    return resulty

In [13]:
#二分类训练后预测
def prebinary(fx,x0,x1):
    resulty=np.zeros([x1.shape[0],x0.shape[0]]) 
    for i in range(x1.shape[0]):
        for h in range(x0.shape[0]):   
            prex=[[x0[h],x1[i]]]
            prey=np.zeros(2)
            for j in range(len(fx)):
                fxiAlpha = fx[j]['alpha']
                preyi=fx[j]['reg'].predict_proba(prex)
                prey=prey+preyi[0]*fxiAlpha
            max_index, max_number = max(enumerate(prey), key=operator.itemgetter(1))
            resulty[i][h]=max_index
    return resulty

In [14]:
#画图
def plot_decision_boundary(fx, X, y):
    set0=X[y]
    set1=X[~y]
    
    # set max and min, add a 0.5 margin
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    h = 0.01
 
    prex,prey=np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = prebinary(fx,prex,prey)
    # using contouf to plot decision boundary
    plt.contourf(xx, yy, Z)
    plt.scatter(set0[:,0],set0[:,1],marker="o",label='category 1')
    plt.scatter(set1[:,0],set1[:,1],marker="v",label='category 2')
    plt.legend()

In [15]:
i_train, i_test= train_test_split(range(len(X)), train_size=0.8, test_size=0.2)
trainx=X[i_train]
trainy=y[i_train]
testx=X[i_test]
testy=y[i_test]

In [16]:
#测试
fx,times=trainAdaBoost(trainx,trainy,0.01,50)
fx,times

({0: {'reg': LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
             intercept_scaling=1, max_iter=100, multi_class='auto',
             n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
             tol=0.0001, verbose=0, warm_start=False),
   'alpha': 1.550046144439117,
   'weight': [0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.09999999999999998,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.004504504504504504,
    0.09999999999999998,
    0.004504504504504504,
    0.004504504504504

In [17]:
prey=pre(fx,testx,3)
accuracy_score(testy, prey)

1.0

In [44]:
#二分测试
y=y==0
X=X[:,(0,1)]
fx,times=trainAdaBoost(X,y,0.01,50)
fx,times

({0: {'reg': LogisticRegression(C=0.002, class_weight=None, dual=False, fit_intercept=True,
             intercept_scaling=1, max_iter=100, multi_class='auto',
             n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
             tol=0.0001, verbose=0, warm_start=False),
   'alpha': 0.3209269430861964,
   'weight': [0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.010000000000000042,
    0.01000000

In [45]:
tools=pd.read_csv("cleaned data1.csv")
i_train, i_test= train_test_split(range(len(tools)), train_size=0.8, test_size=0.2)
x=tools[['Parameter1', 'Parameter2', 'Parameter3', 'Parameter4', 'Parameter5',
       'Parameter6', 'Parameter7', 'Parameter8', 'Parameter9', 'Parameter10',
       'Attribute1', 'Attribute2', 'Attribute3', 'Attribute4', 'Attribute5',
       'Attribute6', 'Attribute7', 'Attribute8', 'Attribute9', 'Attribute10']].values
y=tools['Quality_label'].values
trainx=x[i_train]
trainy=y[i_train]
testx=x[i_test]
testy=y[i_test]
reg=DecisionTreeClassifier(random_state=0,max_depth=4)
reg.fit(trainx,trainy)
reg.score(testx,testy)

0.5775

In [46]:
#测试
fx,times=trainAdaBoost(trainx,trainy,0.1,20)
fx,times

({0: {'reg': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
               max_features=None, max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, presort=False, random_state=0,
               splitter='best'),
   'alpha': 0.3569220311655605,
   'weight': [0.0001551831160769635,
    0.00031685678073509936,
    0.0001551831160769635,
    0.0001551831160769635,
    0.0001551831160769635,
    0.0001551831160769635,
    0.0001551831160769635,
    0.0001551831160769635,
    0.0001551831160769635,
    0.0001551831160769635,
    0.0001551831160769635,
    0.0001551831160769635,
    0.00031685678073509936,
    0.0001551831160769635,
    0.0001551831160769635,
    0.0001551831160769635,
    0.0001551831160769635,
    0.00031685678073509936,
    0.0001551831160769635,
    0.0001551831160769635,
    0.00031685678073509936,
    0.00031685

In [47]:
prey=pre(fx,testx,4)
accuracy_score(testy, prey)

0.6283333333333333