# 集成学习及随机森林

## 数据预处理

In [88]:
import os
filePath = '/Users/yangwei/Desktop/二分类图/'
items = os.listdir(filePath)

In [89]:
len(items)

350

In [90]:
#为了避免有不是jpg格式的文件，有临时文件时可能会有问题，多做一步
namelist=[]
for name in items:
    if name.endswith(".jpg"):
        namelist.append(name)

In [91]:
namelist[0] #226表示图片编号，后面的male表示性别

'226_male.jpg'

In [92]:
#需要将性别（输出）拆解出来
namelist[0][:-4].split("_")

['226', 'male']

**读取图片，将图片作为特征X，将人的性别作为输出Y**

In [93]:
import numpy as np
from PIL import Image

X=np.empty((0,4096))
n_pixels=64
for i in namelist:
    img=np.array(Image.open(filePath+i).convert('L'), 'f')
    img_new=img.reshape(1,4096)
    X=np.vstack((X,img_new))   

In [94]:
X.shape

(350, 4096)

In [95]:
X[:5]

array([[152., 158., 169., ...,   0.,   4.,  21.],
       [222., 226., 230., ...,  98., 102., 104.],
       [ 53.,  84., 121., ...,  54., 118.,  97.],
       [103.,  90.,  52., ..., 149., 143., 131.],
       [150., 161., 174., ...,  95.,  87.,  82.]])

In [96]:
y=[]
for i in namelist:
    sex=i[:-4].split("_")[1]
    y.append(sex)

In [97]:
y=np.array(y)
y[:5]

array(['male', 'male', 'male', 'male', 'male'], dtype='<U6')

**划分测试集和训练集**

In [108]:
from sklearn.model_selection import train_test_split
train_X, test_X,train_y,test_y = train_test_split(X, y, test_size=0.3,random_state=1)

In [109]:
train_X.shape

(245, 4096)

In [110]:
test_X.shape

(105, 4096)

## 单个模型的效果

In [117]:
from sklearn import tree
from sklearn import linear_model 
from sklearn import neighbors

model1=tree.DecisionTreeClassifier(random_state=123)    #决策树分类模型
model2=neighbors.KNeighborsClassifier() #KNN分类模型
model3=linear_model.LogisticRegression(max_iter=1000,random_state=123)#逻辑回归模型

In [118]:
#模型训练
model1.fit(train_X,train_y)
model2.fit(train_X,train_y)
model3.fit(train_X,train_y)

LogisticRegression(max_iter=1000, random_state=123)

In [119]:
#预测
pred1=model1.predict(test_X)
pred2=model2.predict(test_X)
pred3=model3.predict(test_X)

In [120]:
model1.score(test_X,test_y)

0.8857142857142857

In [121]:
model2.score(test_X,test_y)

0.9714285714285714

In [122]:
model3.score(test_X,test_y)

0.9809523809523809

## 简单集成技术

**（1）最大投票法**

**（2）平均法**

**（3）加权平均法**

### （1）最大投票法

**最大投票法参考代码**

In [123]:
#最终结果
import numpy as np
import pandas as pd
final_pred = np.array([])
for i in range(0,len(test_X)):
    temp=pd.Series([pred1[i], pred2[i], pred3[i]])
    final_pred =np.append(final_pred,temp.mode()[0])

In [124]:
#最终结果
final_pred

array(['male', 'male', 'female', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'female', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'female', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'female', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'female', 'male', 'male',
       'male'], dtype='<U32')

In [125]:
import pandas as pd
out=pd.DataFrame({"真实类别":test_y,"最终结果":final_pred,"pred1":pred1,"pred2":pred2,"pred3":pred3})
out

Unnamed: 0,真实类别,最终结果,pred1,pred2,pred3
0,male,male,male,male,male
1,male,male,female,male,male
2,female,female,female,female,female
3,male,male,male,male,male
4,male,male,male,male,male
...,...,...,...,...,...
100,male,male,male,male,male
101,female,female,female,female,female
102,male,male,male,male,male
103,male,male,male,male,male


In [126]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y,final_pred) 

0.9809523809523809

**也可以在sklearn中使用“VotingClassifier”模块**

In [127]:
from sklearn.ensemble import VotingClassifier
model_final=VotingClassifier(estimators=[('决策树模型', model1), ('KNN', model2),('逻辑回归', model3)], voting='hard')

In [128]:
model_final.fit(train_X,train_y)
final_pred=model_final.predict(test_X)

In [129]:
final_pred

array(['male', 'male', 'female', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'female', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'female', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'female', 'male', 'male',
       'male'], dtype='<U6')

In [130]:
accuracy_score(test_y,final_pred) 

0.9714285714285714

### （2）平均法

In [131]:
pred1=model1.predict_proba(test_X)
pred2=model2.predict_proba(test_X)
pred3=model3.predict_proba(test_X)

In [132]:
finalpred=(pred1+pred2+pred3)/3 

In [133]:
finalpred

array([[6.69094410e-14, 1.00000000e+00],
       [3.33334219e-01, 6.66665781e-01],
       [9.33332727e-01, 6.66672733e-02],
       [4.55768556e-09, 9.99999995e-01],
       [6.66666702e-02, 9.33333330e-01],
       [5.01596024e-05, 9.99949840e-01],
       [3.99947462e-01, 6.00052538e-01],
       [2.88894834e-11, 1.00000000e+00],
       [6.66666670e-02, 9.33333333e-01],
       [2.44608038e-11, 1.00000000e+00],
       [1.81696510e-04, 9.99818303e-01],
       [2.39556523e-11, 1.00000000e+00],
       [3.65972493e-09, 9.99999996e-01],
       [2.33391084e-12, 1.00000000e+00],
       [3.99941572e-01, 6.00058428e-01],
       [4.44829359e-14, 1.00000000e+00],
       [9.19575527e-12, 1.00000000e+00],
       [3.14452908e-11, 1.00000000e+00],
       [4.46939434e-09, 9.99999996e-01],
       [4.63568443e-11, 1.00000000e+00],
       [1.07973926e-10, 1.00000000e+00],
       [5.86532304e-11, 1.00000000e+00],
       [8.53473588e-11, 1.00000000e+00],
       [8.61266614e-12, 1.00000000e+00],
       [5.739853

In [134]:
finalpred.shape

(105, 2)

In [135]:
final_pred = np.array([])
for i in range(0,len(test_X)):
    temp=finalpred[i]
    c=np.where(temp==np.max(temp))
    final_pred=np.append(final_pred,c)   

In [136]:
final_pred

array([1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1.])

In [137]:
accuracy_score(test_y,final_pred) 

  score = y_true == y_pred


0.0

### （3）加权平均法

In [64]:
finalpred=(pred1*0.1+pred2*0.5+pred3*0.4)
finalpred

array([[1.12064690e-07, 9.99999888e-01],
       [1.34730449e-11, 1.00000000e+00],
       [1.88375449e-10, 1.00000000e+00],
       [6.98808346e-01, 3.01191654e-01],
       [2.97515841e-06, 9.99997025e-01],
       [2.02862132e-04, 9.99797138e-01],
       [5.61883952e-06, 9.99994381e-01],
       [1.00000000e-01, 9.00000000e-01],
       [7.85238541e-13, 1.00000000e+00],
       [1.38937306e-10, 1.00000000e+00],
       [1.00001104e-01, 8.99998896e-01],
       [8.63443578e-01, 1.36556422e-01],
       [8.99970849e-01, 1.00029151e-01],
       [2.18713048e-11, 1.00000000e+00],
       [2.65361066e-12, 1.00000000e+00],
       [4.65760763e-13, 1.00000000e+00],
       [6.90832395e-01, 3.09167605e-01],
       [1.06499369e-08, 9.99999989e-01],
       [4.00123758e-01, 5.99876242e-01],
       [1.00000000e-01, 9.00000000e-01],
       [1.77635684e-16, 1.00000000e+00],
       [8.26005930e-15, 1.00000000e+00],
       [3.52130483e-08, 9.99999965e-01],
       [1.00979047e-10, 1.00000000e+00],
       [3.457359

In [65]:
final_pred = np.array([])
for i in range(0,len(test_X)):
    temp=finalpred[i]
    c=np.where(temp==np.max(temp))
    final_pred=np.append(final_pred,c) 

In [66]:
final_pred

array([1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1.])

In [47]:
accuracy_score(test_y,final_pred)

0.9722222222222222

## 高级集成技术

### （1）堆叠（Stacking）

In [138]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
stacking_model= StackingClassifier(estimators=[('决策树模型', model1), ('KNN', model2),('逻辑回归', model2)], 
                                   final_estimator=LogisticRegression(max_iter=10000))

In [139]:
stacking_model.fit(train_X,train_y)

StackingClassifier(estimators=[('决策树模型',
                                DecisionTreeClassifier(random_state=123)),
                               ('KNN', KNeighborsClassifier()),
                               ('逻辑回归', KNeighborsClassifier())],
                   final_estimator=LogisticRegression(max_iter=10000))

In [140]:
stacking_model.score(test_X,test_y)

0.9809523809523809

### （2）bagging

In [154]:
from sklearn.ensemble import BaggingClassifier
bagging_model= BaggingClassifier(base_estimator=model1,n_estimators=15)

In [155]:
bagging_model.fit(train_X,train_y)

BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=123),
                  n_estimators=15)

In [156]:
bagging_model.score(test_X,test_y)

0.9809523809523809

### （3）adaboost

In [157]:
from sklearn.ensemble import AdaBoostClassifier
adaboost_model=AdaBoostClassifier(base_estimator=model1,n_estimators=15)

In [158]:
adaboost_model.fit(train_X,train_y)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=123),
                   n_estimators=15)

In [159]:
adaboost_model.score(test_X,test_y)

0.9428571428571428

### (4) 随机森林

In [162]:
from sklearn.ensemble import RandomForestClassifier
RF_model=RandomForestClassifier(n_estimators=15)

In [163]:
RF_model.fit(train_X,train_y)

RandomForestClassifier(n_estimators=15)

In [164]:
RF_model.score(test_X,test_y)

0.9809523809523809