# 搭建 RandomForest 分类器，对手写数字进行识别

<font color="red">数据挖掘中，特征工程十分重要；但此次手写数字识别的数据已经是排列好的像素值，所以直接对数据做标准化后处理即可。</font>

In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn import preprocessing
import matplotlib as mpl
import matplotlib.pyplot as plt
# Read the input (training) data from CSV.
train_data = pd.read_csv("data/train.csv")


In [2]:
# Make SimHei the preferred sans-serif font and select the sans-serif family —
# presumably so the Chinese titles/labels used in the plots render correctly
# (requires SimHei to be installed on the machine; verify).
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.family']='sans-serif'

预处理：需要对像素进行 normalization（标准化）

In [3]:
# Split the training frame into the target and the pixel features.
# The label is selected with single brackets so it is a 1-D Series: a
# one-column DataFrame is a column vector, which scikit-learn classifiers
# warn about (DataConversionWarning) when it is passed as y.
train_data_label = train_data['label']
train_data_pixel = train_data.drop('label',axis =1)

In [4]:
# Standardize the pixel values to zero mean and unit variance.
# NOTE(review): axis=1 makes scale() standardize each row (one image) on its
# own rather than each pixel column — confirm this is intended. The result is
# not consumed by the training cell below, which is given train_data_pixel.
scaler_train_pixel = preprocessing.scale(train_data_pixel,axis=1)

<font color="red">建立随机森林模型并进行训练</font>

In [5]:
from sklearn.model_selection import learning_curve


# Plot the learning curve
def plot_learning_curve(estimator,title,X,y,ylim=None,cv=None,
                         n_jobs=-1,train_sizes=np.linspace(.1,1.0,5)):
    """Draw training and cross-validation score curves for ``estimator``.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        title: Title of the figure.
        X: Feature data handed to ``learning_curve``.
        y: Target data handed to ``learning_curve``.
        ylim: Optional ``(low, high)`` pair unpacked into ``plt.ylim``.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallelism setting forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel(u'训练样本数')
    plt.ylabel(u'训练得分')

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y,
        cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=1,
    )

    # One (mean, std, colour, legend label) record per curve; the statistics
    # are taken across the folds for every training-set size.
    curves = (
        (fit_scores, 'r', u'训练得分'),
        (cv_scores, 'g', u'交叉验证得分'),
    )
    stats = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, caption)
        for scores, colour, caption in curves
    ]

    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, colour, _ in stats:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, caption in stats:
        plt.plot(sizes, mean, 'o-', color=colour, label=caption)

    plt.legend(loc='best')
    return plt

In [6]:
def build_random_forest_model(X_train_data,y_train_data,estimator = 'RandomForest',plot = True,
                              n_estimators=1000,min_samples_split=5,random_state=None):
    """Train a random-forest classifier, optionally drawing its learning curve first.

    Args:
        X_train_data: Training features passed to the classifier.
        y_train_data: Training targets; flattened to 1-D before use, so a
            one-column DataFrame, a Series or an array are all accepted.
        estimator: Unused; kept so existing callers that pass it keep working.
        plot: When truthy, call ``plot_learning_curve`` on the untrained
            classifier before fitting it.
        n_estimators: Number of trees in the forest (default 1000, the value
            that was previously hard-coded).
        min_samples_split: Minimum samples needed to split a node (default 5,
            the value that was previously hard-coded).
        random_state: Seed forwarded to the classifier; ``None`` keeps the
            previous unseeded behaviour, an int makes training reproducible.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # scikit-learn expects y of shape (n_samples,); a column vector triggers a
    # DataConversionWarning, so flatten it once up front.
    y_flat = np.ravel(y_train_data)

    rnf_clf = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        random_state=random_state,
    )
    if plot:
        plot_learning_curve(rnf_clf,u'学习曲线',X_train_data,y_flat)
    rnf_clf.fit(X_train_data,y_flat)

    return rnf_clf
    

In [7]:
# Train the forest on the unscaled pixel columns; plot=False skips drawing the
# learning curve before fitting.
random_forest_clf = build_random_forest_model(train_data_pixel,train_data_label,plot = False)

  


In [5]:
# Load the test set.
test_data = pd.read_csv('data/test.csv')
# Row-wise standardization of the test pixels.
# NOTE(review): the prediction cell below passes test_data, not this array.
scaler_test_pixel =  preprocessing.scale(test_data,axis=1)
# Number of test rows; used to number the submission rows.
num=test_data.shape[0]

In [6]:
# Predict a label for every test row and pair it with a 1-based ImageId.
# Requires random_forest_clf from the training cell to exist in the current
# kernel session (run the training cell first).
test_data_predict = random_forest_clf.predict(test_data)
submission = pd.DataFrame({'ImageId':list(range(1,num+1)),'Label':test_data_predict})

NameError: name 'random_forest_clf' is not defined

In [None]:
# Write the predictions to a comma-separated file without the index column.
submission.to_csv('data/submission_result.csv',index = False ,sep=',')

In [None]:
# Count how many test rows were assigned to each predicted label.
submission.Label.value_counts()

In [7]:
# Count the training rows per label.
train_data.label.value_counts()

1    4684
7    4401
3    4351
9    4188
2    4177
6    4137
0    4132
4    4072
8    4063
5    3795
Name: label, dtype: int64

# 最终准确率 96.7％还不算太差

In [8]:
# Show the training labels as a NumPy array.
train_data.label.values

array([1, 0, 1, ..., 7, 6, 9], dtype=int64)

In [9]:
# Pixel columns of the training set (everything except 'label') as a NumPy array.
# NOTE(review): despite the y_ prefix this holds the features, not the labels.
y_tr = train_data.drop('label',axis =1).values

In [10]:
# Reshape every row into a 28x28 grid with a trailing axis of length 1.
y_tr  = y_tr.reshape(-1,28,28,1)

In [11]:
# Convert to float32 and divide by the largest value in the array.
y_tr = y_tr.astype(np.float32)/y_tr.max()

In [12]:
# Display the second sample.
# NOTE(review): this echoes the whole 28x28x1 array; y_tr[1].shape or a small
# slice would be a lighter sanity check.
y_tr[1]

array([[[0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ]],

       [[0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        

KeyboardInterrupt: 