In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
data = load_svmlight_file('data/demo-g3.txt')
X,y=data[0],data[1]
X_s,y_s=resample(X,y,n_samples=150000,replace=False)
X_train,X_test,y_train,y_test = train_test_split(X_s,y_s,test_size=.3,random_state=42)

In [7]:
rfc = RandomForestClassifier(n_estimators=300,max_depth=None,n_jobs=-1,verbose=1)

In [8]:
rfc.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 22.7min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 37.9min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [9]:
y_test_pred=rfc.predict(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   11.5s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:   20.1s finished


In [10]:
y_train_pred=rfc.predict(X_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   18.9s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:   30.4s finished


In [14]:
from sklearn.metrics import precision_score, precision_recall_curve, recall_score, roc_curve, auc, confusion_matrix
import numpy as np

In [20]:
def model_eva(model,X_test,y_test):
    y_test_pred=model.predict(X_test)
    print('precision_score:',precision_score(y_test,y_test_pred,average='macro'))
    print(confusion_matrix(y_test,y_test_pred).T)
    print(np.unique(y_test,return_counts=True))
    print(np.unique(y_test_pred,return_counts=True))

In [21]:
model_eva(rfc,X_test,y_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   12.4s


precision_score: 0.15607332841392643
[[1456  980  899  576  494  385  295  224  145  110  178  262]
 [ 457  416  377  247  247  157  113   83   58   41   73   84]
 [ 722  593 1118  936  755  583  404  279  188  146  226  133]
 [ 331  335  930 1169 1035  752  490  345  249  172  229   70]
 [ 334  266  750 1036 1139  962  707  468  335  233  345   81]
 [ 217  195  471  609  818  747  615  424  312  233  366   69]
 [  80   84  182  255  338  321  258  229  154  148  205   15]
 [  50   49  109  128  165  204  180  142  155  117  178   10]
 [  33   35   66   78  110   99   93   87   77   64  128    6]
 [  32   15   48   62   83   83   85   58   69   58  113    3]
 [ 136  118  288  328  479  588  676  752  741  799 2169   29]
 [  53   34   46   21   17   28   13   11    9    5   19   21]]
(array([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14., 15.]), array([3901, 3120, 5284, 5445, 5680, 4909, 3929, 3102, 2492, 2126, 4229,
        783]))
(array([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11., 

[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:   19.3s finished


In [24]:
rfc_28 = RandomForestClassifier(n_estimators=3000,max_depth=10,n_jobs=-1,verbose=1)
rfc_28.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   33.2s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   57.0s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:  4.4min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=3000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [25]:
y_test_pred=rfc_28.predict(X_test)
print(precision_score(y_test,y_test_pred,average='macro'))
print(precision_score(y_test,y_test_pred,average='micro'))
print(confusion_matrix(y_test,y_test_pred).T)
print(np.unique(y_test,return_counts=True))
print(np.unique(y_test_pred,return_counts=True))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.1s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    8.4s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:   14.5s
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:   21.8s


0.12177835127795104
0.18873333333333334
[[ 482  296  159   89   81   58   35   35   24   16   47   86]
 [   0    0    0    0    0    0    0    0    0    0    0    0]
 [2497 1930 2582 1953 1729 1232  827  585  382  311  444  513]
 [  51   73  641  841  648  396  276  156   90   55   66    6]
 [ 827  696 1741 2378 2944 2751 2217 1774 1441 1176 2043  252]
 [   4    1   13   36   41   67   55   34   23   21   17    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0]
 [  51   24  106  146  319  402  522  513  504  585 1577    7]
 [   0    0    0    0    0    0    0    0    0    0    0    0]]
(array([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14., 15.]), array([3912, 3020, 5242, 5443, 5762, 4906, 3932, 3097, 2464, 2164, 4194,
        864]))
(array([ 4.,  6.,  7.,  8.,  9., 14.]), array

[Parallel(n_jobs=4)]: Done 3000 out of 3000 | elapsed:   26.2s finished
  'precision', 'predicted', average, warn_for)


In [22]:
rfc.feature_importances_.shape

(18338,)

In [23]:
index = np.arange(18338)
value=zip(index,rfc.feature_importances_)

In [24]:
l=sorted(value,key=lambda x:x[1],reverse=True)

In [38]:
r=l[:100]

In [39]:
mat_index=map(lambda x:x[0],r)

In [40]:
X_s_compact=X_s[:,list(mat_index)]

In [41]:
X_train,X_test,y_train,y_test = train_test_split(X_s_compact,y_s,test_size=.3,random_state=42)

In [42]:
X_train.shape

(105000, 100)

In [30]:
rfc_1000_10 = RandomForestClassifier(n_estimators=1000,max_depth=10,n_jobs=-1,verbose=1)
rfc_1000_10.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.0min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [31]:
model_eva(rfc_1000_10,X_train,y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    4.7s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   10.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   13.6s finished


precision_score: 0.6136882636138721
[[5530 3261 3036 2169 1902 1528 1035  743  541  407  748 1030]
 [   6  312    9    2    8    4    2    3    2    1    3    4]
 [2019 1971 4469 2795 2485 2038 1440 1022  731  584  871  444]
 [ 224  284 1329 2970 1278 1062  739  508  323  196  301   20]
 [ 692  763 2145 2690 5031 2803 2518 1922 1428 1171 1554  155]
 [ 151  150  345  369  331 1734  467  421  360  280  301  143]
 [   3    1    5    4    3    0  328    5    7    7    0    0]
 [   0    0    1    0    0    0    0  141    0    0    0    0]
 [   0    0    0    1    0    0    0    0   79    0    0    0]
 [   0    0    0    0    0    0    0    0    0   62    0    0]
 [ 350  338 1155 1727 2144 2493 2433 2510 2353 2371 6073   88]
 [   0    0    0    0    0    0    0    0    0    0    0    5]]
(array([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14., 15.]), array([ 8975,  7080, 12494, 12727, 13182, 11662,  8962,  7275,  5824,
        5079,  9851,  1889]))
(array([ 4.,  5.,  6.,  7.,  8.,  9.,

In [32]:
model_eva(rfc_1000_10,X_test,y_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    3.7s


precision_score: 0.2099854730159717
[[2189 1532 1460  967  875  621  454  348  225  168  305  415]
 [  20   16    7    6    7    3    3    3    0    0    3    1]
 [ 981  864 1514 1275 1144  898  613  408  343  244  385  192]
 [ 108  161  660  813  704  516  340  230  158  110  115   18]
 [ 329  315  988 1382 1574 1399 1079  816  641  457  778   69]
 [  89   75  154  252  299  283  268  187  138  135  185   54]
 [   2    2    3    4   10   13    9    7    4    3    1    0]
 [   0    0    0    0    1    0    0    0    1    0    0    0]
 [   0    0    0    0    0    0    0    2    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    1    0    0]
 [ 183  155  498  746 1066 1176 1163 1101  982 1008 2457   34]
 [   0    0    0    0    0    0    0    0    0    0    0    0]]
(array([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14., 15.]), array([3901, 3120, 5284, 5445, 5680, 4909, 3929, 3102, 2492, 2126, 4229,
        783]))
(array([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11., 1

[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    4.6s finished
  'precision', 'predicted', average, warn_for)


In [47]:
data = load_svmlight_file('data/demo-g3-v2.txt')
X,y=data[0],data[1]
X_s,y_s=resample(X,y,n_samples=150000,replace=False)
X_train,X_test,y_train,y_test = train_test_split(X_s,y_s,test_size=.3,random_state=42)

In [52]:
rfc_v2 = RandomForestClassifier(n_estimators=300,n_jobs=-1,verbose=1)
rfc_v2.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 33.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 46.4min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [53]:
model_eva(rfc_v2,X_train,y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   23.5s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:   45.6s finished


precision_score: 0.8704674434302873
[[ 4387   379   372   263   258   180   139   128    69    59    97]
 [   81  3109   106    77    77    54    54    37    39    19    28]
 [  176   141  7272   229   250   190   150   110    85    77   114]
 [   83    93   239  9751   264   226   165   147    97    83    77]
 [  126   112   223   297 11342   248   219   160   109   112   129]
 [   84    81   190   203   220 11293   191   141    97    90   134]
 [   58    49   117   140   148   150  9353   108    82    66   119]
 [   30    30    54    83    90   106    72  8102    62    49    73]
 [   17    20    33    41    51    36    47    34  6715    41    44]
 [   18    17    31    36    56    42    43    44    38  6199    54]
 [   69    63   164   183   219   279   275   305   275   294 14244]]
(array([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.]), array([ 5129,  4094,  8801, 11303, 12975, 12804, 10708,  9316,  7668,
        7089, 15113]))
(array([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11.

In [54]:
model_eva(rfc_v2,X_test,y_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   12.1s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:   19.6s finished


precision_score: 0.1523866992741791
[[ 541  379  402  317  284  248  166  151  114   77  118]
 [ 175  115  160  141  130   84   58   56   39   32   49]
 [ 279  243  483  461  429  381  274  174  148  102  158]
 [ 249  199  626  870  801  689  505  326  235  185  268]
 [ 271  260  654  901 1045  954  697  514  381  289  375]
 [ 226  165  489  674  964  937  763  565  428  319  480]
 [ 120   85  267  333  527  566  439  366  268  217  342]
 [  70   51  146  243  319  318  272  247  214  182  293]
 [  36   36   73  113  151  166  139  133  110  116  177]
 [  43   26   62   89  117  137  114  116  114   95  223]
 [ 180  164  442  607  894 1109 1135 1288 1260 1383 3995]]
(array([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.]), array([2190, 1723, 3804, 4749, 5661, 5589, 4562, 3936, 3311, 2997, 6478]))
(array([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.]), array([ 2797,  1039,  3132,  4953,  6341,  6010,  3530,  2355,  1250,
        1136, 12457]))


In [55]:
from sklearn.ensemble import GradientBoostingClassifier

In [56]:
gbc_clf = GradientBoostingClassifier(n_estimators=100,max_depth=2)

In [57]:
gbc_clf.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [58]:
model_eva(gbc_clf,X_test,y_test)

precision_score: 0.17144125852766898
[[ 721  491  502  403  352  315  204  179  116   87  132]
 [  17   10   11    7    6    3    1    2    1    0    4]
 [ 133  116  264  246  240  179  148  105   88   77  115]
 [ 154  156  572  809  736  576  368  218  138   93   91]
 [ 627  497 1222 1548 1726 1588 1179  849  650  547  817]
 [ 120  105  382  628  897  986  770  623  461  365  611]
 [   7    5   19   39   60   72   73   77   51   38   35]
 [   3    3    6   14   15   25   25   27   16   12   27]
 [   2    2    5   11    9    9   15    9   11    8   19]
 [   1    0    0    3    4    6    5    2    7    8   14]
 [ 405  338  821 1041 1616 1830 1774 1845 1772 1762 4613]]
(array([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.]), array([2190, 1723, 3804, 4749, 5661, 5589, 4562, 3936, 3311, 2997, 6478]))
(array([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.]), array([ 3502,    62,  1711,  3911, 11250,  5948,   476,   173,   100,
          50, 17817]))
