## Creating function to filter the data

In [1]:
import numpy as np
from sklearn.datasets import load_svmlight_file

# ==============================================================================
# You must have to unconpress the rcv1rcv2aminigoutte.tar.gz to create
# rcv1rcv2aminigoutte path with the files
#===============================================================================

# Categories of documents
categories = {'C15': 4587, 'CCAT': 8745, 'E21': 9625, 'ECAT': 5656, 'GCAT': 8745, 'M11': 45845}

# Convert target from string float target in file 
def filter_datas(file, target_file):
    with open(file, 'r') as document_read:
        with open(target_file, 'w') as document_write:
            for line in document_read:
                target = line.split(None, 1)[0]
                line_to_write = '%s %s' % (categories[target], line[len(target)+1:len(line)])
                document_write.write(line_to_write)
        
    return target_file

# Loading datasets in svmlight format.
file = filter_datas('rcv1rcv2aminigoutte/EN/Index_EN-EN', 'svml_en_en.txt')
x,  y = load_svmlight_file(file)

### Selecting important variables for model building

* Um dos benefícios do RandomForestClassifier é que se pode trabalhar com muitos dados e pode reduzir a dimensionalidade, pois ele identifica as variáveis mais importantes. 


In [3]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

model = RandomForestClassifier()
model.fit(x, y)

featimp = pd.Series(model.feature_importances_).sort_values(ascending=False)
print(featimp)

59       0.014113
28       0.013417
79       0.010513
103      0.010377
54       0.009870
102      0.009530
18       0.009480
165      0.009005
142      0.008308
10       0.008136
158      0.007486
270      0.006930
25       0.006391
26       0.006285
224      0.006145
14       0.005928
189      0.005837
8        0.005807
216      0.005295
7        0.005196
34       0.005177
3        0.005131
650      0.004905
1        0.004756
80       0.004736
24       0.004667
94       0.004570
98       0.004533
211      0.004509
498      0.004481
           ...   
13042    0.000000
13043    0.000000
13044    0.000000
13045    0.000000
13050    0.000000
13018    0.000000
13051    0.000000
13052    0.000000
13055    0.000000
13059    0.000000
13060    0.000000
13061    0.000000
13038    0.000000
13036    0.000000
13035    0.000000
13034    0.000000
13033    0.000000
13032    0.000000
13031    0.000000
13030    0.000000
13029    0.000000
13028    0.000000
13027    0.000000
13025    0.000000
13024    0

In [4]:
from scipy import sparse

A = sparse.coo_matrix(x, y)
print(A)

  (0, 0)	-3.818658
  (0, 1)	0.922576
  (0, 4)	1.294301
  (0, 16)	2.467734
  (0, 17)	2.540152
  (0, 19)	2.586254
  (0, 24)	2.404756
  (0, 49)	3.308865
  (0, 51)	3.0313
  (0, 72)	3.266316
  (0, 76)	3.624791
  (0, 82)	3.66837
  (0, 101)	3.750902
  (0, 109)	3.825625
  (0, 119)	3.925834
  (0, 134)	4.013698
  (0, 145)	3.763491
  (0, 160)	4.230347
  (0, 169)	4.266229
  (0, 172)	4.270767
  (0, 191)	3.706258
  (0, 195)	4.353002
  (0, 234)	4.542887
  (0, 236)	3.470596
  (0, 252)	4.619306
  :	:
  (18757, 3651)	5.689319
  (18757, 4026)	5.820831
  (18757, 4294)	5.893602
  (18757, 4297)	5.893602
  (18757, 4563)	5.972013
  (18757, 4729)	6.035087
  (18757, 4833)	6.049433
  (18757, 4854)	6.057017
  (18757, 5036)	6.102357
  (18757, 5537)	6.25203
  (18757, 5866)	6.336096
  (18757, 6080)	6.396297
  (18757, 6401)	6.460316
  (18757, 6466)	6.493915
  (18757, 6502)	6.493915
  (18757, 7613)	6.723052
  (18757, 7794)	6.752754
  (18757, 8233)	6.860645
  (18757, 9117)	7.011233
  (18757, 10402)	7.209414
  (18757, 1

In [5]:
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix

def plot_coo_matrix(m):
    if not isinstance(m, coo_matrix):
        m = coo_matrix(m)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(m.col, m.row, 's', color='black', ms=1)
    ax.set_xlim(0, m.shape[1])
    ax.set_ylim(0, m.shape[0])
    ax.set_aspect('equal')
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.invert_yaxis()
    ax.set_aspect('equal')
    ax.set_xticks([])
    ax.set_yticks([])
    return ax
    

shape = (100000, 100000)
rows = np.int_(np.round_(shape[0]*np.random.random(1000)))
cols = np.int_(np.round_(shape[1]*np.random.random(1000)))
vals = np.ones_like(rows)

plot_coo_matrix(A)

<matplotlib.axes._subplots.AxesSubplot at 0x7f0c9d77afd0>