In [3]:
import numpy as np
import glob

# ==============================================================================
# CONSTANTS
# ==============================================================================
EN_DOCS = 18758 # number of samples of english documents
EN_TOKENS = 21531 + 1 # number of features of english documents

# Functions
def createArray (numRows, numCols, document):
    datas = {} 
    datas['data'] =  np.zeros(shape=(EN_DOCS, EN_TOKENS), dtype='float32')
    aux_list = []
    for (index, value) in enumerate(document):
        doc_splited = value.split(' ')
        doc_splited.remove('\n') # remove end of line character
        for y in doc_splited:
            splited = y.split(':')
            if len(splited) == 1:
                aux_list.append(splited[0])
            else:
                datas['data'][index][int(splited[0])] = float(splited[1])
    datas['target'] = np.asarray(aux_list)
    return datas

# ==============================================================================
# Here we can import all files from each path with glob lib using string matcher
# If we have opened more than one file we need to map our file_names and create
# ==============================================================================
# You must have to unconpress the rcv1rcv2aminigoutte.tar.gz to create
# rcv1rcv2aminigoutte path with the files
#===============================================================================
file_names = glob.glob('rcv1rcv2aminigoutte/EN/Index_EN-EN')
with open(file_names[0]) as document:
    file_read = document.readlines()
    
# ==============================================================================
# ML
# ==============================================================================
datas = createArray(EN_DOCS, EN_TOKENS, file_read)
del(file_read)
document.close()

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
# ==============================================================================
# Getting feature and data values
# ==============================================================================
y = datas['target']
X = datas['data']
del(datas)

# ==============================================================================
# Applying feature selection
# ==============================================================================
clf = ExtraTreesClassifier()
clf = clf.fit(X, y)
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
print('Data matrix shape before feature selection: ' + str(X.shape))
print('Data matrix shape after feature selection: ' + str(X_new.shape))

# ==============================================================================
# Splitting datasets
# ==============================================================================
knn = KNeighborsClassifier(n_neighbors=6)
print('Splitting dataset to train and test ones....')
X_train, X_test, y_train, y_test = train_test_split(X_new,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

del(X)
del(X_new)

# ==============================================================================
# Fitting KNN classifier
# ==============================================================================
print('Fitting KNN classifier....')
knn.fit(X_train, y_train)

# ==============================================================================
# Getting "model" accuracy.
# "model" because KNN doesn't explicity learn a model. This classifier chooses
# to memorizes the training instances which are subsequently used as "knowledge"
# on prediction phase (instance-based learning algorithm)
# ==============================================================================
print('Getting scores....')
knn_score = knn.score(X_test, y_test)
print(knn_score)

Data matrix shape before feature selection: (18758, 21532)
Data matrix shape after feature selection: (18758, 3022)
Splitting dataset to train and test ones....
Fitting KNN classifier....
Getting scores....
0.5488272921108742
