In [1]:
import numpy as np
import glob
import matplotlib as m
# ==============================================================================
# CONSTANTS
# ==============================================================================
EN_DOCS = 18758 # number of samples of english documents
EN_TOKENS = 21531 + 1 # number of features of english documents

# Functions
def create_dataframe (numRows, numCols, document):
    datas = {} 
    datas['data'] =  np.zeros(shape=(EN_DOCS, EN_TOKENS), dtype='float32')
    aux_list = []
    for (index, value) in enumerate(document):
        doc_splited = value.split(' ')
        doc_splited.remove('\n') # remove end of line character
        for y in doc_splited:
            splited = y.split(':')
            if len(splited) == 1:
                aux_list.append(splited[0])
            else:
                datas['data'][index][int(splited[0])] = float(splited[1])
    datas['target'] = np.asarray(aux_list)
    return datas

# ==============================================================================
# Here we can import all files from each path with glob lib using string matcher
# If we have opened more than one file we need to map our file_names and create
# ==============================================================================
# You must have to unconpress the rcv1rcv2aminigoutte.tar.gz to create
# rcv1rcv2aminigoutte path with the files
#===============================================================================
file_names = glob.glob('rcv1rcv2aminigoutte/EN/Index_EN-EN')
with open(file_names[0]) as document:
    file_read = document.readlines()
    
# ==============================================================================
# ML
# ==============================================================================
datas = create_dataframe(EN_DOCS, EN_TOKENS, file_read)
del(file_read)
document.close()

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
# ==============================================================================
# Getting feature and data values
# ==============================================================================
y = datas['target']
X = datas['data']
del(datas)

# ==============================================================================
# Applying feature selection
# ==============================================================================
clf = ExtraTreesClassifier()
clf = clf.fit(X, y)
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
print('Data matrix shape before feature selection: ' + str(X.shape))
print('Data matrix shape after feature selection: ' + str(X_new.shape))

Data matrix shape before feature selection: (18758, 21532)
Data matrix shape after feature selection: (18758, 3004)


In [3]:
# ==============================================================================
# Splitting datasets
# ==============================================================================
print('Splitting dataset to train and test....')
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

del(X)
del(X_new)

Splitting dataset to train and test....


In [6]:
# ==============================================================================
# Finding the best K
# ==============================================================================
print('Finding the best k for number of neighbors...')
k_scores = []
for k in range(1, 11):
    print('For K = ' + str(k) + '.....')
    print('Fitting KNN classifier....')
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # ==============================================================================
    # Getting "model" accuracy.
    # "model" because KNN doesn't explicity learn a model. This classifier chooses
    # to memorizes the training instances which are subsequently used as "knowledge"
    # on prediction phase (instance-based learning algorithm)
    # ==============================================================================
    print('Getting scores....')
    knn_score = knn.score(X_test, y_test)
    print('Score for k = ' + str(k) + ': ' + str(knn_score))
    k_scores.append(knn_score)

Finding the best k for number of neighbors...
For K = 1.....
Fitting KNN classifier....
Getting scores....
Score for k = 1: 0.6443496801705757
For K = 2.....
Fitting KNN classifier....
Getting scores....
Score for k = 2: 0.5479744136460555
For K = 3.....
Fitting KNN classifier....
Getting scores....
Score for k = 3: 0.5831556503198294
For K = 4.....
Fitting KNN classifier....
Getting scores....
Score for k = 4: 0.5889125799573561
For K = 5.....
Fitting KNN classifier....
Getting scores....
Score for k = 5: 0.5867803837953092
For K = 6.....
Fitting KNN classifier....
Getting scores....
Score for k = 6: 0.5874200426439232
For K = 7.....
Fitting KNN classifier....
Getting scores....
Score for k = 7: 0.5914712153518124
For K = 8.....
Fitting KNN classifier....
Getting scores....
Score for k = 8: 0.5859275053304904
For K = 9.....
Fitting KNN classifier....
Getting scores....
Score for k = 9: 0.591684434968017
For K = 10.....
Fitting KNN classifier....
Getting scores....
Score for k = 10: 0.

In [7]:
np_k_scores = np.asarray(k_scores)
best_k = np.argmax(np_k_scores) + 1
print('Best k to KNN Classifier: ' + str(best_k))
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
print('Getting scores....')
y_pred = knn.predict(X_test)


from sklearn.metrics import classification_report

print('Getting classification report....')
report = classification_report(y_test, y_pred)
print(report)

Best k to KNN Classifier: 1
Getting scores....
Getting classification report....
             precision    recall  f1-score   support

        C15       0.61      0.91      0.73      1276
       CCAT       0.67      0.34      0.45      1083
        E21       0.88      0.54      0.67       308
       ECAT       0.79      0.45      0.57       514
       GCAT       0.60      0.80      0.69      1207
        M11       0.95      0.46      0.62       302

avg / total       0.68      0.64      0.62      4690

