In [1]:
import numpy as np
from sklearn.datasets import load_svmlight_file

# ==============================================================================
# You must have to unconpress the rcv1rcv2aminigoutte.tar.gz to create
# rcv1rcv2aminigoutte path with the files
#===============================================================================

# Categories of documents
categories = {'C15': 4587, 'CCAT': 8745, 'E21': 9625, 'ECAT': 5656, 'GCAT': 8745, 'M11': 45845}

# Convert target from string float target in file 
def filter_datas(file, target_file):
    with open(file, 'r') as document_read:
        with open(target_file, 'w') as document_write:
            for line in document_read:
                target = line.split(None, 1)[0]
                line_to_write = '%s %s' % (categories[target], line[len(target)+1:len(line)])
                document_write.write(line_to_write)
        
    return target_file

# Loading datasets in svmlight format.
file = filter_datas('rcv1rcv2aminigoutte/EN/Index_EN-EN', 'svml_en_en.txt')
X,  y = load_svmlight_file(file)

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# ==============================================================================
# Applying feature selection
# ==============================================================================
clf = ExtraTreesClassifier()
clf = clf.fit(X, y)
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
print('Data matrix shape before feature selection: ' + str(X.shape))
print('Data matrix shape after feature selection: ' + str(X_new.shape))

Data matrix shape before feature selection: (18758, 21531)
Data matrix shape after feature selection: (18758, 3008)


In [3]:
# ==============================================================================
# Splitting datasets
# ==============================================================================
print('Splitting dataset to train and test....')
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

Splitting dataset to train and test....


In [4]:
# ==============================================================================
# Finding the best K
# ==============================================================================
print('Finding the best k for number of neighbors...')
k_scores = []
for k in range(1, 11):
    print('For K = ' + str(k) + '.....')
    print('Fitting KNN classifier....')
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # ==============================================================================
    # Getting "model" accuracy.
    # "model" because KNN doesn't explicity learn a model. This classifier chooses
    # to memorizes the training instances which are subsequently used as "knowledge"
    # on prediction phase (instance-based learning algorithm)
    # ==============================================================================
    print('Getting scores....')
    knn_score = knn.score(X_test, y_test)
    print('Score for k = ' + str(k) + ': ' + str(knn_score))
    k_scores.append(knn_score)

Finding the best k for number of neighbors...
For K = 1.....
Fitting KNN classifier....
Getting scores....
Score for k = 1: 0.7628997867803838
For K = 2.....
Fitting KNN classifier....
Getting scores....
Score for k = 2: 0.7528784648187633
For K = 3.....
Fitting KNN classifier....
Getting scores....
Score for k = 3: 0.7200426439232409
For K = 4.....
Fitting KNN classifier....
Getting scores....
Score for k = 4: 0.7191897654584222
For K = 5.....
Fitting KNN classifier....
Getting scores....
Score for k = 5: 0.7017057569296375
For K = 6.....
Fitting KNN classifier....
Getting scores....
Score for k = 6: 0.6997867803837953
For K = 7.....
Fitting KNN classifier....
Getting scores....
Score for k = 7: 0.691044776119403
For K = 8.....
Fitting KNN classifier....
Getting scores....
Score for k = 8: 0.6993603411513859
For K = 9.....
Fitting KNN classifier....
Getting scores....
Score for k = 9: 0.6895522388059702
For K = 10.....
Fitting KNN classifier....
Getting scores....
Score for k = 10: 0.

In [None]:
np_k_scores = np.asarray(k_scores)
best_k = np.argmax(np_k_scores) + 1
print('Best k to KNN Classifier: ' + str(best_k))
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
print('Getting scores....')
y_pred = knn.predict(X_test)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print('Getting classification report....')
confusion = confusion_matrix(y_test, y_pred)
print(confusion)
report = classification_report(y_test, y_pred)
print(report)