In [22]:
import numpy as np
import os
from sklearn.cluster import KMeans
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from scipy.sparse import csr_matrix

In [4]:
def load_data(data_path, dataset):
    """Read one TF-IDF split and return it as COO-style coordinate arrays.

    The split file is ``data_path + '20news_' + dataset + '_tfidf.txt'``.
    Each line holds three ``<fff>``-separated fields: an integer label, a
    field that is ignored here (presumably a document identifier -- confirm
    against the preprocessing step), and whitespace-separated
    ``vocab_index:tf_idf_value`` tokens.

    Args:
        data_path: Directory prefix. File names are appended by plain string
            concatenation, so it must already end with a path separator.
        dataset: Split name inserted into the file name (the notebook uses
            'full', 'train' and 'test').

    Returns:
        A 6-tuple ``(labels, tf_idf, doc_idx, vocab_idx, dataset_size,
        vocab_size)``. ``labels`` has one entry per line. ``tf_idf``,
        ``doc_idx`` and ``vocab_idx`` are parallel arrays with one entry per
        token (value, row = line number, column = vocabulary index).
        ``dataset_size`` is the number of lines in the split file and
        ``vocab_size`` is the number of lines in ``words_idfs.txt``.
    """
    split_file = data_path + '20news_' + dataset + '_tfidf.txt'
    with open(split_file) as handle:
        documents = handle.read().splitlines()

    labels, values, rows, cols = [], [], [], []
    for row, document in enumerate(documents):
        fields = document.split('<fff>')
        labels.append(int(fields[0]))
        # fields[1] is deliberately skipped; only the label and the sparse
        # vector in fields[2] are used.
        for token in fields[2].split():
            pieces = token.split(':')
            values.append(float(pieces[1]))
            rows.append(row)
            cols.append(int(pieces[0]))

    # Only the vocabulary length is needed, to fix the matrix column count.
    with open(data_path + 'words_idfs.txt') as handle:
        vocab_size = len(handle.read().splitlines())

    return (
        np.array(labels),
        np.array(values),
        np.array(rows),
        np.array(cols),
        len(documents),
        vocab_size,
    )

# K-Means

In [5]:
def clustering_with_KMeans(path):
    """Fit K-Means on the 'full' TF-IDF split and return the fitted model.

    Args:
        path: Directory prefix handed to ``load_data``.

    Returns:
        The fitted ``KMeans`` estimator.
    """
    # Clustering is unsupervised, so the labels returned by load_data are dropped.
    _, values, rows, cols, n_docs, n_terms = load_data(path, dataset = 'full')
    X = csr_matrix((values, (rows, cols)), shape = (n_docs, n_terms))

    # Marker printed once the matrix is built and before the fit starts.
    print('--------------')

    model = KMeans(
        n_clusters = 20,
        init = 'random',
        n_init = 5,           # number of runs with different initial centroids
        tol = 1e-3,           # convergence threshold
        random_state = 2018,  # seed, so the clustering is reproducible
    )
    model.fit(X)
    return model
    

In [6]:
# Build the dataset directory with os.path.join so the notebook is not tied to
# Windows path separators. The trailing '' keeps a trailing separator, which
# load_data needs because it appends file names by string concatenation.
path = os.path.join('..', 'datasets', '20news-bydate', '')
k_means = clustering_with_KMeans(path = path)

--------------


In [7]:
# Display the centroids of the fitted model returned by clustering_with_KMeans.
k_means.cluster_centers_

array([[0.00000000e+00, 0.00000000e+00, 4.20829291e-04, ...,
        6.77217674e-04, 5.21527356e-05, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        7.07271997e-04, 5.50333086e-05, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        8.10727782e-04, 7.08597261e-05, 0.00000000e+00],
       ...,
       [2.07833837e-04, 7.97894043e-04, 6.95846695e-05, ...,
        5.36674538e-04, 4.69161363e-05, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.03462005e-03, ...,
        7.90950011e-04, 5.96664713e-05, 0.00000000e+00],
       [0.00000000e+00, 1.07023418e-03, 0.00000000e+00, ...,
        7.62676436e-04, 5.96495460e-05, 0.00000000e+00]])

# Linear SVM

In [19]:
def compute_accuracy(y_predicted, y_expected):
    """Return the fraction of predictions that equal the expected labels.

    Args:
        y_predicted: Predicted labels. Any array-like is accepted; a scalar is
            compared against every expected label.
        y_expected: Ground-truth labels. Any array-like is accepted.

    Returns:
        Number of matching positions divided by the number of expected labels.
        For empty input the division is 0/0, so the result is nan.

    Raises:
        ValueError: If the element-wise comparison does not have the same
            shape as ``y_expected``.
    """
    # Accept lists/tuples as well as ndarrays (the bare `.size` needs an array).
    y_predicted = np.asarray(y_predicted)
    y_expected = np.asarray(y_expected)

    matches = np.equal(y_predicted, y_expected)
    # Guard against silent broadcasting: comparing shape (n, 1) with (n,)
    # yields an (n, n) matrix, and summing it would give a wrong accuracy.
    if matches.shape != y_expected.shape:
        raise ValueError(
            'shape mismatch: y_predicted %s does not align with y_expected %s'
            % (y_predicted.shape, y_expected.shape)
        )

    accuracy = np.sum(matches.astype('float')) / y_expected.size
    return accuracy

In [23]:
def classifying_with_linear_SVMs(path):
    """Train a linear SVM on the 'train' split and print accuracy on 'test'.

    Args:
        path: Directory prefix handed to ``load_data``.

    Returns:
        None. The accuracy is printed as ``accuracy:  <value>``.
    """
    y_train, tf_idf, doc_idx, vocab_idx, dataset_size, vocab_size = load_data(path, dataset = 'train')
    X_train = csr_matrix((tf_idf, (doc_idx, vocab_idx)), shape = (dataset_size, vocab_size))

    classifier = LinearSVC(
        C = 10.0,
        tol = 0.001,
        # LinearSVC's solver can use random numbers, so pin the seed to make
        # the reported accuracy reproducible; same seed as the K-Means cell.
        random_state = 2018,
    ).fit(X_train, y_train)

    # The test split is loaded only after fitting; load_data's vocab_size
    # gives it the same number of columns as the training matrix.
    y_test, tf_idf, doc_idx, vocab_idx, dataset_size, vocab_size = load_data(path, dataset = 'test')
    X_test = csr_matrix((tf_idf, (doc_idx, vocab_idx)), shape = (dataset_size, vocab_size))
    y_predicted = classifier.predict(X_test)
    accuracy = compute_accuracy(y_predicted = y_predicted, y_expected = y_test)
    print('accuracy: ', accuracy)

In [24]:
# Fit the linear SVM on the 'train' split and print its accuracy on 'test'.
classifying_with_linear_SVMs(path)

accuracy:  0.8224907063197026


# Kernel SVM

In [26]:
def classifying_with_kernel_SVMs(path):
    """Train an RBF-kernel SVM on the 'train' split and print accuracy on 'test'.

    Args:
        path: Directory prefix handed to ``load_data``.

    Returns:
        None. The accuracy is printed as ``accuracy:  <value>``.
    """
    train_labels, train_vals, train_rows, train_cols, n_train, n_terms = load_data(path, dataset = 'train')
    train_matrix = csr_matrix((train_vals, (train_rows, train_cols)), shape = (n_train, n_terms))

    model = SVC(C = 50.0, kernel = 'rbf', gamma = 0.1, tol = 0.001)
    model.fit(train_matrix, train_labels)

    # The test split is read only after the model has been fitted.
    test_labels, test_vals, test_rows, test_cols, n_test, n_terms = load_data(path, dataset = 'test')
    test_matrix = csr_matrix((test_vals, (test_rows, test_cols)), shape = (n_test, n_terms))

    accuracy = compute_accuracy(
        y_predicted = model.predict(test_matrix),
        y_expected = test_labels,
    )
    print('accuracy: ', accuracy)

In [27]:
# Fit the RBF-kernel SVM on the 'train' split and print its accuracy on 'test'.
classifying_with_kernel_SVMs(path)

accuracy:  0.8194370685077005
