# 20 Newsgroups subset

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# The four newsgroups kept for this subset.
categories = ["alt.atheism", "talk.religion.misc", "comp.graphics", "sci.space"]

# Load the selected groups with headers, footers and quoted replies stripped
# from every post; the shuffle is seeded so the document order is reproducible.
dataset = fetch_20newsgroups(
    subset="all",
    categories=categories,
    remove=("headers", "footers", "quotes"),
    shuffle=True,
    random_state=42,
)

labels = dataset.target
# Distinct class ids together with the number of documents in each class.
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = len(unique_labels)

print(f"{len(dataset.data)} documents - {true_k} categories")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
# Build TF-IDF features, dropping English stop words, terms that occur in more
# than half of the documents, and terms that occur in fewer than 5 documents.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=5,
    stop_words="english",
)
t0 = time()
X_tfidf = vectorizer.fit_transform(dataset.data)

print(f"vectorization done in {time() - t0:.3f} s")
print(f"n_samples: {X_tfidf.shape[0]}, n_features: {X_tfidf.shape[1]}")

# toarray() gives a plain ndarray; todense() would return the legacy np.matrix
# type, which NumPy no longer recommends and which behaves differently from
# ndarray under indexing and reductions.
data = X_tfidf.toarray()
# Shift class ids to 1..k on a fresh array. An in-place `labels += 1` would also
# modify dataset.target (labels is the same array object) and would shift the
# ids again every time this cell is re-executed.
labels = dataset.target + 1

## Olivetti faces

In [None]:
from sklearn.datasets import fetch_olivetti_faces
import pandas as pd

In [None]:
# With return_X_y=True the loader hands back a (data, target) pair.
olive = fetch_olivetti_faces(return_X_y=True)
data, face_ids = olive
# Shift the target ids up by one so the smallest label is 1 instead of 0.
labels = face_ids + 1

## RCV1

In [None]:
from sklearn.datasets import fetch_rcv1
import numpy as np
from scipy import ndimage 
import pandas as pd

In [None]:
# Load the training split of RCV1, downloading it first when it is not
# available locally.
rcv1 = fetch_rcv1(
    subset="train",
    download_if_missing=True,
)

In [None]:
# Dense copies of the multi-label indicator matrix and of the feature matrix.
# todense() returns np.matrix objects.
# NOTE(review): densifying the whole feature matrix may need a lot of memory;
# confirm this fits on the target machine.
labels = rcv1.target.todense()
data = rcv1.data.todense()

In [None]:
def labels_matrix_operation(data, matrix_labels):
    """Keep only single-label samples and return their 1-based class index.

    Parameters
    ----------
    data : ndarray or np.matrix, one row per sample
        Feature matrix; must support indexing by an integer index array.
    matrix_labels : ndarray or np.matrix, shape (n_samples, n_classes)
        Binary indicator matrix; entry (i, j) is non-zero when sample i
        carries class j.
        NOTE(review): only `.sum(axis=1)`, `.argmax(axis=1)` and row indexing
        are used, so SciPy sparse matrices are expected to work too -- confirm
        before relying on it.

    Returns
    -------
    new_data : same type as `data`
        The rows of `data` whose label row sums to exactly 1.
    labels : ndarray of float, shape (n_kept,)
        For each kept row, the 1-based column index of its single label.
    """
    # Per-row label counts, flattened to 1-D so that ndarray and np.matrix
    # (whose axis reductions stay 2-D) are handled the same way.
    row_sums = np.asarray(matrix_labels.sum(axis=1)).ravel()
    single_label_rows = np.flatnonzero(row_sums == 1)

    # Filter labels and data with the same index so they stay aligned.
    new_matrix = matrix_labels[single_label_rows]
    new_data = data[single_label_rows]

    # The label must be read from the *filtered* rows: indexing the unfiltered
    # matrix by position would pair each kept sample with an unrelated row.
    label_columns = np.asarray(new_matrix.argmax(axis=1)).ravel()

    # Float dtype is kept for compatibility with the previous return value.
    labels = (label_columns + 1).astype(float)
    return new_data, labels


data, labels = labels_matrix_operation(data, labels)  # 31 clusters

# Renumber the class ids so they run consecutively from 1 to c, keeping the
# ordering of the original ids.
unique_values = set(labels)
mapping = {}
for rank, value in enumerate(sorted(unique_values), start=1):
    mapping[value] = rank
relabel = np.vectorize(lambda x: mapping[x])
labels = relabel(labels)
