In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt


# Apply seaborn's default styling to every matplotlib figure drawn below.
sns.set()

# Newsgroups (directory names in the 20news-bydate layout) used in this notebook.
groups = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'sci.space',
]


class StemmedTfidfVectorizer(TfidfVectorizer):
    """``TfidfVectorizer`` variant whose analyzer stems every token it emits."""

    def build_analyzer(self, stemmer=None):
        """Return a callable that maps a text to a generator of stemmed tokens.

        The parent class's analyzer produces the tokens; each one is then
        passed through ``stemmer.stem``.

        Args:
            stemmer: Object exposing ``stem(word)``. When ``None``, an English
                ``SnowballStemmer`` is created.

        Returns:
            A one-argument callable ``text -> generator of stemmed tokens``.
        """
        base_analyzer = super().build_analyzer()

        if stemmer is None:
            stemmer = SnowballStemmer('english')

        def stemmed_tokens(text):
            return (stemmer.stem(token) for token in base_analyzer(text))

        return stemmed_tokens


def get_data(groups, type='test', N=1000):
    """Read raw posts from the local ``20news-bydate`` directory tree.

    Posts are collected group by group, in the order given by ``groups``, and
    in sorted file-name order inside each group, so repeated calls return the
    same posts. Reading stops as soon as ``N`` posts have been collected.

    Note that the cap applies to the concatenated sequence: a small ``N``
    returns posts from the leading groups only, not a per-group sample.

    Args:
        groups: Iterable of newsgroup directory names.
        type: ``'train'`` selects ``20news-bydate-train``; any other value
            selects ``20news-bydate-test``.
        N: Maximum number of posts to return; ``None`` returns every post.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the post texts and, aligned with
        them, the name of the group each post was read from.

    Raises:
        OSError: If a group directory that has to be read does not exist.
    """
    split_dir = '20news-bydate-train' if type == 'train' else '20news-bydate-test'
    # Only a non-negative N can be used to stop early; other values are left
    # to the final slice so its semantics stay exactly as before.
    capped = N is not None and N >= 0

    X = []
    y = []

    for group in groups:
        if capped and len(X) >= N:
            break

        crdir = os.path.join('20news-bydate', split_dir, group)

        # os.listdir order is arbitrary; sort it so the selection is stable.
        for file in sorted(os.listdir(crdir)):
            if capped and len(X) >= N:
                break

            # The posts are not guaranteed to be valid UTF-8. latin-1 maps
            # every byte to a character, so decoding can never fail.
            with open(os.path.join(crdir, file), encoding='latin-1') as infile:
                X.append(infile.read())
            y.append(group)

    return np.array(X[:N]), np.array(y[:N])

In [None]:
# Raw post texts and their newsgroup names for both splits.
_X_train, _y_train = get_data(groups, type='train')
_X_test, _y_test = get_data(groups, type='test')

# Vocabulary and IDF weights are learned from the training posts only;
# the test posts are projected into that same feature space.
vectorizer = StemmedTfidfVectorizer(
    stop_words='english',
    min_df=10,
    max_df=0.5,
)
X_train = vectorizer.fit_transform(_X_train)
X_test = vectorizer.transform(_X_test)

In [None]:
from sklearn.cluster import KMeans

# One more cluster than there are newsgroups. With init='random' and a single
# initialisation the outcome depends entirely on the random draw, so
# random_state is pinned: cluster ids (and everything printed in the cells
# below) then stay the same across kernel restarts.
km = KMeans(n_clusters=len(groups) + 1, init='random', n_init=1, random_state=42)

# KMeans accepts the sparse TF-IDF matrix directly, so there is no need to
# materialise a dense copy with .toarray().
km.fit(X_train)

In [None]:
# Assign one held-out post to its nearest cluster and find the training posts
# that ended up in that same cluster.
new_post = _X_test[50]
new_post_label = km.predict(vectorizer.transform([new_post]))[0]
similar_indices  = (km.labels_ == new_post_label).nonzero()

print(new_post)
# new_post_label is a KMeans cluster id, not an index into `groups`: the ids
# are arbitrary and there are len(groups) + 1 of them, so groups[new_post_label]
# would print an unrelated name or raise IndexError. Report the cluster id and
# the post's actual newsgroup instead.
print('Cluster:', new_post_label)
print('True group:', _y_test[50])

In [None]:
# Newsgroup names of the training posts that fall in the same cluster as new_post.
_y_train[similar_indices]

In [None]:
# Show one training post from that cluster (the second-to-last member);
# this needs the cluster to contain at least two training posts.
print(_X_train[similar_indices][-2])