In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.stem import SnowballStemmer
from sklearn.metrics import mean_squared_error as mse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt


sns.set()
groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']


class StemmedTfidfVectorizer(TfidfVectorizer):
    
    def build_analyzer(self, stemmer=None):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        
        if stemmer is None:
            stemmer = SnowballStemmer('english')
        
        return lambda text: (stemmer.stem(w) for w in analyzer(text))


def get_data(groups, type='test', N=1000):
    X = []
    y = []
    
    for group in groups:
        crdir = os.path.join('20news-bydate/' + ('20news-bydate-train' if type=='train' else '20news-bydate-test'), group) 

        for file in os.listdir(crdir):

            with open(os.path.join(crdir, file)) as outfile:
                X.append( outfile.read() )
                y.append(group)
    
    return np.array(X[:N]), np.array(y[:N])

In [2]:
_X_train, _y_train = get_data(groups, 'train')
_X_test, _y_test = get_data(groups)

vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=.5, stop_words='english')
X_train = vectorizer.fit_transform(_X_train)
X_test = vectorizer.transform(_X_test)

In [10]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=len(groups) + 1, init='random', n_init=1)
km.fit(X_train.toarray())

KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300,
    n_clusters=7, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [11]:
new_post = _X_test[50]
new_post_label = km.predict(vectorizer.transform([new_post]))[0]
similar_indices  = (km.labels_ == new_post_label).nonzero()

print(new_post)
print('Label:', groups[new_post_label])

From: bockamp@Informatik.TU-Muenchen.DE (Florian Bockamp)
Subject: Matrox PG-1281 CV Windows driver
Originator: bockamp@hphalle3a.informatik.tu-muenchen.de
Organization: Technische Universitaet Muenchen, Germany
Lines: 22



Hi!

I need a Windows 3.1 driver for the Matrox PG-1281 CV
SVGA card. 
At the moment Windows runs only in the 640x480 mode.
If you have a driver for this card, please send it 
with the OEMSETUP.INF to 

bockamp@Informatik.TU-Muenchen.DE

Thanks!

-- 
+-----------------------------------------------------------------+
| Florian Bockamp                                  '''            |
| bockamp@informatik.tu-muenchen.de               (o o)           |
+---------------------------------------------oOO--( )--OOo-------+
|                                                   -             |
|         "It's not a bug, it's an undocumented feature!"         |
+-----------------------------------------------------------------+

Label: comp.os.ms-windows.misc


In [12]:
_y_train[similar_indices]

array(['comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics',
       'comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics',
       'comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics',
       'comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics',
       'comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics',
       'comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics',
       'comp.graphics', 'comp.graphics', 'comp.os.ms-windows.misc',
       'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc',
       'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc',
       'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc',
       'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc',
       'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc',
       'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc',
       'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc',
       'comp.os.ms-windows.misc', 'comp.os.ms-windows.m

In [13]:
print(_X_train[similar_indices][-2])

From: 55526@brahms.udel.edu (Oliver P Weatherbee)
Subject: New Windows drivers for Cirrus GD5426 graphic cards!
Article-I.D.: news.C5x27u.D4F
Organization: University of Delaware
Lines: 42
Nntp-Posting-Host: brahms.udel.edu


I have uploaded the most recent Windows drivers for the Cirrus GD5426 
chip based display cards to the uploads directory at ftp.cica.indiana.edu
 (file is 5426dr13.zip). They're very recent, I downloaded them from the 
Cirrus BBS (570-226-2365) last night. If you are unable to get them there, 
email me and maybe I can upload them to some other sites as well.  
I have a local bus based card (VL24 Bitblaster from Micron) but I think 
the drivers work with ISA cards (or at least includes drivers for them).

I found the new drivers to be a significant improvement over the 1.2 version, 
improving my graphic winmarks (v3.11) by about 2 million (7.77 to 9.88) 
although this could be the result of intentional benchmark cheating on 
Cirrus's part but I don't think so.

Fro