**---Import Necessary Libraries**

In [2]:
import numpy as np
import pandas as pd
%matplotlib inline

**---Import Data**

In [3]:
from sklearn.datasets import fetch_20newsgroups

In [4]:
all_df = fetch_20newsgroups(subset='all')

**---Check Details of data such as length and Target names**

In [5]:
print(len(all_df.filenames))

18846


In [6]:
print(all_df.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


**---Subset only perticular domain related dataset**

In [7]:
groups = ['comp.graphics', 'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
'comp.windows.x', 'sci.space']

train = fetch_20newsgroups(subset='train', categories=groups)

In [8]:
print(len(train.filenames))

3529


In [9]:
test = fetch_20newsgroups(subset='test', categories=groups)

In [10]:
print(len(test.filenames))

2349


**---Import TfidfVectorizer and transform training data**

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vect = TfidfVectorizer(min_df=10, max_df=0.5, decode_error='ignore', stop_words='english')

In [13]:
vec_df = vect.fit_transform(train.data)

In [14]:
num_samp, num_feature = vec_df.shape

In [15]:
print('#samples: ', num_samp,'  #features: ',num_feature)

#samples:  3529   #features:  5651


**---Import K Means clustering model and train it with vectorized data**

In [16]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=20, init='random', n_init=1, verbose=1, random_state=3)

In [17]:
km.fit(vec_df)

Initialization complete
Iteration  0, inertia 6351.274
Iteration  1, inertia 3345.773
Iteration  2, inertia 3319.704
Iteration  3, inertia 3307.397
Iteration  4, inertia 3301.778
Iteration  5, inertia 3298.276
Iteration  6, inertia 3295.715
Iteration  7, inertia 3293.692
Iteration  8, inertia 3292.801
Iteration  9, inertia 3292.248
Iteration 10, inertia 3291.964
Iteration 11, inertia 3291.723
Iteration 12, inertia 3291.561
Iteration 13, inertia 3291.530
Iteration 14, inertia 3291.523
Converged at iteration 14: center shift 0.000000e+00 within tolerance 1.731605e-08


KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300,
       n_clusters=20, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=3, tol=0.0001, verbose=1)

**---Print clusters and words belong to that cluster**

In [35]:
order_cen = km.cluster_centers_.argsort()[:,::-1]
terms = vect.get_feature_names()

for i in range(20):
    print('cluster %d: ' %i)
    for x in order_cen[i,:20]:
        print('%s' %terms[x])
    print('\n')

cluster 0: 
usc
columbia
southern
angeles
los
california
caspian
yeh
ca
university
jan
host
cc
nntp
posting
zyeh
zhenghao
ctr
wireframe
hp


cluster 1: 
uk
ac
university
nz
posting
mail
host
windows
nntp
does
cam
files
file
know
liverpool
thanks
help
ed
tmc
new


cluster 2: 
nasa
gov
space
jpl
gsfc
baalke
jsc
station
kelvin
___
larc
shuttle
propulsion
center
__
launch
jet
_____
orbit
research


cluster 3: 
card
video
drivers
bus
diamond
vlb
eisa
cards
isa
ati
vesa
24
windows
speedstar
driver
com
controller
local
stealth
cache


cluster 4: 
mac
netcom
se
com
printer
duo
apple
print
use
kth
modem
serial
does
problem
university
ii
nada
408
powerbook
new


cluster 5: 
henry
alaska
toronto
zoo
aurora
nsmca
spencer
zoology
acad3
space
moon
article
just
jacked
writes
utzoo
work
fairbanks
high
adams


cluster 6: 
polygon
points
algorithm
routine
sphere
point
sunset
sunrise
bezier
washington
surface
curves
looking
polygons
times
drexel
fast
circle
reference
detecting


cluster 7: 
drive
scsi
id

**---Evaluate classification model performance**

In [20]:
from sklearn import metrics

print('Homogenity: %0.3f' %metrics.homogeneity_score(train.target, km.labels_))
print('Completeness: 0.3%f' %metrics.completeness_score(train.target, km.labels_))
print('V-measure: 0.3%f' %metrics.v_measure_score(train.target,km.labels_))

Homogenity: 0.360
Completeness: 0.30.240909
V-measure: 0.30.288594


**---Take Input from User and show relevant post related to it**

In [30]:
post = str(input('Enter Your Query:\n'))
newv = vect.transform(post.split())

Enter Your Query:
"Disk drive problems. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now. I tried to format it, but now it doesn't boot any more. Any ideas? Thanks."


In [57]:
post_label = km.predict(newv)
a=np.argmax((pd.value_counts(post_label)))

In [32]:
data = pd.DataFrame()
data['data_index'] = pd.DataFrame(train.data).index.values
data['cluster'] = km.labels_
data['Str'] = pd.DataFrame(train.data)

In [58]:
possible = data[data['cluster'] == a].head()

In [59]:
print('Related Query:\n')

print(possible['Str'][0])

Related Query:

From: maverick@wpi.WPI.EDU (T. Giaquinto)
Subject: General Information Request
Organization: Worcester Polytechnic Institute, Worcester, MA 01609-2280
Lines: 11
NNTP-Posting-Host: wpi.wpi.edu


	I am looking for any information about the space program.
This includes NASA, the shuttles, history, anything!  I would like to
know if anyone could suggest books, periodicals, even ftp sites for a
novice who is interested in the space program.



					Todd Giaquinto
					maverick@wpi.WPI.EDU
					

