In [0]:
import pandas as pd
import numpy as np
import sys
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
from sklearn.datasets import fetch_20newsgroups
train = fetch_20newsgroups(subset='train', shuffle=True)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [0]:
train.target

array([7, 4, 4, ..., 3, 1, 8])

In [0]:
print("\n".join(train.data[0].split("\n")[:]))

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [0]:
train.filenames.shape

(11314,)

In [0]:
train.target.shape

(11314,)

In [0]:
train.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [0]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
from nltk.corpus import stopwords
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
vectors = vectorizer.fit_transform(train.data)
vectors.shape

(11314, 129963)

In [0]:
from sklearn.naive_bayes import MultinomialNB  # model building
from sklearn import metrics
test = fetch_20newsgroups(subset='test',shuffle=True)
vectors_test = vectorizer.transform(test.data)
clf = MultinomialNB(alpha=0.01)
clf.fit(vectors, train.target)
pred = clf.predict(vectors_test)
metrics.f1_score(test.target, pred, average='macro')

0.8297659298702685

In [0]:
metrics.accuracy_score(test.target, pred)

0.8356346255974508

In [0]:
predtr = clf.predict(vectors)

In [0]:
metrics.accuracy_score(train.target, predtr)

0.9554534205409227

In [0]:
def show_top10(classifier, vectorizer, categories):
     feature_names = np.asarray(vectorizer.get_feature_names())
     for i, category in enumerate(categories):
         top10 = np.argsort(classifier.coef_[i])[-15:]
         print("%s: %s" % (category, " ".join(feature_names[top10])))

In [0]:
show_top10(clf, vectorizer, train.target_names)  # top 10 features

alt.atheism: solntze islam morality writes one people sgi atheism com livesey atheists caltech god edu keith
comp.graphics: program posting file thanks organization polygon subject com university lines files 3d image edu graphics
comp.os.ms-windows.misc: lines university use mouse card com ax ms drivers driver files dos file edu windows
comp.sys.ibm.pc.hardware: drives thanks dos system isa disk pc com controller bus edu card ide drive scsi
comp.sys.mac.hardware: problem lc subject duo lines scsi simms monitor centris se quadra drive edu apple mac
comp.windows.x: organization subject lines lcs uk application x11r5 xterm widget edu com server mit motif window
misc.forsale: mail forsale price organization subject lines condition distribution university new shipping offer 00 edu sale
rec.autos: university like dealer lines organization subject oil writes would article engine edu cars com car
rec.motorcycles: subject helmet organization bmw riding writes ca article motorcycle bikes ride ed

In [0]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(vectors, train.target)
rfcpred = rfc.predict(vectors)
metrics.accuracy_score(train.target, rfcpred)

0.9999116139296447

In [0]:
rfctest = rfc.predict(vectors_test)
metrics.accuracy_score(test.target, rfctest)

0.7818640467339352

In [0]:
print(metrics.classification_report(test.target, rfctest,target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.65      0.68       319
           comp.graphics       0.62      0.73      0.67       389
 comp.os.ms-windows.misc       0.67      0.81      0.74       394
comp.sys.ibm.pc.hardware       0.68      0.68      0.68       392
   comp.sys.mac.hardware       0.78      0.79      0.78       385
          comp.windows.x       0.79      0.69      0.73       395
            misc.forsale       0.77      0.90      0.83       390
               rec.autos       0.84      0.84      0.84       396
         rec.motorcycles       0.93      0.91      0.92       398
      rec.sport.baseball       0.84      0.90      0.87       397
        rec.sport.hockey       0.88      0.93      0.91       399
               sci.crypt       0.88      0.92      0.90       396
         sci.electronics       0.71      0.53      0.61       393
                 sci.med       0.85      0.72      0.78       396
         

Loading the data by removing header, footer and Quotes and running the model to check the performance of the model.

In [0]:
new_train = fetch_20newsgroups(subset='train',remove=('headers', 'footers', 'quotes'))
vectors = vectorizer.fit_transform(new_train.data)
clf = MultinomialNB()
clf.fit(vectors, new_train.target)
predtr = clf.predict(vectors)

In [0]:
new_test = fetch_20newsgroups(subset='test',remove=('headers', 'footers', 'quotes'))
vectors_test = vectorizer.transform(new_test.data)
pred = clf.predict(vectors_test)
metrics.f1_score(new_test.target, pred, average='macro')

0.6397432017729484

In [0]:
print("\n".join(new_train.data[0].split("\n")[:]))

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [0]:
metrics.accuracy_score(new_train.target, predtr)

0.8690118437334277

In [0]:
metrics.accuracy_score(new_test.target, pred)

0.6698088157195964

In [0]:
show_top10(clf, vectorizer, new_train.target_names)

alt.atheism: objective morality well bible could islam atheists say religion would atheism think one people god
comp.graphics: hi software please looking format 3d know anyone would program file files thanks image graphics
comp.os.ms-windows.misc: anyone ftp using program ax card problem thanks use driver drivers files dos file windows
comp.sys.ibm.pc.hardware: get isa drives monitor would system disk thanks pc ide controller bus card scsi drive
comp.sys.mac.hardware: lc use know monitor anyone quadra would get simms one thanks problem drive apple mac
comp.windows.x: mit display get program xterm using windows x11r5 use application thanks widget server motif window
misc.forsale: used mail drive interested asking email sell price please new condition shipping offer 00 sale
rec.autos: think also know oil one ford new good dealer get engine would like cars car
rec.motorcycles: right would know bmw dog get helmet riding like motorcycle ride one bikes dod bike
rec.sport.baseball: good would

In [0]:
from sklearn.linear_model import LogisticRegression

In [0]:
log = LogisticRegression(multi_class='multinomial')
log.fit(vectors, new_train.target)
logpred = log.predict(vectors)
metrics.accuracy_score(new_train.target, logpred)

0.919833834187732

In [0]:
log_test = log.predict(vectors_test)
metrics.accuracy_score(new_test.target, log_test)

0.6887944768985661

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
rfc = RandomForestClassifier()
rfc.fit(vectors, new_train.target)
rfcpred = rfc.predict(vectors)
metrics.accuracy_score(new_train.target, rfcpred)

0.9740144953155383

In [0]:
rfctest = rfc.predict(vectors_test)
metrics.accuracy_score(new_test.target, rfctest)

0.6218799787573022

In [0]:
X_train =  pd.DataFrame(fetch_20newsgroups(random_state=1,subset="train").data)
X_test = pd.DataFrame(fetch_20newsgroups(random_state=1,subset="test").data)

In [0]:
X_train.head()

Unnamed: 0,0
0,"From: ab4z@Virginia.EDU (""Andi Beyer"")\nSubjec..."
1,From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSu...
2,From: bc744@cleveland.Freenet.Edu (Mark Ira Ka...
3,From: ray@ole.cdac.com (Ray Berry)\nSubject: C...
4,From: kkeller@mail.sas.upenn.edu (Keith Keller...


In [0]:
count_vect = CountVectorizer()
tf = count_vect.fit_transform(train.data)
tf.shape

(11314, 130107)

In [0]:
tf

<11314x130107 sparse matrix of type '<class 'numpy.int64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

TfidfVectorizer is used on sentences, while TfidfTransformer is used on an existing count matrix, such as one returned by CountVectorizer

SVD

In [0]:
# Importing both train and test data
X_train =  pd.DataFrame(fetch_20newsgroups(random_state=1,subset="train",remove=('headers', 'footers', 'quotes')).data)
X_test = pd.DataFrame(fetch_20newsgroups(random_state=1,subset="test",remove=('headers', 'footers', 'quotes')).data)

In [0]:
from sklearn.decomposition import TruncatedSVD
svd_model = TruncatedSVD(n_components=20,random_state=42)

In [0]:
# Tf-idf
tfidf = TfidfVectorizer(max_df = 0.7,stop_words='english')

In [0]:
from sklearn.pipeline import Pipeline
svd_transformer = Pipeline([('tfidf', tfidf), 
                            ('svd', svd_model)])
preprocessing_model = svd_transformer.fit(X_train[0])
svd_matrix_train = preprocessing_model.transform(X_train[0])

In [0]:
X_train.head()

Unnamed: 0,0
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,Although I realize that principle is not one o...
3,Notwithstanding all the legitimate fuss about ...
4,"Well, I will have to change the scoring on my ..."


In [0]:
X_train[0]

0        Well i'm not sure about the story nad it did s...
1        \n\n\n\n\n\n\nYeah, do you expect people to re...
2        Although I realize that principle is not one o...
3        Notwithstanding all the legitimate fuss about ...
4        Well, I will have to change the scoring on my ...
                               ...                        
11309    Danny Rubenstein, an Israeli journalist, will ...
11310                                                   \n
11311    \nI agree.  Home runs off Clemens are always m...
11312    I used HP DeskJet with Orange Micros Grappler ...
11313                                          ^^^^^^\n...
Name: 0, Length: 11314, dtype: object

In [0]:
svd_matrix_train.shape

(11314, 20)

In [0]:
svd_matrix_train

array([[ 0.08690086, -0.0527846 , -0.01616238, ...,  0.0002687 ,
         0.00443517, -0.01074491],
       [ 0.12571513, -0.03760951,  0.01866625, ..., -0.00077117,
         0.01300213,  0.02119549],
       [ 0.11543569, -0.05219267, -0.02285513, ...,  0.01157636,
        -0.04126548, -0.00614087],
       ...,
       [ 0.06512141, -0.01969874, -0.02096021, ...,  0.00227408,
         0.0074438 ,  0.00023347],
       [ 0.0335365 ,  0.02203741, -0.0009162 , ..., -0.00957569,
         0.00408981,  0.00810349],
       [ 0.17359082, -0.03098308, -0.0483443 , ..., -0.01604614,
         0.02073289,  0.02829521]])

In [0]:
# Querying a document related to one of the document in test data
query = preprocessing_model.transform(X_test.iloc[3])

In [0]:
X_test[0][3]

'\nI would further add that a 486/50,S3/928,8mb,15",200mbDISK is going to cost\n/WAY/ more than $900, probably $3,000.  Color makes it not apples/apples too.\nXterminals provide better price/performance than PCs.  You can make a PC\nmuch cheaper, and perform much worse, and you can make PC\'s perform great, and\ncost more.  You pay extra for the additional functinality and expandability\nof a PC.  For home user, that extra functionality is worth the added cost.'

In [0]:
query.shape

(1, 20)

In [0]:
query

array([[ 0.10794637,  0.05212298, -0.02256459, -0.00445603, -0.03443909,
         0.02965227,  0.01295297, -0.02344415,  0.03132671, -0.05550064,
        -0.00611564,  0.02284804, -0.02765203, -0.01307706,  0.01529244,
         0.03147862,  0.05286134,  0.02611598,  0.00770828,  0.02992505]])

In [0]:
# Calculating cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
distance_matrix = cosine_similarity(svd_matrix_train,query)

In [0]:
print(distance_matrix)

[[0.17763299]
 [0.37407271]
 [0.08994037]
 ...
 [0.34500256]
 [0.6130473 ]
 [0.46441076]]


In [0]:
print(distance_matrix.shape, type(distance_matrix))

(11314, 1) <class 'numpy.ndarray'>


In [0]:
flat = distance_matrix.flatten()

In [0]:
flat

array([0.17763299, 0.37407271, 0.08994037, ..., 0.34500256, 0.6130473 ,
       0.46441076])

In [0]:
print(flat.shape,type(flat))

(11314,) <class 'numpy.ndarray'>


In [0]:
sort_indices = np.argsort(flat)

In [0]:
sort_indices

array([6417, 9823, 6536, ...,    6, 8909, 6559])

In [0]:
sort_indices[-5:]

array([4637, 8344,    6, 8909, 6559])

In [0]:
print(distance_matrix[6559],distance_matrix[8909],distance_matrix[6],distance_matrix[8344],distance_matrix[4637])

[0.89420752] [0.86455509] [0.85386399] [0.85352483] [0.84857162]


In [0]:
X_test[0][3]

'\nI would further add that a 486/50,S3/928,8mb,15",200mbDISK is going to cost\n/WAY/ more than $900, probably $3,000.  Color makes it not apples/apples too.\nXterminals provide better price/performance than PCs.  You can make a PC\nmuch cheaper, and perform much worse, and you can make PC\'s perform great, and\ncost more.  You pay extra for the additional functinality and expandability\nof a PC.  For home user, that extra functionality is worth the added cost.'

In [0]:
X_train[0][6559]

"\nOpenGL is a graphics programming library and as such is a great, portable\ninterface for the development of interactive 3D graphics applications. It\nis not, however, an indicator of performance, as that will vary strongly\nfrom machine to machine and vendor to vendor.  SGI is committed to high\nperformance interactive graphics systems and software tools, so OpenGL\nmeans that you can port easily from SGI to other platforms, there is no\nguarantee that your performance would be comparable.\n\n\nIf your sales representative truly mislead you, then you should have a\nvalid grievance against us which you should carry up to your local SGI\nsales management team. Feel free to contact the local branch manager...we\nunderstand that repeat sales come from satisfied customers, so give it a\nshot.\n\n\nAs one of the previous posts stated, the Personal IRIS was introduced in\n1988 and grew to include the 4D/20, 4D/25, 4D/30 and 4D/35 as clock rates\nsped up over time. As a rule of thumb, SGI p

CLUSTERING

In [0]:
from sklearn.cluster import KMeans

In [0]:
number_of_clusters = 20

model = KMeans(n_clusters=number_of_clusters, 
               init='k-means++', 
               max_iter=100, # Maximum number of iterations of the k-means algorithm for a single run.
               n_init=1)  # Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia.

model.fit(vectors)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=20, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [0]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

In [0]:
for i in range(number_of_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Cluster 0:
 __
 ___
 _____
 cosmo
 baalke
 risc
 angmar
 ____
 jpl
 kelvin
Cluster 1:
 columbia
 gld
 cunixb
 cc
 dare
 gary
 edu
 keenan
 cunixa
 insurance
Cluster 2:
 andrew
 cmu
 edu
 carnegie
 mellon
 pittsburgh
 pa
 host
 nntp
 posting
Cluster 3:
 scsi
 drive
 ide
 controller
 drives
 hard
 disk
 bus
 floppy
 hd
Cluster 4:
 com
 lines
 subject
 organization
 ca
 would
 edu
 thanks
 one
 posting
Cluster 5:
 people
 would
 edu
 gun
 one
 com
 think
 team
 like
 ca
Cluster 6:
 windows
 dos
 access
 window
 file
 digex
 files
 edu
 ms
 com
Cluster 7:
 geb
 banks
 gordon
 pitt
 cs
 dsl
 n3jxp
 cadre
 chastity
 shameful
Cluster 8:
 israel
 israeli
 jews
 arab
 jake
 arabs
 edu
 lebanese
 adam
 israelis
Cluster 9:
 hp
 com
 msg
 stratus
 sw
 edu
 food
 behanna
 dyer
 cdt
Cluster 10:
 nasa
 gov
 ax
 larc
 space
 jpl
 gsfc
 jsc
 higgins
 howland
Cluster 11:
 key
 clipper
 encryption
 chip
 escrow
 keys
 government
 com
 crypto
 algorithm
Cluster 12:
 card
 video
 monitor
 drivers
 vga
 edu

In [0]:
print (train.data[350])

From: kozloce@wkuvx1.bitnet
Subject: Re: Atlanta Hockey Hell!!DIR
Organization: Western Kentucky University, Bowling Green, KY
Lines: 18

> Someone give SportChannel a call (or maybe Ted Turner).
> Wasn't USA network covering the playoffs years ago?
> 
> Jim G.

Oh to be back in the good old days when I lived in Florida (Florida for
Petes sake!!) and could watch hockey every night as ESPN and USA alternated
coverage nights. Oh well I guess it would be too simple for the home office
to look back into their past to solve a problem in the present...

Of course I shouldn't complain. At least I'm getting to watch the playoffs
for a change. (Hooray!!) Now if the ESPN schedulers will realise there are
other teams except Pittsberg in the Patrick. (Sounds like a Dr Suess Book
=)

KOZ

LETS GO CAPS!!



In [0]:
X = vectorizer.transform([train.data[350]])

cluster = model.predict(X)[0]

print("Text belongs to cluster number {0}".format(cluster))

Text belongs to cluster number 5


In [0]:
for ind in order_centroids[cluster, :10]:
    print(' %s' % terms[ind])

 people
 would
 edu
 gun
 one
 com
 think
 team
 like
 ca
