In [1]:
# Loading Packages
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

In [None]:
# Displaying the different topics of train data
topics = fetch_20newsgroups(subset="train")
topics.target_names

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Importing both train and test data
X_train =  pd.DataFrame(fetch_20newsgroups(random_state=1,subset="train").data)
X_test =  pd.DataFrame(fetch_20newsgroups(random_state=1,subset="test").data)

In [None]:
X_train[0]

In [12]:
#get test shape
X_test.shape

(7532, 1)

In [13]:
# get train shape
X_train.shape

(11314, 1)

In [14]:
# Displaying the first 5 rows of train data
X_train.head()

Unnamed: 0,0
0,"From: ab4z@Virginia.EDU (""Andi Beyer"")\nSubjec..."
1,From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSu...
2,From: bc744@cleveland.Freenet.Edu (Mark Ira Ka...
3,From: ray@ole.cdac.com (Ray Berry)\nSubject: C...
4,From: kkeller@mail.sas.upenn.edu (Keith Keller...


In [18]:
#get 2nd row train data
X_train.iloc[1]

0    From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSu...
Name: 1, dtype: object

In [19]:
# Displaying the first 5 rows of test data
X_test.head(5)

Unnamed: 0,0
0,From: traven@pitt.edu (Neal Traven)\nSubject: ...
1,From: eric@ithaca.com (Eric Wagner)\nSubject: ...
2,From: umturne4@ccu.umanitoba.ca (Daryl Turner)...
3,From: kem@prl.ufl.edu (Kelly Murray)\nSubject:...
4,From: scott@cs.uiuc.edu (Jay Scott)\nSubject: ...


In [20]:
# Tf-idf
vectorizer = TfidfVectorizer(max_df =0.5)

In [22]:
# SVD
from sklearn.decomposition import TruncatedSVD 
svd_model = TruncatedSVD(n_components=500, random_state=42) 

In [23]:
# Building the pipeline
from sklearn.pipeline import Pipeline
svd_transformer = Pipeline([('tfidf',vectorizer ), 
                            ('svd',svd_model )])
preprocessing_model = svd_transformer.fit(X_train[0])
svd_matrix_train = preprocessing_model.transform(X_train[0])

In [24]:
svd_matrix_train.shape

(11314, 500)

In [25]:
vectorizer.vocabulary_

{'ab4z': 25240,
 'virginia': 121620,
 'andi': 28162,
 'beyer': 33006,
 'israeli': 68682,
 'terrorism': 114184,
 'university': 118959,
 '15': 4605,
 'well': 123735,
 'sure': 112011,
 'about': 25399,
 'story': 110860,
 'nad': 85907,
 'did': 47135,
 'seem': 105818,
 'biased': 33186,
 'what': 123960,
 'disagree': 47472,
 'your': 128393,
 'statement': 110342,
 'media': 80785,
 'out': 90756,
 'ruin': 103448,
 'israels': 68690,
 'reputation': 101115,
 'rediculous': 100182,
 'most': 83823,
 'pro': 96017,
 'world': 125028,
 'having': 62225,
 'lived': 76223,
 'europe': 52811,
 'realize': 99795,
 'incidences': 66690,
 'such': 111513,
 'as': 29618,
 'one': 89868,
 'described': 46533,
 'letter': 75396,
 'occured': 89160,
 'whole': 124130,
 'try': 116698,
 'ignore': 65885,
 'them': 114486,
 'subsidizing': 111399,
 'existance': 53305,
 'europeans': 52813,
 'at': 30042,
 'least': 75054,
 'same': 104475,
 'degree': 46005,
 'so': 108539,
 'think': 114674,
 'might': 81985,
 'reason': 99846,
 'they': 1146

In [29]:
# Querying a document related to one of the document in test data
query = preprocessing_model.transform(X_test.iloc[2])

In [30]:
query.shape

(1, 500)

In [31]:
# Calculating cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
distance_matrix = cosine_similarity(svd_matrix_train,query)

In [32]:
print(distance_matrix)

[[0.12532047]
 [0.17688509]
 [0.112492  ]
 ...
 [0.09898542]
 [0.0243373 ]
 [0.22394193]]


In [33]:
# Function to return indices of top n elemets
def largest_indices(ary, n):
    """Returns the n largest indices from a numpy array."""
    flat = ary.flatten()
    indices = np.argsort(flat)[-n:]
    return indices

In [34]:
largest_indices(distance_matrix,5)

array([ 8872,  1392,   491,  6260, 10460], dtype=int64)

In [35]:
X_test[0][2]

"From: umturne4@ccu.umanitoba.ca (Daryl Turner)\nSubject: Re: Winnipeg vs. Vancouver\nNntp-Posting-Host: gibson.cc.umanitoba.ca\nOrganization: University of Manitoba, Winnipeg, Manitoba, Canada\nLines: 85\n\nIn article <C63p9q.205@unixhub.SLAC.Stanford.EDU> grogers@ravel.SLAC.Stanford.EDU (Greg Rogers) writes:\n>>> MKR@stud.hsn.no (MORTEN KRISTIANSEN) writes:\n>>>\n>>>Read this all you Canucks fan out there!!!!!!!!!!!!!!!!!!!!\n>>>Winnipeg are going to kick Vancouvers butts so badly they are not going to\n>>>be able to sit down for weeks.\n>>>And no this is NOT a joke.\n>>>A prediction before the fourth game in Winnipeg:\n>>>\n>>>Winnipeg 6 Vancouver 2  (2-0,1-2,3-0)\n>>>Selanne, Steen score a couple of goals each!!!\n>>>\n>>>Winnipeg to win Stanley cup playoffs.>>\n>>\n>>\n>>To all you Jets Fanatics...Ha ha ha.\n>>To normal Jets Fans...It was a pretty good fourth game. More physical\n>>play than I've seen in the first three games.>\n>>\n>>These moronic posts that have continued throug

In [38]:
X_train[0][0]

'From: ab4z@Virginia.EDU ("Andi Beyer")\nSubject: Re: Israeli Terrorism\nOrganization: University of Virginia\nLines: 15\n\nWell i\'m not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n'