In [1]:
# Environment preamble: lets the same notebook run under Google Colab and
# under a plain Jupyter installation.
try:
    from google.colab import drive
    import sys

    # Colab: mount Google Drive and point at the shared notes folder.
    drive.mount('/content/drive')
    notes_home = "/content/drive/Shared drives/CSC310/notes/"
    user_home = "/content/drive/My Drive/"

    # make modules stored in the notes folder importable from the notebook
    sys.path.insert(1, notes_home)

except ModuleNotFoundError:
    # Native Jupyter: google.colab is not installed, so both the notes folder
    # and the user folder are taken to be the notebook's own directory.
    notes_home = user_home = ""

# NLP

## Vector Model

In [2]:
import pandas
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

# Three toy documents used to illustrate the vector model.
doc_names = ["doc1", "doc2", "doc3"]
docs = ["the quick brown fox jumps over the lazy dog",
        "Rudi is a lazy brown dog",
        "Princess jumps over the lazy dog"]

# process documents: binary=True records only presence/absence of each word,
# so every row of the document-term matrix is a 0/1 vector.
vectorizer = CountVectorizer(analyzer="word", binary=True)
docarray = vectorizer.fit_transform(docs).toarray()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() converts the returned array into a plain list
# of str so the printed coordinates look the same as before.
coords = vectorizer.get_feature_names_out().tolist()
docterm = pandas.DataFrame(data=docarray, index=doc_names, columns=coords)
print("Coordinates:")
print(coords)
print("\nDocterm:")
print(docterm)

# pairwise Euclidean distances between the document vectors
distances = euclidean_distances(docterm)
distances_df = pandas.DataFrame(data=distances, index=doc_names, columns=doc_names)
print("\nPairwise Distances:")
print(distances_df)

Coordinates:
['brown', 'dog', 'fox', 'is', 'jumps', 'lazy', 'over', 'princess', 'quick', 'rudi', 'the']

Docterm:
      brown  dog  fox  is  jumps  lazy  over  princess  quick  rudi  the
doc1      1    1    1   0      1     1     1         0      1     0    1
doc2      1    1    0   1      0     1     0         0      0     1    0
doc3      0    1    0   0      1     1     1         1      0     0    1

Pairwise Distances:
          doc1      doc2      doc3
doc1  0.000000  2.645751  2.000000
doc2  2.645751  0.000000  2.645751
doc3  2.000000  2.645751  0.000000


## Real World Data

In [3]:
import pandas
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.datasets import fetch_20newsgroups

# Restrict the 20-newsgroups training set to two categories.
cats = ['talk.politics.misc', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)

# corpus overview: number of articles, category names, shape of the label array
print(len(newsgroups_train.data))
print(list(newsgroups_train.target_names))
print(newsgroups_train.target.shape)

# one sample article followed by the name of its category
sample_idx = 5
print(newsgroups_train.data[sample_idx])
print(newsgroups_train.target_names[newsgroups_train.target[sample_idx]])


1058
['sci.space', 'talk.politics.misc']
(1058,)
From: nickh@CS.CMU.EDU (Nick Haines)
Subject: Re: Vandalizing the sky.
In-Reply-To: todd@phad.la.locus.com's message of Wed, 21 Apr 93 16:28:00 GMT
Originator: nickh@SNOW.FOX.CS.CMU.EDU
Nntp-Posting-Host: snow.fox.cs.cmu.edu
Organization: School of Computer Science, Carnegie Mellon University
	<1993Apr21.162800.168967@locus.com>
Lines: 33

In article <1993Apr21.162800.168967@locus.com> todd@phad.la.locus.com (Todd Johnson) writes:

   As for advertising -- sure, why not?  A NASA friend and I spent one
   drunken night figuring out just exactly how much gold mylar we'd need
   to put the golden arches of a certain American fast food organization
   on the face of the Moon.  Fortunately, we sobered up in the morning.

Hmmm. It actually isn't all that much, is it? Like about 2 million
km^2 (if you think that sounds like a lot, it's only a few tens of m^2
per burger that said organization sold last year). You'd be best off
with a reflective 



## Let us compute the docterm matrix for the news articles


In [4]:
import pandas
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.datasets import fetch_20newsgroups

cats = ['talk.politics.misc', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)

# process documents: binary (presence/absence) document-term matrix over the
# raw article text, with no extra filtering yet.
vectorizer = CountVectorizer(analyzer="word", binary=True)
docarray = vectorizer.fit_transform(newsgroups_train.data).toarray()
print("docarray shape: {}".format(docarray.shape))

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() keeps the printed value a plain list of str.
first_coords = vectorizer.get_feature_names_out()[:10].tolist()
print("first 10 coords: {}".format(first_coords))

docarray shape: (1058, 23537)
first 10 coords: ['00', '000', '0000', '00000', '000000', '000007', '000021', '000062david42', '00041032', '0004136']


## Let us do more filtering: minimum document frequency = 2 (a word must occur in at least two articles), alphabetic words only

In [5]:
import pandas
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.datasets import fetch_20newsgroups
from re import sub

cats = ['talk.politics.misc', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)

# process documents: min_df=2 drops every term that occurs in fewer than two
# documents.
vectorizer = CountVectorizer(analyzer="word",
                             binary=True,
                             min_df=2)

# Replace every non-letter character with a blank so that only alphabetic
# tokens reach the vectorizer.
new_data = [sub("[^a-zA-Z]", " ", article) for article in newsgroups_train.data]
docarray = vectorizer.fit_transform(new_data).toarray()

print("docarray shape: {}".format(docarray.shape))

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() keeps the printed value a plain list of str.
first_coords = vectorizer.get_feature_names_out()[:10].tolist()
print("first 10 coords: {}".format(first_coords))

docarray shape: (1058, 11836)
first 10 coords: ['aa', 'aammmaaaazzzzzziinnnnggggg', 'aaron', 'aas', 'ab', 'abandon', 'abandoned', 'abandonment', 'abbey', 'abc']


## Stemming

In [6]:
import pandas
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.datasets import fetch_20newsgroups
from re import sub
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
cats = ['talk.politics.misc', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)

# Step 1: replace every non-letter character with a blank.
new_data = [sub("[^a-zA-Z]", " ", article) for article in newsgroups_train.data]

# Step 2: lowercase the text.
lowercase_data = [article.lower() for article in new_data]

# Step 3: stem each whitespace-separated word and re-join the stems into one
# string per article, which is the input format CountVectorizer expects.
stemmed_data = [
    " ".join(stemmer.stem(word) for word in article.split())
    for article in lowercase_data
]

# Binary document-term matrix over the stemmed text; min_df=2 drops terms
# that occur in fewer than two documents.
vectorizer = CountVectorizer(analyzer="word", binary=True, min_df=2)
docarray = vectorizer.fit_transform(stemmed_data).toarray()

print("docarray shape: {}".format(docarray.shape))

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() keeps the printed value a plain list of str.
first_coords = vectorizer.get_feature_names_out()[:10].tolist()
print("first 10 coords: {}".format(first_coords))

docarray shape: (1058, 8631)
first 10 coords: ['aa', 'aammmaaaazzzzzziinnnnggggg', 'aaron', 'ab', 'abandon', 'abbey', 'abc', 'abdkw', 'abett', 'abid']


## We can now look at the distances in 8000+ dimensional space

In [7]:
# Euclidean distance between every pair of rows of the document-term matrix;
# entry (i, j) is the distance between article i and article j.
distances = euclidean_distances(docarray)
distances_df = pandas.DataFrame(distances)

# last expression of the cell: rendered as a table by the notebook
distances_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057
0,0.000000,13.038405,14.730920,15.362291,12.609520,15.779734,16.852300,12.727922,13.711309,12.449900,...,16.462078,12.649111,15.198684,28.195744,13.747727,13.416408,17.860571,14.000000,12.369317,15.231546
1,13.038405,0.000000,13.820275,15.099669,12.206556,15.132746,16.062378,11.916375,12.165525,11.958261,...,15.329710,11.661904,13.964240,28.053520,13.152946,12.727922,17.058722,13.038405,11.704700,13.856406
2,14.730920,13.820275,0.000000,15.716234,13.564660,16.248077,16.155494,13.453624,14.387495,13.564660,...,16.186414,13.453624,15.099669,27.784888,14.628739,13.964240,17.832555,14.594520,13.564660,15.132746
3,15.362291,15.099669,15.716234,0.000000,14.798649,16.703293,17.320508,14.764823,15.362291,14.866069,...,17.233688,14.628739,16.522712,28.231188,15.198684,14.696938,18.193405,15.684387,14.866069,16.186414
4,12.609520,12.206556,13.564660,14.798649,0.000000,15.165751,15.968719,11.357817,12.529964,11.661904,...,15.684387,11.180340,14.560220,27.856777,12.806248,12.369317,17.378147,13.000000,11.661904,13.964240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,13.416408,12.727922,13.964240,14.696938,12.369317,15.264338,15.874508,11.832160,13.190906,12.288206,...,15.524175,12.165525,14.177447,28.017851,13.304135,0.000000,17.117243,13.190906,12.288206,14.491377
1054,17.860571,17.058722,17.832555,18.193405,17.378147,18.493242,19.209373,16.822604,17.349352,16.970563,...,19.078784,16.643317,17.720045,28.879058,17.320508,17.117243,0.000000,17.691806,17.029386,18.193405
1055,14.000000,13.038405,14.594520,15.684387,13.000000,15.394804,16.248077,12.649111,13.564660,12.609520,...,11.532563,12.247449,14.730920,28.160256,13.747727,13.190906,17.691806,0.000000,12.922848,14.422205
1056,12.369317,11.704700,13.564660,14.866069,11.661904,15.165751,16.093477,11.445523,12.369317,11.224972,...,15.748016,11.000000,13.928388,27.712813,12.328828,12.288206,17.029386,12.922848,0.000000,13.601471


## Find out which stories are most similar

In [8]:
import sys

import numpy

# Overwrite the major diagonal (distance of each article to itself) with
# FLOAT_MAX so that a minimum search never selects an article paired with
# itself.
#
# Only the diagonal is touched. Replacing every 0.0 in the matrix would also
# hide off-diagonal zeros, i.e. pairs of distinct articles whose vectors are
# identical -- exactly the most similar pairs this analysis is looking for.
# Working on the underlying array also avoids one Python lambda call per
# matrix entry.
masked = distances_df.to_numpy(copy=True)  # copy: leave distances_df intact
numpy.fill_diagonal(masked, sys.float_info.max)
new_df = pandas.DataFrame(masked,
                          index=distances_df.index,
                          columns=distances_df.columns)

In [9]:
# Column-wise minima first (axis=0), then the label of the column holding the
# smallest of those minima, i.e. the column containing the global minimum.
new_df.min(axis=0).idxmin()

930

In [10]:
# Within column 930 (the column found above), locate the row label at which
# the minimal value occurs.
closest_column = new_df.iloc[:, 930]
closest_column.idxmin()

1036

In [11]:
# Distance between the two most similar news stories (row 1036, column 930).
new_df.iat[1036, 930]

1.0

In [12]:
# Print the newsgroup category of each of the two articles, row index first.
for article_idx in (1036, 930):
    print(newsgroups_train.target_names[newsgroups_train.target[article_idx]])

sci.space
sci.space


In [13]:
# raw text of article 1036 (the row index found above), headers included
print(newsgroups_train.data[1036])

Subject: <None>
From: bioccnt@otago.ac.nz
Organization: University of Otago, Dunedin, New Zealand
Nntp-Posting-Host: thorin.otago.ac.nz
Lines: 12


Can someone please remind me who said a well known quotation? 

He was sitting atop a rocket awaiting liftoff and afterwards, in answer to
the question what he had been thinking about, said (approximately) "half a
million components, each has to work perfectly, each supplied by the lowest
bidder....." 

Attribution and correction of the quote would be much appreciated. 

Clive Trotman




In [14]:
# raw text of article 930 (the column index found above), headers included
print(newsgroups_train.data[930])

Subject: Quotation? Lowest bidder...
From: bioccnt@otago.ac.nz
Organization: University of Otago, Dunedin, New Zealand
Nntp-Posting-Host: thorin.otago.ac.nz
Lines: 12


Can someone please remind me who said a well known quotation? 

He was sitting atop a rocket awaiting liftoff and afterwards, in answer to
the question what he had been thinking about, said (approximately) "half a
million components, each has to work perfectly, each supplied by the lowest
bidder....." 

Attribution and correction of the quote would be much appreciated. 

Clive Trotman




> It is a reposting where just the subject of the message changed!