### As before, we import the libraries and code bases we need

### This time, inverted_index and utils are my own modules. Note that source files used this way must be located in the directory the Jupyter notebook is run from, so that they can be imported.

In [1]:
# Third-party import first, then the notebook's own helper modules.
import nltk

from inverted_index import InvertedIndex
from utils import read_data

# Fetch NLTK's stop-word corpus; NLTK reports it as up to date when already present.
nltk.download('stopwords')

# Single index instance that the following cells populate and query.
inv_ind = InvertedIndex()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ivanyanakiev1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Add documents below. read_data scans the directory passed to it for any
### files ending in ".txt" and reads each one in as a single string.

In [None]:
# Load every ".txt" document found under ./data.
documents = read_data("./data")

# Show only a bounded preview: printing the whole corpus would flood the output
# area with the full text of every document. The cap is a plain constant so it
# can be raised when a longer look at the data is wanted.
PREVIEW_CHARS = 500
corpus_text = str(documents)
print(corpus_text[:PREVIEW_CHARS] + (" ..." if len(corpus_text) > PREVIEW_CHARS else ""))

### Print out number of documents and document titles

In [None]:
# Corpus size first, then one title per line (element 0 of each entry).
print(len(documents))
for doc in documents:
    title = doc[0]
    print(title)

### Next, we will add all these documents to our Inverted Index...

In [4]:
# Add every document to the index, echoing its title first as a progress trace.
for doc in documents:
    title = doc[0]
    print(title)
    inv_ind.add_document(doc)

barrack_hussein_obama
obama
president_of_the_united_states
modi_visit_us
united_states_of_america
narendra_damodardas_modi
united_states_presidential_election_2016
foreign_investment_to_gujrat
hillary_diane_rodham_clinton


### Print out some descriptives, total terms indexed and documents...

In [None]:
# Headline statistics of the index, followed by its raw `terms` attribute.
total_terms = inv_ind.get_total_terms()
print(total_terms)
total_docs = inv_ind.get_total_docs()
print(total_docs)
print(inv_ind.terms)

### Print out the inverted index itself....

In [None]:
# Dump the index's contents using InvertedIndex's own print() method.
inv_ind.print()

### Just for interest's sake, you can see nltk's built-in stop word list

In [7]:
# nltk is already imported by the notebook's first cell, so it is not
# re-imported here; this cell just shows NLTK's English stop-word list.
print(nltk.corpus.stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### From this we can generate a term-by-document matrix

In [None]:
# Build the term-by-document matrix with default arguments and display it.
term_doc_matrix = inv_ind.generate_term_by_doc_matrix()
print(term_doc_matrix)

#### We can compute the TFIDF values for everything in the Inverted Index
#### Display new values

In [None]:
# Compute TF-IDF values on the index, then display the matrix requested
# with tfidf=True.
inv_ind.calcTFIDF()
tfidf_matrix = inv_ind.generate_term_by_doc_matrix(tfidf=True)
print(tfidf_matrix)

### Let's do a search....

In [10]:
# Free-text search with tfidf=True; each result is printed on its own line
# (the recorded output shows (title, score) pairs).
query = "largest world economy"
results = inv_ind.search(query, tfidf=True)
for hit in results:
    print(hit)


('united_states_of_america', 0.22981691131823873)
('president_of_the_united_states', 0.1360635635542465)
('modi_visit_us', 0.033694037950494415)
('obama', 0.0055882695953022865)
('barrack_hussein_obama', 0.0)
('foreign_investment_to_gujrat', 0.0)
('hillary_diane_rodham_clinton', 0.0)
('narendra_damodardas_modi', 0.0)
('united_states_presidential_election_2016', 0.0)


### We can deal with boolean queries too... let's get some data.

In [11]:
# Fetch the set of documents for each term, then print the three sets, one per line.
# NOTE(review): "Bernie Sanders" is two words; how get_document_set_from_term
# treats a multi-word argument is not visible here — confirm in InvertedIndex.
obama = inv_ind.get_document_set_from_term("Obama")
trump = inv_ind.get_document_set_from_term("Trump")
bernie = inv_ind.get_document_set_from_term("Bernie Sanders")
print(obama, trump, bernie, sep="\n")

{'president_of_the_united_states', 'united_states_presidential_election_2016', 'obama', 'barrack_hussein_obama', 'hillary_diane_rodham_clinton', 'modi_visit_us'}
{'united_states_presidential_election_2016'}
{'united_states_presidential_election_2016'}


### We have sets containing the documents that mention Obama, Trump and Sanders
### Which documents discuss all 3?

In [12]:
# Boolean AND across the three term sets: documents present in all of them.
in_all_three = obama.intersection(trump, bernie)
print(in_all_three)

{'united_states_presidential_election_2016'}


### Let's use a larger dataset, we will use the complete works 
### of Shakespeare next... First, load it in.

In [None]:
# Switch to the Shakespeare corpus: rebind `documents` to the new data and
# `inv_ind` to a fresh, empty index, then list the loaded titles.
documents = read_data("./shakespeare")
inv_ind = InvertedIndex()
for work in documents:
    title = work[0]
    print(title)

### Next, add them to the Inverted Index...
### Note some of these documents are 25,000 - 35,000 words...

In [14]:
# Feed each loaded work into the index, printing its title before it is added.
for work in documents:
    title = work[0]
    print(title)
    inv_ind.add_document(work)

Julius Caesar
Othello
A Midsummer Night's Dream
Troilus and Cressida
King Richard II
King Henry IV, II
Titus Andronicus
Much Ado About Nothing
Love's Labour's Lost
The Two Gentlemen of Verona
The Comedy of Errors
Cymbeline
All's Well that Ends Well
Twelfth Night
King Lear
The Tempest
Macbeth
Venus and Adonis
Timon of Athens
King Henry VIII
The Merchant of Venice
A Lover's Complaint
King Henry VI
Measure for Measure
Collection of Shakespeare Sonnets
Antony and Cleopatra
King John
Coriolanus
King Henry V
The Merry Wives of Windsor
Romeo and Juliet
King Henry IV
Hamlet
King Richard III
Pericles, Prince of Tyre
The Taming of the Shrew
The Winter's Tale
As You Like It
The Rape of Lucrece


### We can reproduce the boolean search given in the lecture slides...

### Also see how many terms we have...

In [15]:
# Report how many terms the index holds, then collect the document set for
# each of the three names and print the sets one per line.
print(inv_ind.get_total_terms())
caes, brut, cap = (
    inv_ind.get_document_set_from_term(name)
    for name in ("Caesar", "Brutus", "Calpurnia")
)
for doc_set in (caes, brut, cap):
    print(doc_set)

19202
{'Cymbeline', "All's Well that Ends Well", 'Measure for Measure', 'Titus Andronicus', 'King Henry VI', 'Antony and Cleopatra', 'The Merry Wives of Windsor', 'Hamlet', 'Macbeth', 'Julius Caesar', 'King Richard II', 'King Henry IV, II', "Love's Labour's Lost", 'As You Like It', 'King Henry V', 'Othello', 'King Richard III'}
{'The Merchant of Venice', 'The Rape of Lucrece', 'Titus Andronicus', 'Antony and Cleopatra', 'Hamlet', 'Julius Caesar', 'Coriolanus', 'King Henry V'}
{'Julius Caesar'}


### Using simple set operations...
### Which are the plays that have Caesar AND Brutus but *NOT* Calpurnia

In [16]:
# Caesar AND Brutus, minus anything that also contains Calpurnia.
caesar_and_brutus = caes.intersection(brut)
print(caesar_and_brutus.difference(cap))


{'King Henry V', 'Titus Andronicus', 'Antony and Cleopatra', 'Hamlet'}


### Let's generate our TFIDF data for future work and generate a TFIDF-weighted term by document matrix for queries.

In [None]:
# Compute TF-IDF values for the Shakespeare index.
inv_ind.calcTFIDF()
# The matrix call is kept for whatever state it prepares for later queries.
# The trailing semicolon stops the notebook from echoing its return value,
# which at ~19k terms across 39 works would swamp the output area.
inv_ind.generate_term_by_doc_matrix(tfidf=True);

### Now, let's do a query about a Shakespeare play...

In [21]:
# Query the Shakespeare index with tfidf=True and print every result on its
# own line (the recorded output shows (title, score) pairs).
query = "scotland kings and thanes"
results = inv_ind.search(query, tfidf=True)
for hit in results:
    print(hit)

('Macbeth', 0.08559316237351267)
('King Henry IV', 0.005789261723483593)
('King Henry VI', 0.003660436049077642)
('King Henry IV, II', 0.003121709934588564)
('King Henry V', 0.0019193400093131976)
('King Richard III', 0.0013431147327243318)
('King John', 0.0007488196759316429)
('King Richard II', 0.0006742482860831404)
('King Henry VIII', 0.0005161221482695165)
('The Comedy of Errors', 0.00046244490997942336)
('King Lear', 0.00035091625086910766)
("All's Well that Ends Well", 0.00020963725222313367)
('Hamlet', 0.00015544308715244624)
('The Rape of Lucrece', 0.00013990120010572916)
("The Winter's Tale", 0.00010914122054322648)
('Pericles, Prince of Tyre', 8.315765037798248e-05)
('The Tempest', 7.328272381188834e-05)
('Collection of Shakespeare Sonnets', 5.777755663378052e-05)
('Cymbeline', 5.30782444560166e-05)
("Love's Labour's Lost", 5.26090429516884e-05)
('Venus and Adonis', 4.704718099129693e-05)
('Antony and Cleopatra', 2.2354760703677766e-05)
('Titus Andronicus', 1.559695452052161

### Is there anything weird going on here?

In [22]:
# Which documents contain "King"? Printing the set lets its size be compared
# with the 39 works in the corpus (the recorded output lists 36 titles).
king_docs = inv_ind.get_document_set_from_term("King")
print(king_docs)

{'Measure for Measure', 'King Henry VIII', "A Midsummer Night's Dream", 'The Merchant of Venice', 'Collection of Shakespeare Sonnets', 'The Taming of the Shrew', 'Titus Andronicus', 'King Henry VI', 'The Rape of Lucrece', 'Macbeth', 'King Richard II', 'Much Ado About Nothing', 'Twelfth Night', 'King John', "Love's Labour's Lost", 'The Two Gentlemen of Verona', 'As You Like It', 'Pericles, Prince of Tyre', 'Cymbeline', 'Venus and Adonis', 'Antony and Cleopatra', 'The Merry Wives of Windsor', 'Romeo and Juliet', "The Winter's Tale", 'Othello', 'King Henry IV', 'The Tempest', "All's Well that Ends Well", 'King Lear', 'Hamlet', 'Julius Caesar', 'Coriolanus', 'King Henry IV, II', 'Troilus and Cressida', 'King Henry V', 'King Richard III'}


In [23]:
# Postings list for "King". In the recorded output each entry looks like
# [title, count, weight, None], and weight == count * ln(39 / 36): the term
# occurs in 36 of the 39 works, so each occurrence contributes very little.
# NOTE(review): field meanings are inferred from the printed output — confirm
# against InvertedIndex.
king_pl = inv_ind.get_postings_list_from_term("King")
print(king_pl)

[['Julius Caesar', 4, 0.3201708306941454, None], ['Othello', 1, 0.08004270767353636, None], ["A Midsummer Night's Dream", 6, 0.48025624604121814, None], ['Troilus and Cressida', 11, 0.8804697844088999, None], ['King Richard II', 289, 23.132342517652006, None], ['King Henry IV, II', 141, 11.286021781968627, None], ['Titus Andronicus', 13, 1.0405551997559725, None], ['Much Ado About Nothing', 1, 0.08004270767353636, None], ["Love's Labour's Lost", 40, 3.2017083069414545, None], ['The Two Gentlemen of Verona', 3, 0.24012812302060907, None], ['Cymbeline', 45, 3.601921845309136, None], ["All's Well that Ends Well", 136, 10.885808243600945, None], ['Twelfth Night', 3, 0.24012812302060907, None], ['King Lear', 310, 24.813239378796272, None], ['The Tempest', 42, 3.361793722288527, None], ['Macbeth', 47, 3.7620072606562087, None], ['Venus and Adonis', 3, 0.24012812302060907, None], ['King Henry VIII', 254, 20.330847749078234, None], ['The Merchant of Venice', 5, 0.4002135383676818, None], ['Kin

In [24]:
# Postings list for "Scotland". The recorded output shows 7 works, with each
# occurrence weighted at about 1.72 (matches ln(39 / 7)); Macbeth has the most
# occurrences (15).
# NOTE(review): field meanings are inferred from the printed output — confirm
# against InvertedIndex.
scotland_pl = inv_ind.get_postings_list_from_term("Scotland")
print(scotland_pl)

[['King Henry IV, II', 3, 5.152954491222999, None], ['The Comedy of Errors', 1, 1.717651497074333, None], ['Macbeth', 15, 25.764772456114997, None], ['King Henry VI', 4, 6.870605988297332, None], ['King Henry V', 1, 1.717651497074333, None], ['King Henry IV', 6, 10.305908982445999, None], ['King Richard III', 1, 1.717651497074333, None]]


In [25]:
# Postings list for "Thane". In the recorded output it appears only in Macbeth
# (31 occurrences), so each occurrence gets the largest weight seen here,
# about 3.66 (matches ln(39 / 1)).
# NOTE(review): field meanings are inferred from the printed output — confirm
# against InvertedIndex.
thane_pl = inv_ind.get_postings_list_from_term("Thane")
print(thane_pl)

[['Macbeth', 31, 113.57041103001903, None]]


[]
