# Importing the necessary libraries

In [1]:
from inverted_index import InvertedIndex
import nltk
from utils import read_data
# Fetch the NLTK stop-word corpus; when it is already present locally NLTK
# just reports that the package is up to date.
nltk.download('stopwords')
# Single index instance shared by every cell in this notebook.
inv_ind = InvertedIndex()

# Initialization done

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ivanyanakiev1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## We will now proceed to read the documents from the data folder

In [None]:
# Read the documents from the ./shakespeare data folder (read_data lives in
# the project's utils module).
documents = read_data("./shakespeare")
# Print the first document. Indexing is zero-based, so the first entry is
# documents[0]; documents[1] would show the second document instead.
print(documents[0])

## Print the number of documents as well as each document's title

In [None]:
# Report how many documents were loaded.
print(len(documents))
# The title is stored in the first slot of each document record.
for doc in documents:
    print(doc[0])

## Add documents to the inverted index

In [4]:
# Feed every loaded document into the shared inverted index, one at a time.
for doc in documents:
    inv_ind.add_document(doc)

## Print some descriptive statistics so that we can verify everything works

In [5]:
# Total number of terms known to the index.
total_terms = inv_ind.get_total_terms()
print(total_terms)
# Total number of documents that were added to the index.
total_docs = inv_ind.get_total_docs()
print(total_docs)

19202
39


## Generate a term by document matrix using log entropy

## Explanation of Log-Entropy and the 2 components of it
Log-Entropy weighting is based on a statistical analysis of probabilities: it computes a "surprise index" for the occurrence of a given event. For example,
if a certain event has a 90% chance of occurring, then the Log-Entropy of that event will be low, since the surprise factor is low.

### Component 1
1. This is the logarithm of the term frequency of term i in document j. The term frequency can be thought of as describing how likely term i is to occur in document j.
Since a term may occur many times in one document (or not at all), we multiply the natural log of its frequency by the next component.

### Component 2
2. The second component can be interpreted as the actual amount of surprise, given a discrete variable X and its probability P(X). In order to compute the surprise
we need the frequency of the term in the current document relative to its total frequency across all documents. This number is at most 1 and can be interpreted as yet another probability of occurrence of the discrete variable X, or in our case the term. We then divide by the log of the total number of documents, and the final value represents the surprise of the occurrence of term i. If the surprise is low then the probability was high; if the surprise is high then the probability was low.


In [None]:
# Order matters here: calcLogEntropy() runs first, then the matrix is built
# with the log_entropy flag set.
# NOTE(review): presumably calcLogEntropy() precomputes the weights that the
# matrix build consumes -- both methods live in inverted_index; confirm there.
inv_ind.calcLogEntropy()
inv_ind.generate_term_by_doc_matrix(log_entropy=True)

### Perform a search query: "scotland kings and thanes" using the Log-Entropy weighting scheme

In [7]:
# Run the query with the log-entropy and cosine-comparison flags enabled.
result = inv_ind.search(
    "scotland kings and thanes",
    log_entropy=True,
    cos_com=True,
)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('Macbeth', 0.07769052662351016)
('King Henry VI', 0.031869310554797074)
('King Henry IV', 0.030208435869742898)
('King Henry IV, II', 0.028145973814360462)
('King John', 0.027447611557131362)
('King Henry V', 0.027437757231046665)
('King Richard III', 0.026833074525408347)
('King Richard II', 0.02678812593487038)
('King Henry VIII', 0.02590799663639283)
("All's Well that Ends Well", 0.025433109729848902)


## Generate term by document matrix using the TF model only

In [None]:
# Build the term-by-document matrix with no weighting flags, i.e. the
# notebook's plain term-frequency (TF) setup.
inv_ind.generate_term_by_doc_matrix()

### Test on the same data set with the same query and print top 10 results

In [9]:
# Same query as before, now with only the cosine-comparison flag set (TF run).
result = inv_ind.search("scotland kings and thanes", cos_com=True)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('King Henry V', 0.2659215354074205)
('King Henry VI', 0.2617837075388672)
('King John', 0.2472493685181954)
('King Richard II', 0.2253953514359277)
('King Lear', 0.20415867029886436)
('King Henry VIII', 0.1998881836179047)
('King Richard III', 0.18418950223762484)
('Hamlet', 0.1241932011402115)
("All's Well that Ends Well", 0.11190811971898096)
('King Henry IV', 0.10791586179996365)


## Generate term by doc matrix using the TF-IDF model

In [None]:
# Order matters here: calcTFIDF() runs first, then the matrix is built with
# the tfidf flag set.
# NOTE(review): presumably calcTFIDF() precomputes the TF-IDF weights that the
# matrix build consumes -- both methods live in inverted_index; confirm there.
inv_ind.calcTFIDF()
inv_ind.generate_term_by_doc_matrix(tfidf=True)

### Test again on the same query and print the top 10 results

In [13]:
# Run the query with the TF-IDF and cosine-comparison flags enabled.
result = inv_ind.search(
    "scotland kings and thanes",
    tfidf=True,
    cos_com=True,
)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('Macbeth', 0.08559316237351267)
('King Henry IV', 0.005789261723483593)
('King Henry VI', 0.003660436049077642)
('King Henry IV, II', 0.003121709934588564)
('King Henry V', 0.0019193400093131976)
('King Richard III', 0.0013431147327243318)
('King John', 0.0007488196759316429)
('King Richard II', 0.0006742482860831404)
('King Henry VIII', 0.0005161221482695165)
('The Comedy of Errors', 0.00046244490997942336)


## Does Log-Entropy work better or worse than TF and TF-IDF?
1. It performs better than TF, since TF tends to favor longer documents: it looks only at the term frequency, so relative global frequencies are ignored.
2. Log-Entropy is calculated from probabilities and tries to determine the surprise of seeing a term; its calculation accounts for more factors,
such as the frequency in the current document versus the total frequency.
3. Looking at the test data from the example shown in class, TF-IDF can be said to be comparable to Log-Entropy.
The results are very similar in nature and the rankings are nearly the same, with only a few swaps here and there.

### Conclusion
From the results we can conclude that the Log-Entropy model definitely produces more accurate results than the TF-only model, and the data suggests a slight improvement over the TF-IDF model.

# Start of Part B: Comparison Metrics

## Results for "scotland kings and thanes" using TF method with Euclidean Distance comparison

In [14]:
# Rebuild the plain TF term-by-document matrix (no weighting flags).
inv_ind.generate_term_by_doc_matrix()
# This cell is the Euclidean-distance experiment, so the Euclidean flag must
# be passed; the original passed cos_com=True and silently repeated the
# cosine run from Part A.
result = inv_ind.search("scotland kings and thanes", euc_com=True)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('King Henry V', 0.2659215354074205)
('King Henry VI', 0.2617837075388672)
('King John', 0.2472493685181954)
('King Richard II', 0.2253953514359277)
('King Lear', 0.20415867029886436)
('King Henry VIII', 0.1998881836179047)
('King Richard III', 0.18418950223762484)
('Hamlet', 0.1241932011402115)
("All's Well that Ends Well", 0.11190811971898096)
('King Henry IV', 0.10791586179996365)


## Result for "scotland kings and thanes" using TF method with Pearson Correlation comparison

In [15]:
# Rebuild the plain TF term-by-document matrix (no weighting flags).
inv_ind.generate_term_by_doc_matrix()
# Query with the Pearson comparison flag enabled.
result = inv_ind.search("scotland kings and thanes", pear_com=True)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('King Henry V', 0.2659215354074205)
('King Henry VI', 0.2617837075388672)
('King John', 0.2472493685181954)
('King Richard II', 0.2253953514359277)
('King Lear', 0.20415867029886436)
('King Henry VIII', 0.1998881836179047)
('King Richard III', 0.18418950223762484)
('Hamlet', 0.1241932011402115)
("All's Well that Ends Well", 0.11190811971898096)
('King Henry IV', 0.10791586179996365)


## Results for "scotland kings and thanes" using TF method with Spearman Correlation comparison

In [16]:
# Rebuild the plain TF term-by-document matrix (no weighting flags).
inv_ind.generate_term_by_doc_matrix()
# Query with the Spearman comparison flag enabled.
result = inv_ind.search("scotland kings and thanes", spear_com=True)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('King Henry V', 0.2659215354074205)
('King Henry VI', 0.2617837075388672)
('King John', 0.2472493685181954)
('King Richard II', 0.2253953514359277)
('King Lear', 0.20415867029886436)
('King Henry VIII', 0.1998881836179047)
('King Richard III', 0.18418950223762484)
('Hamlet', 0.1241932011402115)
("All's Well that Ends Well", 0.11190811971898096)
('King Henry IV', 0.10791586179996365)


## Results for "scotland kings and thanes" using TF method with Kendall tau Correlation comparison

In [17]:
# Rebuild the plain TF term-by-document matrix (no weighting flags).
inv_ind.generate_term_by_doc_matrix()
# Query with the Kendall tau comparison flag enabled.
result = inv_ind.search("scotland kings and thanes", kend_com=True)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('King Henry V', 0.2659215354074205)
('King Henry VI', 0.2617837075388672)
('King John', 0.2472493685181954)
('King Richard II', 0.2253953514359277)
('King Lear', 0.20415867029886436)
('King Henry VIII', 0.1998881836179047)
('King Richard III', 0.18418950223762484)
('Hamlet', 0.1241932011402115)
("All's Well that Ends Well", 0.11190811971898096)
('King Henry IV', 0.10791586179996365)


## Results for "scotland kings and thanes" using TF-IDF method with Cosine comparison

In [19]:
# Recompute TF-IDF and rebuild the matrix with the tfidf flag set.
inv_ind.calcTFIDF()
inv_ind.generate_term_by_doc_matrix(tfidf=True)
# Query with the TF-IDF and cosine-comparison flags enabled.
result = inv_ind.search(
    "scotland kings and thanes",
    tfidf=True,
    cos_com=True,
)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('Macbeth', 0.08559316237351267)
('King Henry IV', 0.005789261723483593)
('King Henry VI', 0.003660436049077642)
('King Henry IV, II', 0.003121709934588564)
('King Henry V', 0.0019193400093131976)
('King Richard III', 0.0013431147327243318)
('King John', 0.0007488196759316429)
('King Richard II', 0.0006742482860831404)
('King Henry VIII', 0.0005161221482695165)
('The Comedy of Errors', 0.00046244490997942336)


## Results for "scotland kings and thanes" using TF-IDF method with Euclidean Distance comparison

In [20]:
# Recompute TF-IDF and rebuild the matrix with the tfidf flag set.
inv_ind.calcTFIDF()
inv_ind.generate_term_by_doc_matrix(tfidf=True)
# Query with the TF-IDF and Euclidean-comparison flags enabled.
result = inv_ind.search(
    "scotland kings and thanes",
    tfidf=True,
    euc_com=True,
)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('Macbeth', 0.08559316237351267)
('King Henry IV', 0.005789261723483593)
('King Henry VI', 0.003660436049077642)
('King Henry IV, II', 0.003121709934588564)
('King Henry V', 0.0019193400093131976)
('King Richard III', 0.0013431147327243318)
('King John', 0.0007488196759316429)
('King Richard II', 0.0006742482860831404)
('King Henry VIII', 0.0005161221482695165)
('The Comedy of Errors', 0.00046244490997942336)


## Results for "scotland kings and thanes" using TF-IDF method with Pearson Correlation comparison

In [21]:
# Recompute TF-IDF and rebuild the matrix with the tfidf flag set.
inv_ind.calcTFIDF()
inv_ind.generate_term_by_doc_matrix(tfidf=True)
# Query with the TF-IDF and Pearson-comparison flags enabled.
result = inv_ind.search(
    "scotland kings and thanes",
    tfidf=True,
    pear_com=True,
)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('Macbeth', 0.08559316237351267)
('King Henry IV', 0.005789261723483593)
('King Henry VI', 0.003660436049077642)
('King Henry IV, II', 0.003121709934588564)
('King Henry V', 0.0019193400093131976)
('King Richard III', 0.0013431147327243318)
('King John', 0.0007488196759316429)
('King Richard II', 0.0006742482860831404)
('King Henry VIII', 0.0005161221482695165)
('The Comedy of Errors', 0.00046244490997942336)


## Results for "scotland kings and thanes" using TF-IDF method with Spearman Correlation comparison

In [22]:
# Recompute TF-IDF and rebuild the matrix with the tfidf flag set.
inv_ind.calcTFIDF()
inv_ind.generate_term_by_doc_matrix(tfidf=True)
# Query with the TF-IDF and Spearman-comparison flags enabled.
result = inv_ind.search(
    "scotland kings and thanes",
    tfidf=True,
    spear_com=True,
)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('Macbeth', 0.08559316237351267)
('King Henry IV', 0.005789261723483593)
('King Henry VI', 0.003660436049077642)
('King Henry IV, II', 0.003121709934588564)
('King Henry V', 0.0019193400093131976)
('King Richard III', 0.0013431147327243318)
('King John', 0.0007488196759316429)
('King Richard II', 0.0006742482860831404)
('King Henry VIII', 0.0005161221482695165)
('The Comedy of Errors', 0.00046244490997942336)


## Results for "scotland kings and thanes" using TF-IDF method with Kendall tau Correlation comparison

In [23]:
# Recompute TF-IDF and rebuild the matrix with the tfidf flag set.
inv_ind.calcTFIDF()
inv_ind.generate_term_by_doc_matrix(tfidf=True)
# Query with the TF-IDF and Kendall tau comparison flags enabled.
result = inv_ind.search(
    "scotland kings and thanes",
    tfidf=True,
    kend_com=True,
)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('Macbeth', 0.08559316237351267)
('King Henry IV', 0.005789261723483593)
('King Henry VI', 0.003660436049077642)
('King Henry IV, II', 0.003121709934588564)
('King Henry V', 0.0019193400093131976)
('King Richard III', 0.0013431147327243318)
('King John', 0.0007488196759316429)
('King Richard II', 0.0006742482860831404)
('King Henry VIII', 0.0005161221482695165)
('The Comedy of Errors', 0.00046244490997942336)


## Results for "scotland kings and thanes" using Log-Entropy with Cosine comparison

In [24]:
# Recompute log-entropy and rebuild the matrix with the log_entropy flag set.
inv_ind.calcLogEntropy()
inv_ind.generate_term_by_doc_matrix(log_entropy=True)
# Query with the log-entropy and cosine-comparison flags enabled.
result = inv_ind.search(
    "scotland kings and thanes",
    log_entropy=True,
    cos_com=True,
)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('Macbeth', 0.07769052662351016)
('King Henry VI', 0.031869310554797074)
('King Henry IV', 0.030208435869742898)
('King Henry IV, II', 0.028145973814360462)
('King John', 0.027447611557131362)
('King Henry V', 0.027437757231046665)
('King Richard III', 0.026833074525408347)
('King Richard II', 0.02678812593487038)
('King Henry VIII', 0.02590799663639283)
("All's Well that Ends Well", 0.025433109729848902)


## Results for "scotland kings and thanes" using Log-Entropy with Pearson Correlation comparison

In [25]:
# Recompute log-entropy and rebuild the matrix with the log_entropy flag set.
inv_ind.calcLogEntropy()
inv_ind.generate_term_by_doc_matrix(log_entropy=True)
# Query with the log-entropy and Pearson-comparison flags enabled.
result = inv_ind.search(
    "scotland kings and thanes",
    log_entropy=True,
    pear_com=True,
)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('Macbeth', 0.07769052662351016)
('King Henry VI', 0.031869310554797074)
('King Henry IV', 0.030208435869742898)
('King Henry IV, II', 0.028145973814360462)
('King John', 0.027447611557131362)
('King Henry V', 0.027437757231046665)
('King Richard III', 0.026833074525408347)
('King Richard II', 0.02678812593487038)
('King Henry VIII', 0.02590799663639283)
("All's Well that Ends Well", 0.025433109729848902)


## Results for "scotland kings and thanes" using Log-Entropy with Spearman Correlation comparison

In [26]:
# Recompute log-entropy and rebuild the matrix with the log_entropy flag set.
inv_ind.calcLogEntropy()
inv_ind.generate_term_by_doc_matrix(log_entropy=True)
# Query with the log-entropy and Spearman-comparison flags enabled.
result = inv_ind.search(
    "scotland kings and thanes",
    log_entropy=True,
    spear_com=True,
)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('Macbeth', 0.07769052662351016)
('King Henry VI', 0.031869310554797074)
('King Henry IV', 0.030208435869742898)
('King Henry IV, II', 0.028145973814360462)
('King John', 0.027447611557131362)
('King Henry V', 0.027437757231046665)
('King Richard III', 0.026833074525408347)
('King Richard II', 0.02678812593487038)
('King Henry VIII', 0.02590799663639283)
("All's Well that Ends Well", 0.025433109729848902)


## Results for "scotland kings and thanes" using Log-Entropy with Kendall tau Correlation comparison

In [27]:
# Recompute log-entropy and rebuild the matrix with the log_entropy flag set.
inv_ind.calcLogEntropy()
inv_ind.generate_term_by_doc_matrix(log_entropy=True)
# Query with the log-entropy and Kendall tau comparison flags enabled.
result = inv_ind.search(
    "scotland kings and thanes",
    log_entropy=True,
    kend_com=True,
)
# Show the ten top-ranked (title, score) entries.
for rank in range(10):
    print(result[rank])

('Macbeth', 0.07769052662351016)
('King Henry VI', 0.031869310554797074)
('King Henry IV', 0.030208435869742898)
('King Henry IV, II', 0.028145973814360462)
('King John', 0.027447611557131362)
('King Henry V', 0.027437757231046665)
('King Richard III', 0.026833074525408347)
('King Richard II', 0.02678812593487038)
('King Henry VIII', 0.02590799663639283)
("All's Well that Ends Well", 0.025433109729848902)


## Results from the overall experiment
1. Euclidean distance is the most prone to producing outlier results.
2. All of the other methods (Spearman, Pearson, Kendall tau) produce nearly the same results as the cosine comparison, so we can conclude that
the choice between them generally makes little difference. We attribute this extreme similarity between the methods to the length of the vectors:
below we can see that on smaller vectors the methods do differ, even if only slightly. However, since our values are small (the range is probably between
0 and 3) and the vectors are long, the difference becomes much smaller, to the point of essentially no difference between the methods.
### NOTE: I am not sure this is exactly true, because the Jupyter notebook does not behave reliably across restarts
Euclidean seems to have outliers, but perhaps the consistent offsets and long vectors cancel out in the end and give the same output.

In [None]:
# Try every comparison helper on the same pair of small 3-element vectors.
a = [1, 2, 3]
b = [4, 5, 6]
# Same order as before: cosine, Pearson, Spearman, Euclidean, Kendall tau.
# Each helper is looked up and called just before its result is printed.
for metric in ("cosine", "pearson", "spearman", "euclidian", "kendalltau"):
    print(getattr(inv_ind, f"{metric}_comparison")(a, b))