# Importing the necessary libraries

In [None]:
from inverted_index import InvertedIndex
import nltk
from utils import read_data
# Fetch the NLTK 'stopwords' corpus before the index is created
# (presumably InvertedIndex uses it for stop-word filtering — TODO confirm).
nltk.download('stopwords')
# Create the index instance that all later cells in this notebook operate on.
inv_ind = InvertedIndex()

# Initialization done

## We will now proceed to read the documents from the data folder

In [None]:
# Load every document found under the Shakespeare data folder.
documents = read_data("./shakespeare")
# Print the first document. Sequences are 0-indexed, so the first entry is
# documents[0]; the previous documents[1] printed the second one.
print(documents[0])

## Print the number of the documents as well as their document title

In [None]:
# Report how many documents were loaded, then list one title per line.
print(len(documents))
for doc in documents:
    # Element 0 of each document entry is its title.
    print(doc[0])

## Add documents to the inverted index

In [None]:
# Feed each loaded document into the inverted index, one at a time.
for doc in documents:
    inv_ind.add_document(doc)

## Print some descriptive statistics so that we can verify everything works

In [None]:
# Sanity check: show the index's total term count, then its total document count.
total_terms = inv_ind.get_total_terms()
print(total_terms)
total_docs = inv_ind.get_total_docs()
print(total_docs)

## Generate a term by document matrix using log entropy

## Explanation of Log-Entropy and the 2 components of it
Log-Entropy is a weighting scheme based on a statistical analysis of probabilities: it calculates a "surprise" index for the occurrence of a certain event. For example,
if a certain event has a 90% chance of occurring, then the Log-Entropy of that event will be low, since the surprise factor will be low.

### Component 1
1. This is the logarithm of the frequency of term i in document j. The term frequency can be better described as the probability of term i occurring in document j.
Since the term may occur multiple times throughout one document (or not at all), we take the natural log of its frequency and multiply it by the next component.

### Component 2
2. The second term can be interpreted as the actual amount of surprise given a discrete variable X and its probability P(X). In order to compute the surprise
we need the frequency of the term in the current document relative to its total frequency. This number will of course be at most 1 and can be interpreted as yet another probability of occurrence of the discrete variable X, or in our case the term. We then divide by the log of the total number of documents, and the final value that we obtain represents the surprise of the occurrence of term i. If the surprise is low then the probability was high; if the surprise is high then the probability was low.


In [None]:
# Compute the log-entropy weights, then build the term-by-document matrix
# with the log-entropy flag switched on.
# NOTE(review): presumably calcLogEntropy() must run before the matrix is
# generated — confirm against InvertedIndex.
inv_ind.calcLogEntropy()
inv_ind.generate_term_by_doc_matrix(log_entropy=True)

### Perform a search query: "scotland kings and thanes" using the Log-Entropy weighting scheme

In [None]:
# Run the query with log-entropy weighting and cosine comparison.
result = inv_ind.search(
    "scotland kings and thanes",
    log_entropy=True,
    cos_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Generate term by document matrix using the TF model only

In [None]:
# Rebuild the term-by-document matrix with default arguments (no weighting
# flag is passed). Per the section heading this is the plain TF model —
# presumably the method's default; confirm against InvertedIndex.
inv_ind.generate_term_by_doc_matrix()

### Test on the same data set with the same query and print top 10 results

In [None]:
# Same query as before, this time with only the cosine comparison flag set.
result = inv_ind.search(
    "scotland kings and thanes",
    cos_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Generate term by doc matrix using the TF-IDF model

In [None]:
# Compute the TF-IDF weights, then rebuild the term-by-document matrix.
# NOTE(review): the log-entropy cell passes log_entropy=True to
# generate_term_by_doc_matrix(), but no weighting flag is passed here —
# confirm that the default call picks up the TF-IDF weights.
inv_ind.calcTFIDF()
inv_ind.generate_term_by_doc_matrix()

### Test again on the same query and print the top 10 results

In [None]:
# Run the query with TF-IDF weighting and cosine comparison.
result = inv_ind.search(
    "scotland kings and thanes",
    tfidf=True,
    cos_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Does Log-Entropy work better or worse than TF and TF-IDF?
1. It performs better than TF since TF tends to favor longer documents since it looks at only the term frequency, thus relative global frequencies are ignored
2. Log-Entropy is calculated based on probabilities and tries to determine the surprise of seeing a term, in its calculation more factors are accounted for
such as frequency in current doc vs total frequency
3. Looking at the test data from the example shown in class, we can say that TF-IDF is comparable to Log-Entropy in terms of results.
They are very similar in nature, and the rankings are nearly the same, with only a few positions swapped here and there.

### Conclusion
From the results we can conclude that the Log-Entropy model definitely produces more accurate results than the TF-only model, and the data suggests a slight improvement over the TF-IDF model.

# Start of Part B: Comparison Metrics

## Results for "scotland kings and thanes" using TF method with Euclidean Distance comparison

In [None]:
# Rebuild the matrix with default arguments, run the query and print the
# first ten entries of the result.
# NOTE(review): the section heading announces a Euclidean distance
# comparison, but the search below passes cos_com=True, i.e. the same call
# as the earlier TF + cosine cell. The Euclidean flag name is not visible
# here — look it up in InvertedIndex.search and replace cos_com.
inv_ind.generate_term_by_doc_matrix()
result = inv_ind.search("scotland kings and thanes",cos_com=True)
for i in range(0,10):
    print(result[i])

## Result for "scotland kings and thanes" using TF method with Pearson Correlation comparison

In [None]:
# Rebuild the matrix with default arguments, then query with the Pearson flag.
inv_ind.generate_term_by_doc_matrix()
result = inv_ind.search(
    "scotland kings and thanes",
    pear_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Results for "scotland kings and thanes" using TF method with Spearman Correlation comparison

In [None]:
# Rebuild the matrix with default arguments, then query with the Spearman flag.
inv_ind.generate_term_by_doc_matrix()
result = inv_ind.search(
    "scotland kings and thanes",
    spear_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Results for "scotland kings and thanes" using TF method with Kendalltau Correlation comparison

In [None]:
# Rebuild the matrix with default arguments, then query with the Kendall tau flag.
inv_ind.generate_term_by_doc_matrix()
result = inv_ind.search(
    "scotland kings and thanes",
    kend_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Results for "scotland kings and thanes" using TF-IDF method with Cosine comparison

In [None]:
# Recompute TF-IDF, rebuild the matrix, then query with TF-IDF + cosine.
inv_ind.calcTFIDF()
inv_ind.generate_term_by_doc_matrix()
result = inv_ind.search(
    "scotland kings and thanes",
    tfidf=True,
    cos_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Results for "scotland kings and thanes" using TF-IDF method with Euclidean Distance comparison

In [None]:
# Recompute TF-IDF, rebuild the matrix, run the query and print the first
# ten entries of the result.
# NOTE(review): the section heading announces a Euclidean distance
# comparison, but the search below passes cos_com=True, which makes this
# cell identical to the TF-IDF + cosine cell. The Euclidean flag name is
# not visible here — look it up in InvertedIndex.search and replace cos_com.
inv_ind.calcTFIDF()
inv_ind.generate_term_by_doc_matrix()
result = inv_ind.search("scotland kings and thanes",tfidf=True,cos_com=True)
for i in range(0,10):
    print(result[i])

## Results for "scotland kings and thanes" using TF-IDF method with Pearson Correlation comparison

In [None]:
# Recompute TF-IDF, rebuild the matrix, then query with TF-IDF + Pearson.
inv_ind.calcTFIDF()
inv_ind.generate_term_by_doc_matrix()
result = inv_ind.search(
    "scotland kings and thanes",
    tfidf=True,
    pear_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Results for "scotland kings and thanes" using TF-IDF method with Spearman Correlation comparison

In [None]:
# Recompute TF-IDF, rebuild the matrix, then query with TF-IDF + Spearman.
inv_ind.calcTFIDF()
inv_ind.generate_term_by_doc_matrix()
result = inv_ind.search(
    "scotland kings and thanes",
    tfidf=True,
    spear_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Results for "scotland kings and thanes" using TF-IDF method with Kendalltau Correlation comparison

In [None]:
# Recompute TF-IDF, rebuild the matrix, then query with TF-IDF + Kendall tau.
inv_ind.calcTFIDF()
inv_ind.generate_term_by_doc_matrix()
result = inv_ind.search(
    "scotland kings and thanes",
    tfidf=True,
    kend_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Results for "scotland kings and thanes" using Log-Entropy with Cosine comparison

In [None]:
# Recompute the log-entropy weights and rebuild the matrix with the
# log-entropy flag set, matching the first log-entropy cell of this notebook.
# Without the flag the matrix is presumably built with the default weighting
# while the query below is scored with log_entropy=True.
inv_ind.calcLogEntropy()
inv_ind.generate_term_by_doc_matrix(log_entropy=True)
result = inv_ind.search(
    "scotland kings and thanes",
    log_entropy=True,
    cos_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Results for "scotland kings and thanes" using Log-Entropy with Pearson Correlation comparison

In [None]:
# Recompute the log-entropy weights and rebuild the matrix with the
# log-entropy flag set, matching the first log-entropy cell of this notebook.
# Without the flag the matrix is presumably built with the default weighting
# while the query below is scored with log_entropy=True.
inv_ind.calcLogEntropy()
inv_ind.generate_term_by_doc_matrix(log_entropy=True)
result = inv_ind.search(
    "scotland kings and thanes",
    log_entropy=True,
    pear_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Results for "scotland kings and thanes" using Log-Entropy with Spearman Correlation comparison

In [None]:
# Recompute the log-entropy weights and rebuild the matrix with the
# log-entropy flag set, matching the first log-entropy cell of this notebook.
# Without the flag the matrix is presumably built with the default weighting
# while the query below is scored with log_entropy=True.
inv_ind.calcLogEntropy()
inv_ind.generate_term_by_doc_matrix(log_entropy=True)
result = inv_ind.search(
    "scotland kings and thanes",
    log_entropy=True,
    spear_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Results for "scotland kings and thanes" using Log-Entropy with Kendalltau Correlation comparison

In [None]:
# Recompute the log-entropy weights and rebuild the matrix with the
# log-entropy flag set, matching the first log-entropy cell of this notebook.
# Without the flag the matrix is presumably built with the default weighting
# while the query below is scored with log_entropy=True.
inv_ind.calcLogEntropy()
inv_ind.generate_term_by_doc_matrix(log_entropy=True)
result = inv_ind.search(
    "scotland kings and thanes",
    log_entropy=True,
    kend_com=True,
)
# Print the first ten entries of the result.
for rank in range(10):
    print(result[rank])

## Results from the overall experiment
1. Euclidean distance is the most prone to producing outlier results.
2. All of the other methods (Spearman, Pearson, Kendall Tau) produce nearly the same results as the cosine comparison, so we can conclude that
the difference between using one or the other should generally not be big.
### NOTE: these observations may not be exactly right, because the Jupyter notebook state is unreliable across restarts
Euclidean distance seems to produce outliers, but possibly, because of consistent offsets and long vectors, the differences cancel out in the end and it gives the same output.

In [None]:
# Sanity-check every comparison method on the same pair of small vectors.
a = [1, 2, 3]
b = [4, 5, 6]
# The methods are looked up and called one at a time, in the listed order.
for method_name in (
    "cosine_comparison",
    "pearson_comparison",
    "spearman_comparison",
    "euclidian_comparison",
    "kendalltau_comparison",
):
    print(getattr(inv_ind, method_name)(a, b))