In [2]:
from numpy import argmin
from read_pdf import *
from cli import *
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from preprocessing.TfidfTextPreprocessor import *
from query_documents_tfidf import *

In [3]:
file_paths = glob.glob('/Users/klara/Documents/Uni/bachelorarbeit/data/0/*.pdf')
docs = get_docs_from_file_paths(file_paths)

The following TFIDF model is supposed to be used to find similar documents in a large corpus and thus, ignores unique (per document) words in its vocabulary.
Moreover, it aims to keep its vocabulary reasonably small.

### Default preprocessing
This sklearn model automatically turns the text into lowercase and removes punctuation, numbers, accents, stopwords.
It does not use n-gram, n>1, since it has proven to reduce the number of dimensions in the vocabulary/ vector and moreover, it enlargens the vocabulary.
# TODO: check if it also stems/ lemmatize the words
However, it does not seem to find word groups (e.g. aa, aaa).


In [4]:
default_tfidf = TfidfVectorizer(input='content', lowercase=True, min_df=3, max_df=int(len(docs)*0.07), analyzer='word', stop_words='english', token_pattern=r'(?u)\b[A-Za-z]+\b', strip_accents='ascii')
default_tfidf.fit(docs)

In [5]:
vocab_per_preprocessor = {}
vocab_per_preprocessor['default'] = default_tfidf.get_feature_names_out()
print('vocabulary: ', vocab_per_preprocessor['default'], '\nnumber of elements of vocabulary: ', len(vocab_per_preprocessor['default']))

vocabulary:  ['TM' 'aa' 'aaa' ... 'zero' 'zl' 'zurich'] 
number of elements of vocabulary:  1641


In [6]:
print('max df of vocabulary: ', int(len(docs)*0.04))

max df of vocabulary:  7


In [7]:
default_document_term_matrix = default_tfidf.fit_transform(docs).todense()

In [8]:
# all zero tf-idf document embeddings
get_num_all_zero_tfidf_embeddings(default_document_term_matrix, file_paths)

/Users/klara/Documents/Uni/bachelorarbeit/data/0/SAC34-38.pdf is all zero
/Users/klara/Documents/Uni/bachelorarbeit/data/0/SAC86-17.pdf is all zero
number of documents with all zero tf-idf values: 2 from 195


### Custom Preprocessing
This custom preprocessing includes the following steps:
- strip accents
- remove newlines
- lowercase
- discretize numbers
- remove punctuation
- change number encoding
- remove stopwords (english)
- lemmatize
- return text from list of tokens

In [9]:
# Example of custom preprocessor
sample_text = docs[0][:100] + '12312312. Today it is sunny! 212. Today it is rainy 123.123'
print('original sample text:\n', sample_text)
preProc = TfidfTextPreprocessor()
preprocessed_text = preProc.fit_transform(sample_text)
print('\npreprocessed text:\n', preprocessed_text, '\n\n')

original sample text:
   
- THE COMMONWEALTH OF THE BAHAMAS 
The International Business Companies Act 2000; The Segregated 12312312. Today it is sunny! 212. Today it is rainy 123.123


[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



preprocessed text:
     commonwealth bahamas  international business company act - segregated + today sunny - today rainy / 




In [10]:
custom_tfidf = TfidfVectorizer(input='content', preprocessor=TfidfTextPreprocessor().fit_transform, min_df=3, max_df=int(len(docs)*0.07))
custom_document_term_matrix = custom_tfidf.fit_transform(docs).todense()

[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading 

In [11]:
vocab_per_preprocessor['custom'] = custom_tfidf.get_feature_names_out()
print('vocabulary: ', vocab_per_preprocessor['custom'], '\nnumber of elements of vocabulary: ', len(vocab_per_preprocessor['custom']))

vocabulary:  ['aa' 'aaa' 'ab' ... 'ze' 'zl' 'zurich'] 
number of elements of vocabulary:  1609


In [13]:
# all zero tf-idf document embeddings
get_num_all_zero_tfidf_embeddings(custom_document_term_matrix, file_paths)

/Users/klara/Documents/Uni/bachelorarbeit/data/0/SAC34-38.pdf is all zero
/Users/klara/Documents/Uni/bachelorarbeit/data/0/SAC86-17.pdf is all zero
number of documents with all zero tf-idf values: 2 from 195


### Comparison default vs custom preprocessing

In [14]:

print('The smallest vocabulary is: ', list(vocab_per_preprocessor.keys())[argmin([len(vocab_per_preprocessor[p]) for p in list(vocab_per_preprocessor.keys())])])

for p in list(vocab_per_preprocessor.keys()):
    print('# words in the vocabulary of preprocessor ', p, ' is: ', len(vocab_per_preprocessor[p]))

The smallest vocabulary is:  custom
# words in the vocabulary of preprocessor  default  is:  1641
# words in the vocabulary of preprocessor  custom  is:  1609


### General tfidf/ document (statistical) properties

In [15]:
# returns tf-idf values for the first document with token human readable, but SORTED (≠ document vectorization)
print(get_tfidf_per_doc(custom_tfidf, 0, custom_document_term_matrix))

                  tfidf
innovatis      0.522769
classof        0.261385
authorisation  0.249461
nonregistered  0.239719
ere            0.239719
...                 ...
eo             0.000000
envelope       0.000000
entitlement    0.000000
entirety       0.000000
zurich         0.000000

[1609 rows x 1 columns]


In [18]:
# document search engine using TF-IDF and cosine similarity
transformed_query = print_tfidf_transformation_example(tfidf=custom_tfidf, query='human readable Bahamas credit system')    # format: (doc_no, token_no) tfidf value
 
print_cosine_similarity_examples(transformed_query=transformed_query, document_term_matrix=custom_tfidf.fit_transform(docs))

[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading 

transformation to (document number, token encoding) tf-idf score
   (0, 383)	1.0


[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading 

similarity between documents in trainings corpus:
 [[1.         0.         0.         ... 0.         0.14857595 0.        ]
 [0.         1.         0.         ... 0.03029976 0.         0.        ]
 [0.         0.         1.         ... 0.00956945 0.00998179 0.        ]
 ...
 [0.         0.03029976 0.00956945 ... 1.         0.         0.        ]
 [0.14857595 0.         0.00998179 ... 0.         1.         0.0443241 ]
 [0.         0.         0.         ... 0.         0.0443241  1.        ]]
similarity between documents in trainings corpus and query:
 [[0.        ]
 [0.        ]
 [0.        ]
 [0.01368875]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.31801069]
 [0.        ]
 [0.        ]
 [0.15647429]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.      

[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/klara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading 