In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
import os, sys
from pathlib import Path as pathl
from pdf_parser import pipeline

In [39]:
import spacy
# Load the spaCy 'en_core_web_sm' pipeline once; `lemmatizer` reuses this object for every call.
nlp = spacy.load('en_core_web_sm')
# Characters that `lemmatizer` treats as punctuation: the ASCII punctuation
# set plus the newline character. The backslash is spelled `\\` because `\]`
# is not a valid escape sequence and triggers a warning on current Python.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n'

In [40]:
# Append the parent of the current working directory to sys.path so that
# modules located there can be imported.
new_path = pathl('.')
# resolve() turns the relative '.' into an absolute path before taking its parent
parent = new_path.resolve().parent
sys.path.append(str(parent))

In [41]:
# Collect every file below <parent>/Data, recursing into sub-directories.
# The directory is derived from `parent` directly rather than from
# `sys.path[-1]`, which points somewhere else as soon as anything is appended
# to sys.path after `parent`.
# Names are stored relative to the Data directory so that files found in
# sub-directories can still be located from it; files at the top level keep
# their bare name. The order is whatever os.walk yields.
walk_root = str(parent / 'Data')
current_dir = os.walk(walk_root)
files = [
    os.path.relpath(os.path.join(root, name), walk_root)
    for root, _subdirs, names in current_dir
    for name in names
]

In [42]:
# The `Data` directory directly under `parent`; `get_text` resolves file names against it.
data_path = parent.joinpath('Data')
def get_text(filename):
    """Return the text of one file in ``data_path`` as a single string.

    The file is passed to ``pipeline`` (imported from ``pdf_parser``). Its
    result is read here as a mapping whose values are sequences of strings.
    Each string is stripped of surrounding whitespace and the pieces are
    joined with single spaces, in the order the mapping yields them.

    Parameters
    ----------
    filename : str
        Name of the file, relative to ``data_path``.

    Returns
    -------
    str
        The concatenated text; an empty string when there are no pieces.
    """
    pdf_file = data_path / filename
    text_dict = pipeline(filepath=str(pdf_file))
    # Flatten in one pass. sum(list_of_lists, []) would copy the accumulated
    # list on every addition (quadratic work) and only accepts list values.
    return ' '.join(
        sentence.strip()
        for sentences in text_dict.values()
        for sentence in sentences
    )

def lemmatizer(text):
    """Lemmatize each text with the spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : iterable of str, or str
        The texts to process. A bare string is treated as a single text.

    Returns
    -------
    list of str
        One string per input text: the lemmas of its tokens joined by single
        spaces. Tokens made up only of characters from ``punctuation`` are
        left out, as are tokens whose lemma is empty after stripping
        (whitespace tokens).
    """
    if isinstance(text, str):
        # nlp.pipe takes an iterable of texts; wrap a bare string so it is
        # handled as one text instead of being iterated character by character.
        text = [text]

    lemmatized = []
    for doc in nlp.pipe(text):
        lemmas = []
        for token in doc:
            # Character-wise check: the substring test `str(token) in punctuation`
            # lets multi-character punctuation such as '...' or '--' through.
            if all(char in punctuation for char in str(token)):
                continue
            lemma = token.lemma_.strip()
            # Whitespace tokens strip down to '' and would add stray spaces.
            if lemma:
                lemmas.append(lemma)
        lemmatized.append(' '.join(lemmas))
    return lemmatized

In [70]:
vectorizer = TfidfVectorizer(min_df=1)

# Build one lemmatized string per PDF listed in `files`.
corpus = []
for file in files:
    # Require a real '.pdf' extension in any letter case. The slice test
    # `file[-3:] == 'pdf'` also accepted names that merely end in "pdf"
    # and rejected '.PDF'.
    if not file.lower().endswith('.pdf'):
        continue
    # Rough sentence split on '.', so each piece is lemmatized as its own text
    text = get_text(file).split('.')
    lemmatized = lemmatizer(text)
    corpus.append(' '.join(lemmatized))

In [71]:
# Fit the TF-IDF vectorizer on the corpus and transform the corpus in one step;
# `model` holds one row per entry of `corpus`.
model = vectorizer.fit_transform(corpus)
# Dense copy of the same matrix
dense = model.todense()

In [72]:
# Cosine similarity between the TF-IDF rows of corpus[0] and corpus[1]
cosine_similarity(model[0], model[1])

array([[0.62116493]])

In [73]:
# Cosine similarity between the TF-IDF rows of corpus[0] and corpus[2]
cosine_similarity(model[0], model[2])

array([[0.64637026]])

In [74]:
# Cosine similarity between the TF-IDF rows of corpus[1] and corpus[2]
cosine_similarity(model[1], model[2])

array([[0.64236199]])