In [292]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [293]:
import os, sys
import numpy as np
from pathlib import Path as pathl
from pdf_parser import pipeline

In [294]:
import spacy
import pickle
# Load spaCy's small English pipeline once; the `lemmatizer` function reuses this shared object.
nlp = spacy.load('en_core_web_sm')
# Characters treated as punctuation when filtering tokens: the ASCII punctuation
# set followed by a newline. The backslash is written as '\\' because a bare '\]'
# is an invalid escape sequence (a warning today, slated to become an error);
# the resulting string value is unchanged.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n'

In [295]:
# Append the parent of the current working directory to the import search path.
new_path = pathl()
parent = new_path.resolve().parent
sys.path += [str(parent)]

In [296]:
# List every file below <parent>/Data/downloaded_files.
# The directory is derived from `parent` instead of sys.path[-1], so the listing
# does not depend on which entry happens to be last on the import path.
downloads_dir = parent / 'Data' / 'downloaded_files'
current_dir = os.walk(downloads_dir)
# Each entry is stored relative to downloads_dir: files directly inside it keep
# their bare name, while files in sub-directories keep the sub-path needed to
# locate them again. Flattening in a single pass avoids the quadratic sum(..., []).
# NOTE(review): os.walk order is filesystem-dependent, so slices of `files`
# are not guaranteed to be stable across machines.
files = [
    os.path.relpath(os.path.join(root, name), downloads_dir)
    for root, _subdirs, names in current_dir
    for name in names
]

In [301]:
data_path = parent / 'Data/downloaded_files'
def get_text(filename):
    """Return the text extracted from one file as a single string.

    Parameters
    ----------
    filename : str or path-like
        Name of the file, relative to ``data_path``.

    Returns
    -------
    str
        Every extracted string, stripped of surrounding whitespace and joined
        with single spaces.

    Notes
    -----
    Assumes ``pipeline`` returns a dict whose values are sequences of strings
    (inferred from how the result is consumed here) -- TODO confirm against
    ``pdf_parser``.
    """
    pdf_file = data_path / filename
    text_dict = pipeline(filepath=str(pdf_file))
    # Flatten the per-key sequences in one pass; the previous sum(list, [])
    # re-copied the accumulated list on every addition.
    sentences = [
        sentence.strip()
        for chunk in text_dict.values()
        for sentence in chunk
    ]
    return ' '.join(sentences)

def lemmatizer(text):
    """Lemmatize an iterable of strings with the shared spaCy pipeline.

    Parameters
    ----------
    text : iterable of str
        Strings to process; each one is handled as its own spaCy document.
        Passing a single ``str`` would be iterated character by character,
        so wrap it in a list.

    Returns
    -------
    list of str
        One string per input item, made of the space-joined lemmas of its
        tokens with punctuation-only tokens removed.
    """
    # A set gives per-character membership. Testing the whole token against the
    # punctuation *string* was a substring check, which dropped tokens such as
    # '<=' (adjacent in the string) but kept ones such as '...'.
    punct_chars = set(punctuation)
    lemmatized = []
    for doc in nlp.pipe(text):
        lemmas = []
        for token in doc:
            # Skip tokens consisting solely of punctuation characters.
            if all(ch in punct_chars for ch in str(token)):
                continue
            lemma = token.lemma_.strip()
            # Whitespace-only tokens strip down to '', which would only add
            # stray spaces to the joined output.
            if lemma:
                lemmas.append(lemma)
        lemmatized.append(' '.join(lemmas))
    return lemmatized

In [308]:
vectorizer = TfidfVectorizer(min_df=1)
corpus = []

# Build one lemmatized document per PDF among the first 10 listed files.
for file in files[:10]:
    # Match on the real '.pdf' extension, ignoring case; comparing the last
    # three characters also accepted names without a dot and missed '.PDF'.
    if not file.lower().endswith('.pdf'):
        continue
    # Split on '.' so each rough sentence is passed to the lemmatizer separately.
    sentences = get_text(file).split('.')
    corpus.append(' '.join(lemmatizer(sentences)))



In [310]:
len(corpus)

10

In [392]:
from itertools import combinations_with_replacement

In [345]:
# Keep only documents whose text is longer than one character.
corpus = list(filter(lambda document: len(document) > 1, corpus))

In [346]:
# Fit the TF-IDF vectorizer on the corpus and transform it in one step;
# the result holds one row per document in `corpus`.
model = vectorizer.fit_transform(corpus)
# Dense copy of the same matrix, used by later cells.
# NOTE(review): for a scipy sparse matrix, todense() presumably yields a
# numpy.matrix rather than an ndarray -- confirm before passing it to other libraries.
dense = model.todense()

In [403]:
len(dense)

6

In [437]:
# Cosine similarity between every pair of documents, computed in one call on the
# TF-IDF matrix. Calling cosine_similarity once per pair on rows of the
# numpy.matrix `dense` is slower and is rejected by recent scikit-learn
# releases, which no longer accept numpy.matrix input.
pairwise = cosine_similarity(model)
# Keep the upper triangle including the diagonal, in row-major order:
# (0, 0), (0, 1), ..., (1, 1), ... -- the same order that
# combinations_with_replacement(range(n), 2) produces.
upper_rows, upper_cols = np.triu_indices(pairwise.shape[0])
similarity = pairwise[upper_rows, upper_cols].tolist()

In [438]:
# Turn the scores into a NumPy array rounded to two decimal places.
similarity = np.round(np.array(similarity), 2)

In [439]:
# Label each score with the (i, j) pair of document indices it belongs to.
# The document count comes from the fitted matrix rather than a hard-coded 6,
# so the labels stay aligned with the scores when the corpus size changes.
n_docs = model.shape[0]
doc_pairs = combinations_with_replacement(range(n_docs), 2)
similarity_of_pdfs = list(zip(doc_pairs, similarity))

In [440]:
similarity_of_pdfs

[((0, 0), 1.0),
 ((0, 1), 0.68),
 ((0, 2), 0.55),
 ((0, 3), 0.55),
 ((0, 4), 0.6),
 ((0, 5), 0.69),
 ((1, 1), 1.0),
 ((1, 2), 0.6),
 ((1, 3), 0.43),
 ((1, 4), 0.55),
 ((1, 5), 0.62),
 ((2, 2), 1.0),
 ((2, 3), 0.37),
 ((2, 4), 0.46),
 ((2, 5), 0.55),
 ((3, 3), 1.0),
 ((3, 4), 0.5),
 ((3, 5), 0.53),
 ((4, 4), 1.0),
 ((4, 5), 0.54),
 ((5, 5), 1.0)]