In [2]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset

from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [3]:
class Embedding:
    #CLS is a special classification token and the last hidden state of BERT Embedding
    def cls_pooling(self, model_output):
        return model_output.last_hidden_state[:, 0]

    #BERT tokenizer of input text
    def get_embeddings(self, text_list):
        encoded_input = tokenizer(
            text_list, padding=True, truncation=True, return_tensors="pt"
        )
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
        model_output = model(**encoded_input)
        return self.cls_pooling(model_output).cpu().detach().numpy()

In [16]:


class Faiss:
    def __init__(self):
        pass
    
    #convert dataset into embeddings dataset to run FAISS
    def makeEmbeddings(self,dataset,df=None):
        embeddings = []
        count = 0
        for data in dataset:
            print(count)
            embeddings.append(Embedding().get_embeddings(data)[0])
            count += 1
        embeddings_dataset = pd.DataFrame(
            {
                "embeddings":embeddings,
                "values":dataset
            })
        if (df is not None):
            embeddings_dataset = pd.concat([df, embeddings_dataset], axis=1)
        embeddings_dataset = Dataset.from_pandas(embeddings_dataset)
        return embeddings_dataset
    
    # run faiss model on dataset
    def faiss(self,embeddings_dataset):
        embeddings_dataset.add_faiss_index(column="embeddings")
    
    #get query embedding
    def getQueryEmbedding(self, query):
        return Embedding().get_embeddings([query])
        
    #predict 4 nearest neighbors
    def predict(self,query,embeddings_dataset,k=4):
        query_embedding = self.getQueryEmbedding(query)
        scores, samples = embeddings_dataset.get_nearest_examples("embeddings", query_embedding, k=5)
        samples = pd.DataFrame(samples)
        samples["scores"] = scores
        return samples
        

In [66]:
from PyPDF2 import PdfReader

# open the pdf file
reader = PdfReader("combined.pdf")

# get number of pages
num_pages = len(reader.pages)

In [71]:
x = "Copyright 2012 Cengage Learning. All Rights Reserved. May not be copied, scanned, or duplicated, in whole or in part. Due to electronic rights, some third party content may be suppressed from the eBook and/or eChapter(s). Editorial review has deemed that any suppressed content does not materially affect the overall learning experience. Cengage Learning reserves the right to remove additional content at any time if subsequent rights restrictions require it."
dataset = [page.extract_text() for page in reader.pages]
df = pd.DataFrame({"page":list(range(1,194))})


In [72]:
f = Faiss()
embeddings_dataset = f.makeEmbeddings(dataset,df)
embeddings_dataset

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192


Dataset({
    features: ['page', 'embeddings', 'values'],
    num_rows: 193
})

In [74]:
f.faiss(embeddings_dataset)
samples = f.predict("""Classify whether each of the following languages is decidable, recognizable-but-undecidable, or
unrecognizable. Prove your answer. For your proofs, you can assume that the problem ATM =
{⟨M, w⟩ | M is a TM and accepts w} is undecidable.
1. P1 = { ⟨M⟩ | M is a TM and M accepts at least 2 strings }.
2. P2 = { ⟨M⟩ | M is a TM and there is no string w such that M accepts w and w is a palindrome }.""",embeddings_dataset)
samples

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,page,embeddings,values,scores
0,165,"[0.039218176156282425, -0.7150614857673645, -0...",Another Problem for TMs\n21P = { <M> | M is a ...,23.291431
1,144,"[0.031310174614191055, -0.34490180015563965, -...","Recap: Known Facts\n23If A is decidable, then ...",24.443407
2,143,"[0.0638505071401596, -0.6105973124504089, -0.1...","Non- acceptance Problem for TMs\n22ATM = { <M,...",24.575928
3,148,"[0.08690290153026581, -0.5814269781112671, -0....","Halting Problem for TMs\n4HALTTM = { <M,w> | M...",25.426054
4,141,"[0.03824932500720024, -0.2070421278476715, -0....",Problem Classification\n20DECIDABLERECOGNIZABL...,25.609194


In [70]:
samples["values"][0]

'Another Problem for TMs\n21P = { <M> | M is a TM and intersection of L(M) and 0* is empty  } \nGiven a TM, check that it does not accept any string with only 0’sIs this problem:\n decidable, recognizable -but-undecidable, unrecognizable ??\nConsider the complement:~P: Given TM M accepts some string with only 0’s\nClaim: ~P is recognizable but undecidableFollows that P is unrecognizable'

In [32]:
embeddings_dataset = embeddings_dataset.map(lambda x: {**x, 'C': list(range(100, 121))})

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [33]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['embeddings', 'values', 'C'],
    num_rows: 20
})

In [53]:
pdf_files = ["B"+str(x)+".pdf" for x in list(range(1,9))]
pdf_files

['B1.pdf',
 'B2.pdf',
 'B3.pdf',
 'B4.pdf',
 'B5.pdf',
 'B6.pdf',
 'B7.pdf',
 'B8.pdf']

In [65]:
import PyPDF2

# Create a PDF writer object to write the combined PDF
pdf_writer = PyPDF2.PdfWriter()

# Loop through the list of PDF files and add each page to the writer
for pdf_file in pdf_files:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    for page in pdf_reader.pages:
        pdf_writer.add_page(page)

# Create and save the combined PDF
combined_pdf = 'combined.pdf'  # Replace with the desired output file name
with open(combined_pdf, 'wb') as output_pdf:
    pdf_writer.write(output_pdf)

In [59]:
cd ..


/Users/juliasusser/Desktop
