In [4]:
from sentence_transformers import SentenceTransformer


In [2]:
doc = """The popularity of social media, online forums, and review websites has significantly contributed to the 
generation of a vast amount of textual data. Understanding and analyzing this data is crucial for multiple 
applications such as sentiment analysis, chatbot development, content moderation, and opinion mining. 
Among various complex tasks associated with text analytics, detecting sarcasm remains an arduous 
undertaking due to its ironic nature where words portray the opposite intended meaning. One often utilizes 
sarcasm as a means to express mirth, censure or disdain. A precise detection of sarcastic intent in written 
communication is pivotal for an accurate interpretation of the conveyed message.
Detecting sarcasm in text is a complex endeavor, as it hinges on many contextual factors and subtle 
linguistic clues that automated programs have difficulty interpreting. Moreover, sarcasm can exhibit marked 
variations across cultural and linguistic groups, posing additional challenges for modern natural language 
processing (NLP) tools. As large-scale textual data analysis attains greater prominence across various 
domains, developing accurate and effective models of sarcasm detection has become an increasingly 
pressing need"""

In [12]:
document = doc.split('.')
document

['The popularity of social media, online forums, and review websites has significantly contributed to the \ngeneration of a vast amount of textual data',
 ' Understanding and analyzing this data is crucial for multiple \napplications such as sentiment analysis, chatbot development, content moderation, and opinion mining',
 ' \nAmong various complex tasks associated with text analytics, detecting sarcasm remains an arduous \nundertaking due to its ironic nature where words portray the opposite intended meaning',
 ' One often utilizes \nsarcasm as a means to express mirth, censure or disdain',
 ' A precise detection of sarcastic intent in written \ncommunication is pivotal for an accurate interpretation of the conveyed message',
 '\nDetecting sarcasm in text is a complex endeavor, as it hinges on many contextual factors and subtle \nlinguistic clues that automated programs have difficulty interpreting',
 ' Moreover, sarcasm can exhibit marked \nvariations across cultural and linguistic g

In [13]:
model = SentenceTransformer('all-mpnet-base-v2',device='cuda')

In [14]:
embeddings = model.encode(document)


In [15]:
print("Max Sequence Length:", model.max_seq_length)


Max Sequence Length: 384


In [16]:
embeddings.shape

(8, 768)

In [17]:
print(embeddings)

[[ 0.05302034  0.05951107 -0.03653507 ... -0.01434155 -0.03615366
  -0.00701014]
 [ 0.07619365 -0.00084576 -0.04186951 ... -0.00203631 -0.00712318
  -0.00228361]
 [ 0.07875723 -0.00470723 -0.06173111 ...  0.02767393  0.00067155
  -0.00589097]
 ...
 [ 0.05349918 -0.0090595  -0.05768472 ...  0.03465624  0.00537337
  -0.01348786]
 [ 0.05503198  0.02232661 -0.05980028 ...  0.02945515  0.01413232
  -0.04283253]
 [ 0.07154523  0.0071794  -0.05959034 ...  0.02161732 -0.02539712
  -0.01456877]]


In [18]:
import pinecone

index_name = 'semantic-search-openai'

# initialize connection to pinecone (get API key at app.pinecone.io)
pinecone.init(
    api_key="eeeb80d1-8f75-4ea0-b0ab-b70330f96db3",
    environment="us-east1-gcp"  # find next to api key in console
)
# check if 'openai' index already exists (only create index if not)
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=len(embeddings[0]))
# connect to index
index = pinecone.Index(index_name)

In [25]:
from tqdm.auto import tqdm
model = SentenceTransformer('all-mpnet-base-v2',device='cuda')

count = 0  # we'll use the count to create unique IDs
batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(document), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(document))
    # get batch of lines and IDs
    lines_batch = document[i: i+batch_size]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings
    embeds =model.encode(lines_batch)
    # prep metadata and upsert batch
    meta = [{'text': line} for line in lines_batch]
    to_upsert = zip(ids_batch, embeds.tolist(), meta)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))

  0%|          | 0/1 [00:00<?, ?it/s]

In [28]:
query = "sarcasam detection"

In [29]:
query = model.encode(query)
res = index.query([query.tolist()], top_k=5, include_metadata=True)
res

{'matches': [{'id': '4',
              'metadata': {'text': ' A precise detection of sarcastic intent '
                                   'in written \n'
                                   'communication is pivotal for an accurate '
                                   'interpretation of the conveyed message'},
              'score': 0.18849954,
              'values': []},
             {'id': '5',
              'metadata': {'text': '\n'
                                   'Detecting sarcasm in text is a complex '
                                   'endeavor, as it hinges on many contextual '
                                   'factors and subtle \n'
                                   'linguistic clues that automated programs '
                                   'have difficulty interpreting'},
              'score': 0.145075634,
              'values': []},
             {'id': '2',
              'metadata': {'text': ' \n'
                                   'Among various complex tasks a

In [32]:

import os
import PyPDF2
from docx import Document

def extract_data(file_path):
    file_extension = os.path.splitext(file_path)[1]
    print(file_extension)

    if file_extension == ".txt":
        with open(file_path, "r") as file:
            content = file.readlines()
    elif file_extension == ".pdf":
        pdf_reader = PyPDF2.PdfFileReader(file_path)
        content = []
        for page_num in range(pdf_reader.numPages):
            page = pdf_reader.getPage(page_num)
            text = page.extractText()
            lines = text.split('\n')
            content.extend(lines)
    elif file_extension == ".docx":
        doc = Document(file_path)
        content = [paragraph.text for paragraph in doc.paragraphs]
    else:
        content = []
        print("Unsupported file format.")

    print(content)

if __name__ == "__main__":
    file_path = input("Research paper full with lag")
    extract_data(file_path)


Unsupported file format.
[]


In [40]:
for i  in res['matches']:
    print(i["metadata"]['text'])

 A precise detection of sarcastic intent in written 
communication is pivotal for an accurate interpretation of the conveyed message

Detecting sarcasm in text is a complex endeavor, as it hinges on many contextual factors and subtle 
linguistic clues that automated programs have difficulty interpreting
 
Among various complex tasks associated with text analytics, detecting sarcasm remains an arduous 
undertaking due to its ironic nature where words portray the opposite intended meaning
 Understanding and analyzing this data is crucial for multiple 
applications such as sentiment analysis, chatbot development, content moderation, and opinion mining
 As large-scale textual data analysis attains greater prominence across various 
domains, developing accurate and effective models of sarcasm detection has become an increasingly 
pressing need
