In [63]:
import os
from typing import List, Tuple

import cohere
import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm



In [64]:
"""
    Model choices are:-
    1. large - length of embeddings per token is 4096
    2. small - length of embeddings per token is 1024
    3. multilingual-22-12
    """
co = cohere.Client("7lUDtMMSa1bVCEVIKEOms0jPImRnselfUQucOH5v")
    
response = co.embed(
  model='small',
  texts=["Milkshake"])
# print('Embeddings: {}'.format(response.embeddings))

In [None]:
# print(len(response.embeddings[0]))

In [None]:
def get_cos_sim(text: str) -> List[Tuple[str, float]]:
    """Placeholder ranking of corpus documents against an input text.

    This stub ignores ``text`` and always returns the same three
    (document name, score) pairs, already ordered from highest to lowest score.

    Args:
        text (str): Input text to be compared against corpus (currently unused).

    Returns:
        List[Tuple[str, float]]: Fixed document names paired with fixed scores,
        in descending score order.
    """
    placeholder_names = ("Patent_A", "Patent_B", "Patent_C")
    placeholder_scores = (0.8, 0.7, 0.6)
    return list(zip(placeholder_names, placeholder_scores))


def get_patent_desc(text: str) -> str:
    """Placeholder generator for a new patent description.

    This stub ignores ``text`` and always returns the same fixed sentence.

    Args:
        text (str): Input text (currently unused).

    Returns:
        str: The fixed placeholder description.
    """
    return "Description of new patent."

In [None]:
# Load the tab-separated sample file and split it by row position.
datafile = pd.read_csv("sample.tsv", sep="\t")
# NOTE(review): as written, the first row is the "corpus" and every remaining
# row is a test document -- confirm this split is not inverted.
corpus_set = datafile.iloc[:1]
test_set = datafile.iloc[1:]
corpus_abstracts = corpus_set["ABSTRACT"]
test_abstract = test_set["ABSTRACT"]

corpus_text = list(corpus_abstracts)
test_text = list(test_abstract)

# Use positional access for the preview: `test_set` keeps the row labels of
# `datafile` (starting at 1), so the label lookup `test_abstract[0]` raises
# KeyError. `.iloc[0]` always means "first row of this slice".
print(corpus_abstracts.iloc[0])
print(test_abstract.iloc[0])

In [None]:
# Request "small"-model embeddings: first for the test texts, then for the corpus texts.
test_results = co.embed(model="small", texts=test_text)
corpus_results = co.embed(model="small", texts=corpus_text)


In [None]:
# Pull the raw embedding lists out of the two API responses.
test_embeddings, corpus_embeddings = (
    test_results.embeddings,
    corpus_results.embeddings,
)



In [None]:
# Preview only: show the first 5 components of (at most) the first 3 test
# embeddings instead of dumping every vector in full into the cell output.
print([vector[:5] for vector in test_embeddings[:3]])

In [None]:
# Turn both embedding lists into NumPy arrays, then report each array's shape
# (test first, corpus second).
test_embeddings_arr, corpus_embeddings_arr = (
    np.array(vectors) for vectors in (test_embeddings, corpus_embeddings)
)
for matrix in (test_embeddings_arr, corpus_embeddings_arr):
    print(matrix.shape)


In [None]:
# reshaped_test_embeddings = np.reshape(test_embeddings,(1024,-1))
# reshaped_test_embeddings.shape
# reshaped_corpus_embeddings = np.reshape(corpus_embeddings,(1024,-1))

In [None]:
# Cosine similarity of every test embedding against each corpus embedding.
#
# The norm of the test side must be taken per row (axis=1). Calling norm() on
# the whole 2-D matrix returns a single matrix-wide norm, which only equals the
# vector norm when there is exactly one test row and otherwise skews every score.
test_norms = norm(test_embeddings_arr, axis=1)

# One entry per corpus embedding; each entry is an array holding one score per
# test embedding.
cosine_similarities = [
    dot(test_embeddings_arr, corpus_vector) / (test_norms * norm(corpus_vector))
    for corpus_vector in corpus_embeddings_arr
]

In [None]:
# Show the scores collected in the previous cell (one entry per corpus embedding).
print(cosine_similarities)

In [None]:
# [array([0.38826797]), array([0.6634463]), array([0.4280255]), array([0.31543467]), array([0.40113314]), array([0.22327371]), array([0.35148772]), array([0.48820173]), array([0.69228912]), array([0.30005143]), array([0.18081221]), array([0.69498284]), array([0.52173298]), array([0.40174261]), array([0.6032165]), array([0.46532465]), array([0.55992382]), array([0.48836531]), array([0.60111407]), array([0.18527497]), array([0.42137111])]
# Load the sample file as a list of per-row dicts (column name -> value).
data = pd.read_csv("sample.tsv", sep="\t")
cor = data.to_dict(orient="records")
# The original `cor[1][]` had an empty subscript, which is a syntax error.
# Print the whole second record so every column is visible.
# NOTE(review): if a single field was intended, index it by column name,
# e.g. cor[1]["ABSTRACT"].
print(cor[1])

# for each in cor.items():
#     print(each)

In [59]:
class PaperParser():
    """Rank corpus records against a query text using Cohere embeddings.

    Typical usage:
        1. ``create_corpus(path)`` loads the records from a delimited file.
        2. ``get_corpus_embeddings()`` stores an ``"EMBEDDING"`` on each record.
        3. ``get_cos_sim(text)`` returns ``(TITLE, score)`` pairs, best match first.

    The corpus file must provide at least the ``"ABSTRACT"`` and ``"TITLE"``
    columns, which are the only record fields this class reads.
    """

    def __init__(self, corpus_text=None, model="small", api_key=None) -> None:
        """Create the Cohere client used for all embedding requests.

        Args:
            corpus_text: Unused; kept so existing callers keep working.
            model (str): Name of the Cohere embedding model to request.
            api_key (str | None): Cohere API key. When omitted, the key is read
                from the ``CO_API_KEY`` environment variable.

        Raises:
            KeyError: If ``api_key`` is not given and ``CO_API_KEY`` is not set.
        """
        self.model = model
        self.models = ["small", "large", "multilingual-22-12"]
        # Never keep the key in the notebook: take it from the caller or the environment.
        if api_key is None:
            api_key = os.environ["CO_API_KEY"]
        self.client = cohere.Client(api_key)
        self.corpus = None

    def create_corpus(self, datafile, file_type="csv"):
        """Load the corpus records from a delimited text file.

        Args:
            datafile: Path or file-like object accepted by ``pandas.read_csv``.
            file_type (str): ``"csv"`` for comma-separated input; any other
                value (e.g. ``"tsv"``) is read as tab-separated.

        Side effects:
            Sets ``self.corpus`` to a dict mapping the row position (0, 1, ...)
            to that row's ``{column name: value}`` dict.
        """
        # The original compared the builtin `type` to "csv", which is always
        # unequal, so every file was parsed as tab-separated regardless of
        # `file_type`.
        separator = "," if file_type == "csv" else "\t"
        data = pd.read_csv(datafile, sep=separator)
        self.corpus = dict(enumerate(data.to_dict(orient="records")))

    def get_corpus_embeddings(self, batch_size=96):
        """Embed every record's abstract and store it on the record.

        Args:
            batch_size (int): Maximum number of texts sent per embed request.
                NOTE(review): 96 is believed to be the per-request text limit of
                the Cohere embed endpoint -- confirm against the current API docs.

        Raises:
            RuntimeError: If ``create_corpus`` has not been called yet.
            ValueError: If ``batch_size`` is smaller than 1.

        Side effects:
            Adds an ``"EMBEDDING"`` entry to every record in ``self.corpus``.
        """
        if self.corpus is None:
            raise RuntimeError("create_corpus() must be called before get_corpus_embeddings().")
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1.")

        records = list(self.corpus.values())
        # Send the abstracts in chunks so a large corpus does not go out as one
        # oversized request; an empty corpus makes no request at all.
        for start in range(0, len(records), batch_size):
            batch = records[start:start + batch_size]
            abstracts = [record["ABSTRACT"] for record in batch]
            embeddings = self.client.embed(texts=abstracts, model=self.model).embeddings
            # Pair each embedding with the record it was computed from instead
            # of assuming the corpus keys line up with enumerate() positions.
            for record, embedding in zip(batch, embeddings):
                record["EMBEDDING"] = embedding

    def get_cos_sim(self, text: str) -> List[Tuple[str, float]]:
        """Return cosine similarity scores (sorted in descending order) of corpus documents with input text.

        Args:
            text (str): Input text to be compared against corpus. For backward
                compatibility, a sequence containing exactly one string is also
                accepted.

        Returns:
            List[Tuple[str, float]]: Corpus document titles and cosine similarity
            scores, sorted in descending order of score.

        Raises:
            RuntimeError: If ``create_corpus`` has not been called yet.
            ValueError: If ``text`` is a sequence that does not contain exactly one item.
            KeyError: If a record has no ``"EMBEDDING"`` (``get_corpus_embeddings``
                was not run) or no ``"TITLE"``.
        """
        if self.corpus is None:
            raise RuntimeError("create_corpus() must be called before get_cos_sim().")

        # The embed call takes a list of texts; wrap a bare string so it is
        # embedded as one document.
        texts = [text] if isinstance(text, str) else list(text)
        if len(texts) != 1:
            raise ValueError("get_cos_sim() expects exactly one input text.")

        # Work with the single query vector (1-D) so that norm() is a vector
        # norm and each similarity is a true scalar.
        query_vector = self.client.embed(texts=texts, model=self.model).embeddings[0]
        query_norm = norm(query_vector)  # loop-invariant, computed once

        res = []
        for record in self.corpus.values():
            doc_vector = record["EMBEDDING"]
            cos_sim = dot(query_vector, doc_vector) / (query_norm * norm(doc_vector))
            res.append((record["TITLE"], float(cos_sim)))
        res.sort(key=lambda pair: pair[1], reverse=True)

        return res

In [60]:
# Query side: read the test file and collect its abstracts into a plain list.
test_datafile = pd.read_csv("test.tsv", sep="\t")
test_abstract = test_datafile["ABSTRACT"]
test_text = [abstract for abstract in test_abstract]

# Corpus side: load the corpus records and attach an embedding to each one.
parser = PaperParser()
parser.create_corpus("corpus.tsv", file_type="tsv")
parser.get_corpus_embeddings()

# Rank the corpus against the test abstract(s); the whole list is passed through as-is.
cos_sim_list = parser.get_cos_sim(text=test_text)
# parser.corpus

In [61]:
# Show the (title, score) pairs returned by parser.get_cos_sim, highest score first.
print(cos_sim_list)

[('Farming and gardening tools with two sets of tines', 0.6949828408172631), ('Handle for household and gardening tools', 0.6922891237944154), ('Gardening tool with multiple interchangeable tool heads', 0.6634463023971804), ('Electrical gardening tool with a replaceable working piece', 0.6032164990479286), ('Gardening tool assembled of pointed metal element and wooden stick, comprising fixing unit with inner and outer sleeve', 0.6011140747230448), ('Handle of gardening tool', 0.559923822218539), ('Portable motorised device for gardening tools', 0.521732982296373), ('Ergonomic garden tools', 0.48836531306215786), ('Short-handled, ergonomic garden tools', 0.4882017258395994), ('Gardening tool', 0.46532464809791063), ('Placing rack for gardening tools', 0.42802549944776136), ('Motorized gardening tool', 0.4213711144810016), ('Garden tool', 0.4017426116582123), ('Multifunctional Gardening Tool', 0.4011331428243973), ('Gardening tool', 0.3882679740254629), ('园林工具刹车装置', 0.351487717413807), (

In [None]:
# Sanity check of the ordering used for results: sort (name, score) pairs by
# score, highest first, then display the sorted list.
res = sorted(
    [("Patent_B", 0.7), ("Patent_C", 0.6), ("Patent_A", 0.8)],
    key=lambda pair: pair[1],
    reverse=True,
)
res

In [76]:
# Settings for the text-generation request, gathered in one place.
# NOTE(review): max_tokens=40 cannot hold the 500 words the prompt asks for, so
# the generated text will be cut short -- confirm whether this cap is intentional.
generation_settings = {
    "model": 'xlarge',
    "prompt": "Write me a 500 word patent application for a gardening tool",
    "max_tokens": 40,
    "temperature": 0.2,
    "stop_sequences": ["--"],
}
response = co.generate(**generation_settings)

# Keep the generations returned in the response for display in the next cell.
startup_idea = response.generations

In [77]:
# Alternative (disabled): print only the text of each generation.
# for idea in startup_idea:
#     print(idea.text)
# Prints the generations object as a whole rather than each generation's text.
print(startup_idea)

[cohere.Generation {
	text: .
I need a patent application for a gardening tool. The patent application should be 500 words.
I will provide the details.
I need a patent application for a gardening tool. The patent
	likelihood: None
	token_likelihoods: None
}]
