In [1]:
import os
from typing import List, Optional, Tuple

import cohere
import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
"""
    Model choices are:-
    1. large - length of embeddings per token is 4096
    2. small - length of embeddings per token is 1024
    3. multilingual-22-12
    """
# The API key is read from the environment so that no credential is stored in
# the notebook. Set COHERE_API_KEY before running this cell; a missing variable
# raises KeyError here rather than failing later inside the client.
co = cohere.Client(os.environ["COHERE_API_KEY"])

# Smoke test: embed a single short text with the 'small' model.
response = co.embed(
  model='small',
  texts=["Milkshake"])
# print('Embeddings: {}'.format(response.embeddings))

In [3]:
# print(len(response.embeddings[0]))

In [4]:
def get_cos_sim(text: str) -> List[Tuple[str, float]]:
    """Placeholder for the corpus similarity lookup.

    This stub ignores ``text`` and always returns the same three
    (document name, score) pairs, already ordered from highest to lowest score.

    Args:
        text (str): Input text (currently unused).

    Returns:
        List[Tuple[str, float]]: Fixed sample documents with their scores, in descending score order.
    """
    names = ("Patent_A", "Patent_B", "Patent_C")
    scores = (0.8, 0.7, 0.6)
    return list(zip(names, scores))


def get_patent_desc(text: str) -> str:
    """Placeholder for patent description generation.

    This stub ignores ``text`` and always returns the same fixed sentence.

    Args:
        text (str): Input text (currently unused).

    Returns:
        str: A constant placeholder description.
    """
    return "Description of new patent."

In [5]:
# datafile = pd.read_csv("sample.tsv",sep="\t")
# corpus_set = datafile.iloc[:1]
# test_set = datafile.iloc[1:]
# corpus_abstracts = corpus_set["ABSTRACT"]
# test_abstract = test_set["ABSTRACT"]

# corpus_text = list(corpus_abstracts)
# test_text = list(test_abstract)
# # abstracts = datafile["ABSTRACT"]
# # abstract_list = list(abstracts)
# # print(abstract_list)
# # datafile.head(5)

# print(corpus_abstracts[0])
# print(test_abstract[0])

In [6]:
# test_results = co.embed(texts=test_text,model="small")
# corpus_results = co.embed(texts=corpus_text,model="small")


In [7]:
# test_embeddings = test_results.embeddings
# corpus_embeddings = corpus_results.embeddings



In [8]:
# print(test_embeddings)

In [9]:
# test_embeddings_arr = np.array(test_embeddings)
# corpus_embeddings_arr = np.array(corpus_embeddings)
# print(test_embeddings_arr.shape)
# print(corpus_embeddings_arr.shape)


In [10]:
# reshaped_test_embeddings = np.reshape(test_embeddings,(1024,-1))
# reshaped_test_embeddings.shape
# reshaped_corpus_embeddings = np.reshape(corpus_embeddings,(1024,-1))

In [11]:
# cosine_similarities = []
# for each in corpus_embeddings_arr:
#     cos_sim = dot(test_embeddings_arr, each)/(norm(test_embeddings_arr)*norm(each))
#     cosine_similarities.append(cos_sim)

In [12]:
# print(cosine_similarities)

In [13]:
# # [array([0.38826797]), array([0.6634463]), array([0.4280255]), array([0.31543467]), array([0.40113314]), array([0.22327371]), array([0.35148772]), array([0.48820173]), array([0.69228912]), array([0.30005143]), array([0.18081221]), array([0.69498284]), array([0.52173298]), array([0.40174261]), array([0.6032165]), array([0.46532465]), array([0.55992382]), array([0.48836531]), array([0.60111407]), array([0.18527497]), array([0.42137111])]
# data = pd.read_csv("sample.tsv",sep="\t")
# cor = data.to_dict(orient="records")
# print(cor[1][])

# # for each in cor.items():
# #     print(each)

In [61]:
class PaperParser():
    """Corpus of document records with Cohere embeddings and TF-IDF helpers.

    Typical use: ``create_corpus`` -> ``get_corpus_embeddings`` -> ``get_cos_sim``
    and/or ``calc_tf_idf``. The methods below read the ``TITLE`` and ``ABSTRACT``
    fields of every record, so the data file must provide those columns.
    """

    def __init__(self, corpus_text=None, model="small", api_key: Optional[str] = None) -> None:
        """Create the Cohere client used for embedding.

        Model choices are:-
        1. large - length of embeddings per token is 4096
        2. small - length of embeddings per token is 1024
        3. multilingual-22-12

        Args:
            corpus_text: Unused; kept so existing callers keep working.
            model (str, optional): Model name passed to the embed call. Defaults to "small".
            api_key (str, optional): Cohere API key. When omitted, the
                ``COHERE_API_KEY`` environment variable is used.

        Raises:
            ValueError: If no key is passed and ``COHERE_API_KEY`` is not set.
        """
        self.model = model
        self.models = ["small", "large", "multilingual-22-12"]
        # Credentials must not be stored in the notebook: take the key from the
        # caller or from the environment.
        key = api_key or os.environ.get("COHERE_API_KEY")
        if not key:
            raise ValueError(
                "No Cohere API key: pass api_key=... or set the COHERE_API_KEY environment variable."
            )
        self.client = cohere.Client(key)
        self.corpus = None

    def _require_corpus(self) -> None:
        """Raise a clear error when a method is used before ``create_corpus``."""
        if self.corpus is None:
            raise RuntimeError("Corpus is not loaded; call create_corpus() first.")

    def create_corpus(self, datafile, file_type="csv"):
        """Load a delimited file into ``self.corpus`` as ``{row_number: record_dict}``.

        Args:
            datafile: Path or buffer accepted by ``pandas.read_csv``.
            file_type (str, optional): "csv" reads comma-separated data; any other
                value reads tab-separated data. Defaults to "csv".

        Note:
            The previous implementation compared the builtin ``type`` with "csv",
            so ``file_type`` was ignored and every file was read as tab-separated.
            Callers loading TSV files must pass ``file_type="tsv"`` explicitly.
        """
        if file_type != "csv":
            data = pd.read_csv(datafile, sep="\t")
        else:
            data = pd.read_csv(datafile)
        # Keys are consecutive row numbers starting at 0.
        self.corpus = dict(enumerate(data.to_dict(orient="records")))

    def get_corpus_embeddings(self):
        """Embed every record's ``ABSTRACT`` and store it under ``EMBEDDING``.

        Raises:
            RuntimeError: If ``create_corpus`` has not been called.
        """
        self._require_corpus()
        keys = list(self.corpus)
        corpus_texts = [self.corpus[key]["ABSTRACT"] for key in keys]
        if not corpus_texts:
            # Nothing to embed; skip the remote call.
            return
        # NOTE(review): the original had a no-op branch for more than 16 texts,
        # presumably a placeholder for batching - confirm the embed endpoint's
        # per-request limit before sending large corpora in one call.
        corpus_embeddings = self.client.embed(texts=corpus_texts, model=self.model).embeddings
        # Pair by corpus key rather than by assuming keys are 0..n-1.
        for key, embedding in zip(keys, corpus_embeddings):
            self.corpus[key]["EMBEDDING"] = embedding

    def get_cos_sim(self, text: str) -> List[Tuple[str, float]]:
        """Return cosine similarity scores (sorted in descending order) of corpus documents with input text.

        Args:
            text (str): Input text to be compared against corpus.

        Returns:
            List[Tuple[str, float]]: Corpus document titles and cosine similarity scores, sorted in descending order.

        Raises:
            RuntimeError: If ``create_corpus`` has not been called.
            KeyError: If a record has no ``EMBEDDING`` (``get_corpus_embeddings`` not run).
        """
        self._require_corpus()
        # embed() returns one embedding per input text; take the single vector
        # so the dot product below is a scalar rather than a 1-element array.
        response = self.client.embed(texts=[text], model=self.model)
        query = np.asarray(response.embeddings[0], dtype=float)
        query_norm = norm(query)

        res = []
        for record in self.corpus.values():
            doc = np.asarray(record["EMBEDDING"], dtype=float)
            denominator = query_norm * norm(doc)
            # A zero vector has no direction; score it 0.0 instead of NaN so
            # the sort below stays well-defined.
            score = float(dot(query, doc) / denominator) if denominator else 0.0
            res.append((record["TITLE"], score))
        res.sort(key=lambda a: a[1], reverse=True)

        return res

    def calc_tf_idf(self, result="df", top_n=10):
        """
        Calculates the TF-IDF of the ABSTRACT field in the corpus and returns either
        the whole matrix or the top N results

        Args:
            result (str, optional): flag to toggle between returning the whole matrix or just the top n results.
                                    Set to 'df' to return whole matrix, set to 'top' to return the top n results.
                                    Defaults to "df".
            top_n (int, optional):  number of terms kept per document when result is 'top'. Defaults to 10.

        Returns:
            pandas.DataFrame: One row per corpus document. For 'df', one column per
            vocabulary term. For 'top', columns are the union of every document's
            top terms and each row is NaN outside its own top_n terms.

        Raises:
            RuntimeError: If ``create_corpus`` has not been called.
        """
        self._require_corpus()
        tfidf_texts = [record["ABSTRACT"] for record in self.corpus.values()]
        vectorizer = TfidfVectorizer()
        tf_idf = vectorizer.fit_transform(tfidf_texts)
        dense = tf_idf.todense()
        output_features = vectorizer.get_feature_names_out()
        df = pd.DataFrame(dense, columns=output_features)

        if result == "top":
            # iterrows() yields (row label, row Series); only the Series is needed.
            top_rows = [
                row.sort_values(ascending=False)[:top_n]
                for _, row in df.iterrows()
            ]
            return pd.DataFrame(top_rows)
        return df

In [57]:
# Load the held-out test file and keep its abstracts both as a Series and as a
# plain list, then print the two container types as a sanity check.
test_datafile = pd.read_csv("test.tsv", sep="\t")
test_abstract = test_datafile.loc[:, "ABSTRACT"]
test_text = [*test_abstract]
print(type(test_abstract), type(test_text), sep="\n")

<class 'pandas.core.series.Series'>
<class 'list'>


In [62]:
# End-to-end run: load the corpus, embed its abstracts, then score one query text.
# NOTE(review): this rebinds `test_text` from the list built in an earlier cell
# to a single string - confirm the earlier value is no longer needed.
test_text = "Hello world.  This is Chin Hee speaking. "
parser = PaperParser()
# The corpus file is tab-separated, so file_type is passed explicitly.
parser.create_corpus("corpus.tsv",file_type="tsv")
# Adds an "EMBEDDING" entry to every corpus record (one remote embed call).
parser.get_corpus_embeddings()

# (title, cosine similarity) pairs for the query, highest score first.
cos_sim_list = parser.get_cos_sim(text=test_text)
# parser.corpus

In [63]:
# Top TF-IDF terms per corpus document (default top_n of calc_tf_idf); each row
# is NaN outside that document's own top terms, as the output below shows.
ress = parser.calc_tf_idf(result="top")
ress

Unnamed: 0,shaft,flexibility,particular,those,straight,however,limited,general,compactness,facilitate,...,reciprocating,assembly,motion,first,or,soil,contact,friction,substantial,would
0,0.366323,0.20798,0.20798,0.20798,0.20798,0.20798,0.20798,0.20798,0.20798,0.20798,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# Show the (title, score) pairs computed for the query text, highest score first.
print(cos_sim_list)

[('Multi-purpose gardening tool', 0.2187893977647217), ('一种园林工具的置物架', 0.2166805484796077), ('一种家用园艺工具用钢55MnB及其制备方法', 0.2012350658950696), ('园林工具刹车装置', 0.16959941680151241), ('Gardening tool', 0.13177415459425804), ('Gartengerät mit einem Stahlstiftabschnitt', 0.07983132004969051), ('Steel 55MnB for domestic gardening tools and preparation method thereof', 0.046619816534804376), ('Gardening tool with multiple interchangeable tool heads', 0.02671358093732649), ('Gardening tool', 0.011393098065587429), ('Motorized gardening tool', 0.004850248179517823), ('Portable motorised device for gardening tools', -0.00021554123597905016), ('Handle of gardening tool', -0.0033261854387948526), ('Farming and gardening tools with two sets of tines', -0.007649493867684067), ('Handle for household and gardening tools', -0.01053010994932666), ('Placing rack for gardening tools', -0.015269591915010683), ('Electrical gardening tool with a replaceable working piece', -0.020002713050965615), ('Garden tool', -0

In [18]:
# Sanity check of the ordering rule used for similarity results:
# sort (name, score) pairs by score, highest first.
res = sorted(
    [("Patent_B", 0.7), ("Patent_C", 0.6), ("Patent_A", 0.8)],
    key=lambda pair: pair[1],
    reverse=True,
)
res

[('Patent_A', 0.8), ('Patent_B', 0.7), ('Patent_C', 0.6)]

In [19]:
# Ask the Cohere client `co` (created in an earlier cell) to draft patent text.
# NOTE(review): the prompt asks for 500 words but max_tokens=40 caps the
# completion - confirm the intended output length.
response = co.generate(  
    model='xlarge',  
    prompt = "Write me a 500 word patent application for a gardening tool",  
    max_tokens=40,  
    temperature=0.2,  
    stop_sequences=["--"])

# NOTE(review): the name `startup_idea` does not match the patent prompt above;
# it holds response.generations - confirm before renaming.
startup_idea = response.generations

In [20]:
# Collect the abstract of every corpus record, in corpus order, as TF-IDF input.
tfidf_texts = [record["ABSTRACT"] for record in parser.corpus.values()]

Calculate TF-IDF

In [21]:
def calc_tf_idf(texts=None, top_n=5):
    """Print the highest-weighted TF-IDF terms of each document.

    Args:
        texts (list of str, optional): Documents to analyse. When omitted, the
            module-level ``tfidf_texts`` list is used, as in the original cell.
        top_n (int, optional): Number of terms printed per document. Defaults to 5.

    Returns:
        None. One sorted Series of term weights is printed per document.
    """
    if texts is None:
        texts = tfidf_texts
    vectorizer = TfidfVectorizer()
    tf_idf = vectorizer.fit_transform(texts)
    dense = tf_idf.todense()
    output_features = vectorizer.get_feature_names_out()
    df = pd.DataFrame(dense, columns=output_features)
    # iterrows() yields (row label, row Series); only the Series is needed.
    for _, row in df.iterrows():
        print(row.sort_values(ascending=False)[0:top_n])

In [22]:
# Fit a TF-IDF model on the corpus abstracts collected in `tfidf_texts`.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(tfidf_texts)
# Dense copy of the fitted matrix; a later cell wraps it in a DataFrame.
dense = X.todense()
# NOTE(review): dense_list is not referenced by any later cell visible here.
dense_list = list(dense)
dense


matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
# Vocabulary terms of the fitted vectorizer; used as DataFrame column labels
# in the next cell.
output_features = vectorizer.get_feature_names_out()
# print(output_features)
# print(output_features[312])

In [24]:
# One row per corpus document, one column per vocabulary term; preview 5 rows.
df = pd.DataFrame(dense, columns=output_features)
df.head(5)

Unnamed: 0,00,0005,0030,008,010,025,060,10,14,15,...,设计合理,转炉吹炼,转轮上形成有至少一安装室,远离刹车环的刹车片,连铸,通过设置置物箱体,通过设置荧光材料层,避免了中碳钢热处理后硬度不均匀和硬度偏低问题,采用中碳钢,餐巾纸放置箱的前侧上设有矩形出纸口
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.223607,0.0,0.0,0.0,0.0,0.223607,0.223607,0.0,0.0,0.223607
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Print the five highest-weighted TF-IDF terms of every document.
# iterrows() yields (row label, row Series); the Series holds the term weights.
for doc_id, term_weights in df.iterrows():
    ranked = term_weights.sort_values(ascending=False)
    print(ranked.iloc[:5])

shaft          0.366323
flexibility    0.207980
particular     0.207980
those          0.207980
straight       0.207980
Name: 0, dtype: float64
head          0.407534
the           0.385429
connecting    0.322311
of            0.235540
tool          0.233816
Name: 1, dtype: float64
placing     0.539749
box         0.359833
the         0.283055
arranged    0.224896
napkin      0.224896
Name: 2, dtype: float64
餐巾纸放置箱的前侧上设有矩形出纸口    0.223607
置物箱体的右侧设有餐巾纸放置箱      0.223607
便于工作人员快速寻找物品         0.223607
包括矩形连接架              0.223607
同时又可以减少空间的占用         0.223607
Name: 3, dtype: float64
the        0.380980
cutting    0.316670
edges      0.237502
blade      0.231275
device     0.189054
Name: 4, dtype: float64
patent       0.363033
patents      0.363033
us6612379    0.363033
google       0.363033
com          0.363033
Name: 5, dtype: float64
包括安装有电机和工作主轴的支座    0.242536
而且也提高了刹车效果         0.242536
不仅方便了结构设计          0.242536
刹车片与刹车环接触          0.242536
刹车片与刹车环相分离         0.242536
Name: 6, dtype: 

In [None]:
# Highest single TF-IDF weight in each row of df (axis=1 reduces across the term columns).
print(df.max(axis=1))

In [None]:
# Project the held-out test abstracts onto the vocabulary fitted on the corpus.
Y = vectorizer.transform(test_abstract)
# The original `Y.get_feat` is a truncated attribute name and raises
# AttributeError. Feature names live on the fitted vectorizer, so pair them
# with Y's columns for display.
# NOTE(review): this is a guess at the intended output - confirm.
pd.DataFrame(Y.todense(), columns=vectorizer.get_feature_names_out())

In [None]:
from transformers import pipeline

# Loaded text-generation pipelines keyed by (model name, device), so the model
# is loaded once per kernel session instead of on every call.
_GENERATORS = {}


def _get_generator(model_name: str, device: int):
    """Return a cached text-generation pipeline for ``model_name`` on ``device``."""
    cache_key = (model_name, device)
    if cache_key not in _GENERATORS:
        # `device` is an option of pipeline construction, not of generation.
        _GENERATORS[cache_key] = pipeline('text-generation', model=model_name, device=device)
    return _GENERATORS[cache_key]


def get_patent_desc(text: str, device: int = 0) -> str:
    """Generate new patent description.

    Args:
        text (str): Input text used as the generation prompt.
        device (int, optional): Device index handed to the pipeline. Defaults to 0,
            the value the original code used.
            NOTE(review): 0 presumably selects the first GPU - confirm a GPU is
            available where this runs.

    Returns:
        str: New patent description (the ``generated_text`` of the first result).
    """
    generator = _get_generator('EleutherAI/gpt-neo-1.3B', device)
    res = generator(text, max_length=300, do_sample=True, temperature=0.9)

    return res[0]["generated_text"]

In [None]:
# Seed text handed to get_patent_desc in the next cell.
prompt = "A multipurpose gardening tool made of a lightweight composite material that is corrosion resistant with multiple built in redundancies for ease of use and maintenance in a multitude of environmental conditions"

In [None]:
# Generate a patent description from the prompt defined in the previous cell.
result = get_patent_desc(prompt)

In [None]:
# Display the generated description.
result