In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# scientific units
#from quantities import units
import utils # all datareading and preprocessing functionality

In [2]:
# Input files and the dataframe columns used throughout the notebook
file1 = './data/abstract_set1.txt'  # pesticides
file2 = './data/abstract_set2.txt'  # cancer-ish
data_selection = 'abstract_clean'
label_selection = 'text_label'

# Stopword list from utils, with a few extra terms passed in via `custom`
custom_stopwords = {'wa', 'use', 'using', 'one', 'two', 'three', 'study'}
stopwords = utils.get_stopwords(custom=custom_stopwords)

# the custom class Tokenizer can also lemmatize, and remove short words, digits and stopwords
tokenizer = utils.Tokenizer(min_length=3, stop_words=stopwords)

In [3]:
# Load both abstract files into a single dataframe: file2 is passed as the
# negatives and file1 as the positives.
# NOTE(review): the displayed rows suggest label 0 -> "control" and the
# positives -> "pesticide" -- confirm in utils.read_abstract_data.
data = utils.read_abstract_data(negatives_path=file2, 
                                positives_path=file1,  
                                text_labels = ["control", "pesticide"])
# Earlier positional call, kept for reference:
#data = utils.read_abstract_data(file1, file2)

# Called for its effect on `data`; the return value is discarded.
# NOTE(review): presumably adds the *_clean columns seen in the output -- confirm in utils.
utils.preprocess_text(data)
data.head()

Unnamed: 0,pmid,title,abstract,label,text_label,title_clean,abstract_clean
0,29981025,Impact of Neoadjuvant Chemotherapy on Breast C...,"BACKGROUND: Breast cancer subtype, as determin...",0,control,impact of neoadjuvant chemotherapy on breast c...,"background: breast cancer subtype, as determin..."
1,29984001,Expert-Performed Endotracheal Intubation-Relat...,The aim of this study was to determine complic...,0,control,expert-performed endotracheal intubation-relat...,the aim of this study was to determine complic...
2,29988545,A case report: Addison disease caused by adren...,We report middle age man with skin hyperpigmen...,0,control,a case report: addison disease caused by adren...,we report middle age man with skin hyperpigmen...
3,29998100,An Unusual Morphological Presentation of Cutan...,Cutaneous squamous cell carcinoma (SCC) exhibi...,0,control,an unusual morphological presentation of cutan...,cutaneous squamous cell carcinoma (scc) exhibi...
4,29999256,Informing Consent: Medical Malpractice and the...,"Since the early 1990s, jurisdictions around th...",0,control,informing consent: medical malpractice and the...,"since the early 1990s, jurisdictions around th..."


In [4]:
# Hold out the last 10 records; they serve to test the model (inspect visually).
# The column list reuses the configured selections so that test_set always
# contains the column addressed as test_set[data_selection]; dict.fromkeys
# drops duplicates while keeping the order.
test_columns = list(dict.fromkeys(
    ['pmid', 'title', 'abstract', label_selection, data_selection]
))

# .copy() detaches the hold-out frame from `data`, so adding or filling a
# column on it does not raise SettingWithCopyWarning.
test_set = data[test_columns].iloc[-10:].copy()
test_set

Unnamed: 0,pmid,title,abstract,text_label,abstract_clean
9100,39365782,Bio-efficacy of field aged novel class of long...,New classes of long-lasting insecticidal nets ...,pesticide,new classes of long-lasting insecticidal nets ...
9101,39367967,Comparative transcriptional analysis between s...,Aedes aegypti is an important vector of arbovi...,pesticide,aedes aegypti is an important vector of arbovi...
9102,39370610,Mps1-Targeted Molecular Design of Melatonin fo...,"Melatonin, a multifunctional class of natural ...",pesticide,"melatonin, a multifunctional class of natural ..."
9103,39381929,Metabolomic profiling of in vitro and in situ ...,This study was begun by establishing an in vit...,pesticide,this study was begun by establishing an in vit...
9104,39393466,Pharmacology and molecular modeling studies of...,We conducted electrophysiological and molecula...,pesticide,we conducted electrophysiological and molecula...
9105,39399211,Cytotoxicity induced by three commercial neoni...,Background: Neonicotinoid insecticides are use...,pesticide,background: neonicotinoid insecticides are use...
9106,39406000,Mutations in target gene confers resistance to...,Echinochloa phyllopogon is a noxious weed that...,pesticide,echinochloa phyllopogon is a noxious weed that...
9107,39411656,Genome-wide association study reveals the gene...,Crop resistance to herbicides is crucial for a...,pesticide,crop resistance to herbicides is crucial for a...
9108,39411737,Serum 25 hydroxycholecalciferol in periodontit...,Background: Very few studies have examined the...,pesticide,background: very few studies have examined the...
9109,39427538,Soybean isoflavones protect dopaminergic neuro...,Atrazine (ATR) is a broad-spectrum herbicide w...,pesticide,atrazine (atr) is a broad-spectrum herbicide w...


In [5]:
# Everything except the last 10 records of the selected text column
train_set = data[data_selection].iloc[:-10]
train_set

0       background: breast cancer subtype, as determin...
1       the aim of this study was to determine complic...
2       we report middle age man with skin hyperpigmen...
3       cutaneous squamous cell carcinoma (scc) exhibi...
4       since the early 1990s, jurisdictions around th...
                              ...                        
9095    plants are subjects of interest due to the sec...
9096    the sugarcane weevil (sphenophorus levis vauri...
9097    chlorpyrifos (cpf) has been used worldwide, bu...
9098    chlorantraniliprole (chl), a favored agricultu...
9099    multiple stimuli-responsiveness is an attracti...
Name: abstract_clean, Length: 9100, dtype: object

In [12]:
# A custom tokenizer replaces scikit-learn's regex tokenization, so
# token_pattern is set to None explicitly; leaving the default in place
# makes scikit-learn warn that token_pattern will not be used.
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer,
    token_pattern=None,
    max_features=5000,  # bigger vocabulary seems appropriate here
)

# create the TF-IDF document-term matrix from the training abstracts
X = vectorizer.fit_transform(train_set)
X



<9100x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 605968 stored elements in Compressed Sparse Row format>

In [13]:
# Vocabulary terms of the fitted vectorizer; indexed by feature (column) index
# when keywords are looked up.
feature_names = vectorizer.get_feature_names_out()

In [14]:
# generate a mapping from paper pubmed id -> index (in df)
#paper2idx = pd.Series(data.index, index=data['pmid'])

In [15]:
def sort_coo(coo_matrix):
    """Order the stored entries of a COO matrix from highest to lowest value.

    Returns a list of ``(column_index, value)`` pairs built from
    ``coo_matrix.col`` and ``coo_matrix.data``. Entries with equal values
    are ordered by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    # sort key is (value, column), reversed -> highest value first
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` entries of ``sorted_items`` to their feature names.

    ``sorted_items`` is a sequence of ``(feature_index, score)`` pairs; no
    sorting happens here, the given order is kept. Returns a dict
    ``{feature_name: score}`` with each score rounded to 3 decimals.
    """
    return {
        feature_names[col]: round(value, 3)
        for col, value in sorted_items[:topn]
    }

In [16]:
def get_keywords(vectorizer, feature_names, doc, top_n=10):
    """Return up to ``top_n`` keywords of ``doc``, ranked by TF-IDF score.

    The document is transformed with the given (already fitted) vectorizer,
    its non-zero entries are ranked with ``sort_coo`` and the leading ones
    are mapped to names with ``extract_topn_from_vector``. Only the names
    are returned, highest score first.
    """
    # transform() takes an iterable of documents, hence the one-element list
    doc_vector = vectorizer.transform([doc])

    ranked_entries = sort_coo(doc_vector.tocoo())
    top_scores = extract_topn_from_vector(feature_names, ranked_entries, top_n)

    return list(top_scores)

In [18]:
# Add an empty keywords column. assign() returns a new dataframe, so the
# column is not written onto a slice of `data` (which would raise
# SettingWithCopyWarning) and re-running the cell simply resets the column.
test_set = test_set.assign(keywords=None)
test_set

Unnamed: 0,pmid,title,abstract,text_label,abstract_clean,keywords
9100,39365782,Bio-efficacy of field aged novel class of long...,New classes of long-lasting insecticidal nets ...,pesticide,new classes of long-lasting insecticidal nets ...,
9101,39367967,Comparative transcriptional analysis between s...,Aedes aegypti is an important vector of arbovi...,pesticide,aedes aegypti is an important vector of arbovi...,
9102,39370610,Mps1-Targeted Molecular Design of Melatonin fo...,"Melatonin, a multifunctional class of natural ...",pesticide,"melatonin, a multifunctional class of natural ...",
9103,39381929,Metabolomic profiling of in vitro and in situ ...,This study was begun by establishing an in vit...,pesticide,this study was begun by establishing an in vit...,
9104,39393466,Pharmacology and molecular modeling studies of...,We conducted electrophysiological and molecula...,pesticide,we conducted electrophysiological and molecula...,
9105,39399211,Cytotoxicity induced by three commercial neoni...,Background: Neonicotinoid insecticides are use...,pesticide,background: neonicotinoid insecticides are use...,
9106,39406000,Mutations in target gene confers resistance to...,Echinochloa phyllopogon is a noxious weed that...,pesticide,echinochloa phyllopogon is a noxious weed that...,
9107,39411656,Genome-wide association study reveals the gene...,Crop resistance to herbicides is crucial for a...,pesticide,crop resistance to herbicides is crucial for a...,
9108,39411737,Serum 25 hydroxycholecalciferol in periodontit...,Background: Very few studies have examined the...,pesticide,background: very few studies have examined the...,
9109,39427538,Soybean isoflavones protect dopaminergic neuro...,Atrazine (ATR) is a broad-spectrum herbicide w...,pesticide,atrazine (atr) is a broad-spectrum herbicide w...,


In [26]:
# Extract the top TF-IDF keywords for every held-out abstract and store them
# as one comma-separated string per row. The column is addressed by name
# rather than by position, so the cell keeps working if columns change.
test_set['keywords'] = [
    ", ".join(get_keywords(vectorizer, feature_names, doc))
    for doc in test_set[data_selection]
]

# Write the annotated records to a tab-separated file for inspection,
# then display them (the last expression is the cell output).
test_set.to_csv('tf_idf_keyword_extraction_test.csv', sep="\t")
test_set