Searching Pubmed's database for papers by using the most common topics found by topic modeling

In [8]:
import pandas as pd
import os

#Using the nltk package for topic modeling
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('brown')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.corpus import brown
from collections import Counter
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
import math

#Loading the CSV file with references
ref_abs = pd.read_csv(os.path.join('..','results','refs_abstracts_sys.csv'))
ref_abs.head()

#Function for cleaning and preprocessing the abstracts still using NLTK
def preprocess_text(texts):
    prepositions = set([
    
    ])
    #Filtering out common words that are "meaningless" (such as prepositions) using stop words
    stop_words = set(stopwords.words('english')).union(prepositions)

    cleaned_texts = []
    for text in texts:
        if isinstance(text, str):
            #Replace hyphens and slashes with spaces, then split the text into words
            #Lowercasing, punctuation removal, word tokenization, and stop word filtering
            words = text.lower().translate(str.maketrans('-/', '  ')).split()
            #Split on other punctuations and filter out stop words
            words = [word for part in words for word in part.translate(str.maketrans('', '', string.punctuation)).split() if word not in stop_words]
            cleaned_texts.extend(words)

    return cleaned_texts
#Preprocessing abstracts and counting
cleaned_abstracts = preprocess_text(ref_abs['Abstract'])
word_counts = Counter(cleaned_abstracts)
most_common_words_basic_A = word_counts.most_common(30)
document_word_frequencies = most_common_words_basic_A

# Load the corpus and calculate the total number of documents
documents = brown.fileids()
total_documents = len(documents)

# Calculate document frequency for each word in your list
document_frequencies = {}
for word, _ in document_word_frequencies:
    document_frequencies[word] = sum(1 for doc in documents if word in brown.words(doc))

# Calculate IDF for each word
idf_values = {word: math.log(total_documents / (df + 1)) for word, df in document_frequencies.items()}

# Example: Calculate TF-IDF for each word in your document
tf_idf = {word: freq * idf_values[word] for word, freq in document_word_frequencies}

print(idf_values)

[nltk_data] Downloading package stopwords to /home/matteo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/matteo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/matteo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /home/matteo/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /home/matteo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'team': 2.631089159966082, 'data': 2.3434070875143007, 'performance': 2.0249533563957662, 'classification': 3.575550768806933, 'ball': 2.6882475738060303, 'match': 2.7181005369557116, 'indicators': 4.422848629194137, 'football': 3.2188758248682006, 'learning': 2.4304184645039304, 'based': 1.783791299578878, 'regression': 4.8283137373023015, 'analysis': 2.2256240518579173, 'used': 0.7298111649315369, 'using': 1.7037485919053417, 'accuracy': 3.170085660698769, 'machine': 2.631089159966082, 'athletes': 4.605170185988092, 'decision': 2.0714733720306593, 'field': 1.4354846053106625, 'model': 2.8134107167600364, 'study': 1.487220279709851, 'players': 3.575550768806933, 'different': 1.0216512475319812, 'outcome': 3.270169119255751, 'injury': 3.324236340526027, 'may': 0.4338645826298623, 'time': 0.11204950380862293, 'features': 2.207274913189721, 'method': 1.8078888511579387, 'models': 2.995732273553991}


In [9]:
print(tf_idf)

{'team': 73.6704964790503, 'data': 63.27199136288612, 'performance': 52.64878726628992, 'classification': 67.93546460733172, 'ball': 48.388456328508546, 'match': 43.489608591291386, 'indicators': 66.34272943791206, 'football': 48.28313737302301, 'learning': 34.02585850305503, 'based': 23.189286894525416, 'regression': 62.76807858492992, 'analysis': 26.70748862229501, 'used': 8.757733979178443, 'using': 20.4449831028641, 'accuracy': 38.041027928385226, 'machine': 28.9419807596269, 'athletes': 50.65687204586901, 'decision': 22.786207092337253, 'field': 15.790330658417288, 'model': 30.9475178843604, 'study': 16.359423076808362, 'players': 39.331058456876264, 'different': 11.238163722851793, 'outcome': 35.97186031181326, 'injury': 33.24236340526027, 'may': 4.338645826298623, 'time': 1.1204950380862293, 'features': 22.07274913189721, 'method': 18.078888511579386, 'models': 26.961590461985917}
