Searching PubMed's database for papers using the most common topics found by topic modeling

In [90]:
import pandas as pd
import os

# Read the table of common strings from the results folder
common_strings_path = os.path.join('..', 'results', 'common_strings.csv')
strings = pd.read_csv(common_strings_path)
strings

Unnamed: 0,String,Frequency
0,team,47
1,use,38
2,performance,33
3,base,30
4,data,29
...,...,...
68,146a 5p,5
69,team performance indicators,10
70,artificial neural network,5
71,support vector machine,5


Subset dataset by only retaining keywords of interest

In [91]:
# Keep only keywords that are longer than 4 characters and contain no digits
long_enough = strings['String'].map(len) > 4
has_digit = strings['String'].map(lambda text: any(ch.isdigit() for ch in text))
strings = strings[long_enough & ~has_digit]

# Keywords we consider too generic to be useful; these are dropped by hand
strings_to_remove = [
    "model", "train", "study", "match", "turnover", "professional", "player",
    "method", "analysis", "indicator", "accuracy", "outcome", "classify", "classifier",
    "approach", "decision", "provide", "position", "different", "network",
    "machine", "learning", "feature", "prediction", "logistic", "neural",
]

# Drop every row whose keyword is exactly one of the generic entries above
too_generic = strings['String'].isin(strings_to_remove)
strings = strings[~too_generic]
strings.head()

Unnamed: 0,String,Frequency
2,performance,33
8,football,22
10,classification,20
14,injury,19
15,basketball,19


Add a column that categorizes each keyword (ML/statistics related, sports related, injury or performance related)

In [92]:
# Keyword vocabularies, one list per category
ml_statistics_keywords = [
    "classification", "regression", "decision tree",
    "vector machine", "neural network", "machine learning", "data mining",
    "artificial neural", "neural networks", "support vector", "classification accuracy",
    "artificial neural network", "support vector machine", "non linear",
]
sports_keywords = [
    "football", "basketball", "athlete", "ball possession",
    "olympic", "sport", "soccer", "team performance", "match outcome", "individual ball",
    "olympic games", "australian football", "team performance indicators",
]
injury_performance_keywords = [
    "injury", "performance", "training", "ground reaction", "training load",
    "performance indicators",
]


def categorize_string(s):
    """Return the category label for the keyword string ``s``.

    A string belongs to a category when any keyword of that category occurs in
    it as a substring. Categories are tried in a fixed priority order
    (ML/Statistics, then Sports, then Injury/Performance) and the first match
    wins; a string matching none of them is labelled "Other".
    """
    # Order matters: it reproduces the precedence between overlapping vocabularies
    category_rules = (
        ("ML/Statistics", ml_statistics_keywords),
        ("Sports", sports_keywords),
        ("Injury/Performance", injury_performance_keywords),
    )
    for label, vocabulary in category_rules:
        for keyword in vocabulary:
            if keyword in s:
                return label
    return "Other"

# Add the category of each keyword as a new column.
# `strings` is the result of boolean filtering, so the column is added with
# `assign` (which returns a new DataFrame) instead of item assignment; writing
# into a filtered frame is what triggers pandas' SettingWithCopyWarning.
strings = strings.assign(Category=strings['String'].apply(categorize_string))
strings.head()

Unnamed: 0,String,Frequency,Category
2,performance,33,Injury/Performance
8,football,22,Sports
10,classification,20,ML/Statistics
14,injury,19,Injury/Performance
15,basketball,19,Sports


Construct the search command

In [93]:
# Build the search query: keywords of one category are OR-ed together and the
# per-category groups are AND-ed, so a hit must match at least one keyword of
# every category.
# "Other" is the fallback label for keywords that fit no category; it is left
# out so that uncategorized leftovers can never become a mandatory AND clause.
categorized = strings[strings['Category'] != 'Other']
category_clauses = categorized.groupby('Category')['String'].apply(
    lambda keywords: '("' + '" OR "'.join(keywords) + '")'
)
search_query = ' AND '.join(category_clauses)
search_query

'("performance" OR "injury" OR "training" OR "performance indicators" OR "ground reaction" OR "training load") AND ("classification" OR "regression" OR "neural network" OR "machine learning" OR "data mining" OR "artificial neural" OR "neural networks" OR "logistic regression" OR "decision tree" OR "support vector" OR "non linear" OR "classification accuracy" OR "vector machine" OR "artificial neural network" OR "support vector machine") AND ("football" OR "basketball" OR "athlete" OR "olympic" OR "sport" OR "soccer" OR "team performance" OR "match outcome" OR "olympic games" OR "ball possession" OR "individual ball" OR "australian football" OR "team performance indicators")'

Search PubMed and show the results

In [111]:
from Bio import Entrez

def fetch_from_pubmed(query, email):
    """Search PubMed for ``query`` and download the records of all hits.

    Parameters
    ----------
    query : str
        Search term passed to ``Entrez.esearch``.
    email : str
        Contact address stored in ``Entrez.email`` before any request is made.

    Returns
    -------
    The parsed ``Entrez.efetch`` result, whose ``'PubmedArticle'`` entry holds
    the article records. When the search yields no IDs, a plain dict with an
    empty ``'PubmedArticle'`` list is returned and no fetch request is sent.
    """
    # The Pubmed API requires an email to be used
    Entrez.email = email

    # NOTE(review): no `datetype` is passed, so the date range below applies to
    # whichever date field E-utilities uses by default -- confirm that this
    # matches the publication-date cutoff used in the paper.
    search_handle = Entrez.esearch(
        db='pubmed', sort='relevance', term=query,
        # Set how many results to return (we do not expect more than 10000 or so based on the paper results)
        retmax='10000', retmode='xml',
        # Just like the paper, only search up to May 2018
        mindate='1900/01', maxdate='2018/05')
    try:
        id_list = Entrez.read(search_handle)['IdList']
    finally:
        # Release the connection even if parsing fails
        search_handle.close()

    # Nothing found: skip the fetch, which has no IDs to request
    if not id_list:
        return {'PubmedArticle': []}

    fetch_handle = Entrez.efetch(db='pubmed', retmode='xml', id=id_list)
    try:
        return Entrez.read(fetch_handle)
    finally:
        fetch_handle.close()

mazza = 'matteo.mazzarelli@gmail.com'

# Despite its name, `paper_ids` holds the fetched records, not just their IDs
paper_ids = fetch_from_pubmed(search_query, mazza)

author_list = []
title_list = []

for paper in paper_ids['PubmedArticle']:
    article = paper['MedlineCitation']['Article']

    author_names = []
    # Some records carry no AuthorList at all; treat them as having no authors
    # instead of failing with a KeyError
    for author in article.get('AuthorList', []):
        # Some authors might not have a LastName or Initial, handle these cases
        last_name = author.get('LastName', '')
        initials = author.get('Initials', '')
        name = f"{last_name} {initials}".strip()
        if not name:
            # Group authors are stored under CollectiveName rather than LastName/Initials
            name = str(author.get('CollectiveName', '')).strip()
        if name:
            # Skip entries with no usable name so the joined string has no empty slots
            author_names.append(name)

    # Join all authors' names with commas
    author_list.append(', '.join(author_names))
    title_list.append(article['ArticleTitle'])

# Create a pandas df with the results
searched_papers = pd.DataFrame({'Author': author_list, 'Title': title_list})

searched_papers.head()

Unnamed: 0,Author,Title
0,"Opar DA, Williams MD, Timmins RG, Hickey J, Du...",Eccentric hamstring strength and hamstring inj...
1,"Fuller CW, Ekstrand J, Junge A, Andersen TE, B...",Consensus statement on injury definitions and ...
2,Hölmich P,Groin injuries in athletes--development of cli...
3,"Hägglund M, Waldén M",Risk factors for acute knee injury in female y...
4,"Hewett TE, Myer GD, Ford KR, Heidt RS, Colosim...",Biomechanical measures of neuromuscular contro...


Compare papers found with papers in the references list

In [112]:
# Loading the CSV file containing authors and titles of the 58 papers used in the systematic review
sys_references = pd.read_csv(os.path.join('..','results','paper_refs.csv'))


def _title_key(titles):
    """Reduce a Series of titles to a comparison key.

    The key is the lower-cased title with every run of punctuation, underscores
    and whitespace collapsed to a single space, so that differences in case,
    spacing or a trailing period do not prevent a match. Missing titles become
    the empty string.
    """
    return (titles.fillna('').astype(str)
            .str.lower()
            .str.replace(r'[\W_]+', ' ', regex=True)
            .str.strip())


# Check how many papers intersect between the 2 datasets (inner join).
# An exact join on the raw Author and Title strings only matches when both
# sources format them identically, so the join is done on a normalized title key.
# NOTE(review): assumes paper_refs.csv has a 'Title' column -- confirm.
searched_title_keys = _title_key(searched_papers['Title']).drop_duplicates()
# Empty keys carry no information and must not match references with a missing title
searched_title_keys = searched_title_keys[searched_title_keys != '']

common_papers = (
    sys_references
    .assign(_title_key=_title_key(sys_references['Title']))
    .merge(searched_title_keys.to_frame('_title_key'), on='_title_key', how='inner')
    # Drop the helper key so the result keeps exactly the reference table's columns
    .drop(columns='_title_key')
)
common_papers.head()

Unnamed: 0,Author,Title
