Searching the PubMed database for papers using the most common topics found by topic modeling

In [58]:
import pandas as pd
import os

# Read the table of common strings (one row per string, with its frequency)
common_strings_path = os.path.join('..', 'results', 'common_strings.csv')
strings = pd.read_csv(common_strings_path)
strings

Unnamed: 0,String,Frequency
0,team,49
1,data,37
2,performance,34
3,model,30
4,base,26
...,...,...
75,team performance indicators,10
76,artificial neural network,5
77,support vector machine,5
78,one versus one,4


Subset dataset by only retaining keywords of interest

In [59]:
# Keep only the keywords whose text is longer than 4 characters
is_long_enough = strings['String'].map(len).gt(4)
strings = strings[is_long_enough]

# Hand-picked keywords considered too generic to be useful search terms
strings_to_remove = [
    "model", "train", "study", "match", "turnover", "professional", "player",
    "method", "analysis", "indicator", "accuracy", "outcome", "classify", "classifier",
    "approach", "decision", "provide", "position", "different", "network",
    "machine", "learning", "feature", "prediction", "logistic", "neural", "factor",
    "one versus", "versus one", "time course",
]

# Drop every row whose keyword exactly matches one of the generic entries
is_too_generic = strings['String'].isin(strings_to_remove)
strings = strings[~is_too_generic]
strings.head()

Unnamed: 0,String,Frequency
2,performance,34
7,football,26
8,classification,22
10,injury,21
15,basketball,20


Add a column that categorizes each keyword (ML/statistics related, sports related, injury or performance related)

In [60]:
# Substrings that identify each category of keyword
ml_statistics_keywords = [
    "classification", "regression", "decision tree",
    "vector", "neural network", "machine", "data mining",
    "artificial neural", "classification accuracy",
    "non linear", "deep learning",
]
sports_keywords = [
    "football", "basketball", "athlete", "ball", "one versus one",
    "olympic", "sport", "soccer", "team", "match",
]
injury_performance_keywords = [
    "injury", "performance", "training", "reaction",
    "risk factors", "activity recognition", "impact detection",
]


def categorize_string(s):
    """Return the category label of the string ``s``.

    Matching is by substring: ``s`` belongs to a category as soon as it
    contains any of that category's keywords. Categories are tried in a fixed
    order (ML/Statistics, then Sports, then Injury/Performance) and the first
    hit wins, so a string matching several categories gets the earliest one.
    A string matching none of them is labelled "Other".
    """
    # Built at call time so the current contents of the keyword lists are used
    ordered_categories = (
        ("ML/Statistics", ml_statistics_keywords),
        ("Sports", sports_keywords),
        ("Injury/Performance", injury_performance_keywords),
    )
    for label, keywords in ordered_categories:
        for keyword in keywords:
            if keyword in s:
                return label
    return "Other"

# Add the category of each keyword as a new column.
# `strings` is the result of boolean filtering, so assigning a column to it
# directly (`strings['Category'] = ...`) can raise pandas'
# SettingWithCopyWarning; `assign` returns a new DataFrame and avoids that.
strings = strings.assign(Category=strings['String'].apply(categorize_string))
strings.head()

Unnamed: 0,String,Frequency,Category
2,performance,34,Injury/Performance
7,football,26,Sports
8,classification,22,ML/Statistics
10,injury,21,Injury/Performance
15,basketball,20,Sports


Construct the search command

In [65]:
# Build the query, restricted to title and abstract: the keywords of one
# category are OR-ed together inside parentheses, and the per-category
# groups are then AND-ed with each other.
def _or_clause(terms):
    """Join ``terms`` into one parenthesised, OR-separated [Title/Abstract] clause."""
    return '("' + '"[Title/Abstract] OR "'.join(terms) + '"[Title/Abstract])'

category_clauses = strings.groupby('Category')['String'].apply(_or_clause)
search_query = ' AND '.join(category_clauses)
search_query

'("performance"[Title/Abstract] OR "injury"[Title/Abstract] OR "training"[Title/Abstract] OR "performance indicators"[Title/Abstract] OR "ground reaction"[Title/Abstract] OR "training load"[Title/Abstract] OR "activity recognition"[Title/Abstract] OR "reaction force"[Title/Abstract] OR "risk factors"[Title/Abstract] OR "impact detection"[Title/Abstract] OR "ground reaction force"[Title/Abstract]) AND ("classification"[Title/Abstract] OR "regression"[Title/Abstract] OR "neural network"[Title/Abstract] OR "machine learning"[Title/Abstract] OR "data mining"[Title/Abstract] OR "artificial neural"[Title/Abstract] OR "neural networks"[Title/Abstract] OR "logistic regression"[Title/Abstract] OR "decision tree"[Title/Abstract] OR "support vector"[Title/Abstract] OR "non linear"[Title/Abstract] OR "classification accuracy"[Title/Abstract] OR "vector machine"[Title/Abstract] OR "deep learning"[Title/Abstract] OR "artificial neural network"[Title/Abstract] OR "support vector machine"[Title/Abstra

Search PubMed and show the results

In [76]:
from Bio import Entrez

def fetch_from_pubmed(query, email):
    """Search PubMed for ``query`` and fetch the records of all hits.

    Parameters
    ----------
    query : str
        Search term passed to ESearch as ``term``.
    email : str
        Contact address set on ``Entrez.email`` before any request is made.

    Returns
    -------
    The parsed EFetch response (as returned by ``Entrez.read``). When the
    search yields no IDs, a plain dict with empty ``'PubmedArticle'`` and
    ``'PubmedBookArticle'`` lists is returned instead, so callers can still
    iterate over ``result['PubmedArticle']``.
    """
    # The Pubmed API requires an email to be used
    Entrez.email = email

    search_handle = Entrez.esearch(
        db='pubmed', sort='relevance', term=query,
        # Set how many results to return (we do not expect more than 10000 or so
        # based on the paper results)
        retmax='10000', retmode='xml',
        # Just like the paper, only search up to May 2018.
        # NOTE(review): no `datetype` is passed, so the date range applies to
        # whatever date field NCBI uses by default - confirm this is the
        # intended one (e.g. publication date).
        mindate='1900/01', maxdate='2018/05')
    try:
        id_list = Entrez.read(search_handle)['IdList']
    finally:
        # Release the connection even if parsing fails
        search_handle.close()

    # An empty ID list cannot be fetched; short-circuit with an empty result
    if not id_list:
        return {'PubmedArticle': [], 'PubmedBookArticle': []}

    fetch_handle = Entrez.efetch(db='pubmed', retmode='xml', id=id_list)
    try:
        return Entrez.read(fetch_handle)
    finally:
        fetch_handle.close()

# Contact address sent to NCBI with every request
mazza = 'matteo.mazzarelli@gmail.com'

# Despite its name, this holds the full parsed records, not only the IDs
paper_ids = fetch_from_pubmed(search_query, mazza)

author_list = []
title_list = []

for paper in paper_ids['PubmedArticle']:
    article = paper['MedlineCitation']['Article']

    author_names = []
    # Records without any listed author have no 'AuthorList' entry at all,
    # so fall back to an empty list instead of raising KeyError
    for author in article.get('AuthorList', []):
        # Some authors might not have a LastName or Initials, handle these cases
        last_name = author.get('LastName', '')
        initials = author.get('Initials', '')
        name = f"{last_name} {initials}".strip()
        if not name:
            # Group authors carry a CollectiveName instead of personal names
            name = str(author.get('CollectiveName', '')).strip()
        if name:
            # Skip nameless entries so the joined string has no dangling commas
            author_names.append(name)

    # Join all authors' names with commas
    author_list.append(', '.join(author_names))
    title_list.append(article['ArticleTitle'])

# Create a pandas df with the results
searched_papers = pd.DataFrame({'Author': author_list, 'Title': title_list})

searched_papers.head()

Unnamed: 0,Author,Title
0,"Hägglund M, Waldén M",Risk factors for acute knee injury in female y...
1,"Opar DA, Williams MD, Timmins RG, Hickey J, Du...",Eccentric hamstring strength and hamstring inj...
2,"Hewett TE, Myer GD, Ford KR, Heidt RS, Colosim...",Biomechanical measures of neuromuscular contro...
3,"Reale R, Slater G, Burke LM",Acute-Weight-Loss Strategies for Combat Sports...
4,"Plisky PJ, Rauh MJ, Kaminski TW, Underwood FB",Star Excursion Balance Test as a predictor of ...


Compare papers found with papers in the references list

In [75]:
# Load the CSV file containing the authors and titles of the 58 papers used in
# the systematic review
sys_references = pd.read_csv(os.path.join('..', 'results', 'paper_refs.csv'))

# Strip every character that is not an ASCII letter, digit or space from the
# titles of both tables, to smooth over punctuation inconsistencies
for papers in (searched_papers, sys_references):
    papers['Title'] = papers['Title'].str.replace(r"[^A-Za-z0-9 ]", "", regex=True)

# Inner join on the columns the two tables share: the papers found in both
common_papers = sys_references.merge(searched_papers, how='inner')
common_papers

Unnamed: 0,Author,Title
0,"Rein R, Memmert D",Big data and tactical analysis in elite soccer...
1,"Novatchkov H, Baca A",Artificial intelligence in sports on the examp...
2,"Kempe M, Grunz A, Memmert D",Detecting tactical patterns in basketball comp...
3,"Link D, Hoernig M",Individual ball possession in soccer
4,"Robertson S, Back N, Bartlett JD",Explaining match outcome in elite Australian R...
5,"Ertelt T, Solomonovs I, Gronwald T",Enhancement of force patterns classification b...
