In [1]:
import os
import pandas as pd
import re
from pypdf import PdfReader

# Function to extract references from a PDF file using a context-aware pattern
def parse_references(pdf_path):
    """Extract the numbered reference list of a PDF as an Author/Title table.

    Pages are read in order. Nothing is collected until a page containing the
    text "References" is seen; on any page containing that text, everything
    before its first occurrence on the page is discarded. From then on, entries
    shaped like ``<number>. <text>. <text>.`` are collected, but only when their
    number is exactly one more than the last accepted entry, so stray numbers
    (years, page numbers, volumes) are skipped.

    Parameters
    ----------
    pdf_path : path of the PDF file, passed straight to ``PdfReader``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Author`` and ``Title``, one row per accepted reference, with
        newlines replaced by spaces and surrounding whitespace stripped. The
        frame is empty (but still has both columns) when no reference matched.
    """
    # "<number>. <authors>. <title>." -- DOTALL lets an entry wrap across lines.
    # Compiled once here rather than looked up again for every page.
    ref_pattern = re.compile(r'(\d+)\. (.*?\..*?)\.', re.DOTALL)

    reader = PdfReader(pdf_path)
    rows = []
    ref_section = False  # becomes True once a page containing "References" is seen
    last_ref_num = 0     # number of the most recently accepted reference

    for page in reader.pages:
        # Guard against a page yielding no text at all
        text = page.extract_text() or ''
        # Keep only what follows the heading text on this page
        if "References" in text:
            ref_section = True
            text = text.split("References", 1)[1]
        if not ref_section:
            continue

        for match in ref_pattern.finditer(text):
            current_ref_num = int(match.group(1))
            # Accept only the next number in sequence
            if current_ref_num != last_ref_num + 1:
                continue
            last_ref_num = current_ref_num
            # The first dot separates the author list from the title (as in the PDF)
            author, _, title = match.group(2).partition('.')
            rows.append((
                author.replace('\n', ' ').strip(),
                title.replace('\n', ' ').strip(),
            ))

    # Building the frame from finished rows avoids indexing column 0 of an empty
    # frame (KeyError when nothing matched) and the deprecated DataFrame.applymap
    return pd.DataFrame(rows, columns=['Author', 'Title'])

# Input paper and output folder, given as relative paths
pdf = os.path.join('..','paper','s40798-019-0202-3.pdf')
results_dir = os.path.join('..','results')

# Extract references using the context-aware pattern from the PDF file
references = parse_references(pdf)

# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing (otherwise a fresh checkout fails here)
os.makedirs(results_dir, exist_ok=True)
references.to_csv(os.path.join(results_dir,'paper_refs.csv'), index=False)

references

Unnamed: 0,Author,Title
0,"Russell S, Norvig P",Artificial Intelligence: a modern approach
1,"Witten IH, Frank E, Hall MA, et al",Data Mining: practical Machine Learning tools ...
2,"Zaki MJ, Meira Jr, W",Data Mining and analysis: fundamental concepts...
3,"Passfield L, Hopker JG",A mine of information: can sports analytics pr...
4,"Rein R, Memmert D",Big data and tactical analysis in elite soccer...
...,...,...
98,"Dalton-Barron NE, McLaren SJ, Black CJ, et al",Identifying contextual influences on training ...
99,"McLaren SJ, Weston M, Smith A, et al",Variability of physical performance and player...
100,"Oliveira WK, Jesus K, Andrade AD, et al",Monitoring training load in beach volleyball p...
101,"Düking P, Achtzehn S, Holmberg HC, Sperlich B",Integrated framework of load monitoring by a c...


In [2]:
# Fetch NLTK's "stopwords" package; the English stop-word list it provides is
# used further down to filter common words out of the reference titles
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sonayavrumyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from collections import Counter
import string

# function for cleaning and preprocessing the titles
def preprocess_titles(titles, extra_stop_words=()):
    """Turn a collection of titles into one flat list of cleaned words.

    Each title is lower-cased, stripped of ASCII punctuation
    (``string.punctuation``), split on whitespace, and filtered against NLTK's
    English stop-word list plus any ``extra_stop_words``.

    Parameters
    ----------
    titles : iterable of titles. Entries that are not strings (for example
        None or NaN for a missing title) are skipped instead of raising.
    extra_stop_words : optional iterable of additional words to drop, such as
        prepositions; compared case-insensitively. Defaults to none.

    Returns
    -------
    list of str
        All remaining words from all titles, in their original order.
    """
    stop_words = set(stopwords.words('english')).union(
        word.lower() for word in extra_stop_words
    )
    # Build the punctuation-removal table once instead of once per title
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned_titles = []
    for title in titles:
        # Missing titles have no .lower(); ignore them
        if not isinstance(title, str):
            continue
        words = title.lower().translate(strip_punctuation).split()
        cleaned_titles.extend(word for word in words if word not in stop_words)

    return cleaned_titles

# Count how often each cleaned word occurs across all reference titles and show
# the most frequent ones (a plain word-frequency tally, not a fitted topic model)
TOP_N_WORDS = 15  # how many of the most frequent words to report
cleaned_titles = preprocess_titles(references['Title'])
word_counts = Counter(cleaned_titles)
most_common_words = word_counts.most_common(TOP_N_WORDS)

print(most_common_words)


[('football', 17), ('neural', 17), ('data', 16), ('training', 16), ('based', 16), ('team', 16), ('performance', 15), ('basketball', 14), ('artificial', 13), ('sports', 13), ('analysis', 12), ('network', 11), ('using', 10), ('injury', 10), ('mining', 9)]
