Topic modeling with title only

In [1]:
import pandas as pd
import os

references = pd.read_csv(os.path.join('..','results','paper_refs.csv'))

references

Unnamed: 0,Author,Title
0,"Russell S, Norvig P",Artificial Intelligence: a modern approach
1,"Witten IH, Frank E, Hall MA, et al",Data Mining: practical Machine Learning tools ...
2,"Zaki MJ, Meira Jr, W",Data Mining and analysis: fundamental concepts...
3,"Passfield L, Hopker JG",A mine of information: can sports analytics pr...
4,"Rein R, Memmert D",Big data and tactical analysis in elite soccer...
...,...,...
98,"Dalton-Barron NE, McLaren SJ, Black CJ, et al",Identifying contextual influences on training ...
99,"McLaren SJ, Weston M, Smith A, et al",Variability of physical performance and player...
100,"Oliveira WK, Jesus K, Andrade AD, et al",Monitoring training load in beach volleyball p...
101,"Düking P, Achtzehn S, Holmberg HC, Sperlich B",Integrated framework of load monitoring by a c...


In [2]:
#Using the nltk package for topic modeling
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sonayavrumyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords
from collections import Counter
import string

#Function for cleaning and preprocessing the titles
def preprocess_titles(titles):
    prepositions = set([
        # ... (list all prepositions here) ...
    ])
    stop_words = set(stopwords.words('english')).union(prepositions)

    cleaned_titles = []
    for title in titles:
        words = title.lower().translate(str.maketrans('', '', string.punctuation)).split()
        words = [word for word in words if word not in stop_words]
        cleaned_titles.extend(words)

    return cleaned_titles

#Applying topic modeling and printing the 15 most common words and their repetition frequency
cleaned_titles = preprocess_titles(references['Title'])
word_counts = Counter(cleaned_titles)
most_common_words = word_counts.most_common(15)

print(most_common_words)

[('football', 17), ('neural', 17), ('data', 16), ('training', 16), ('based', 16), ('team', 16), ('performance', 15), ('basketball', 14), ('artificial', 13), ('sports', 13), ('analysis', 12), ('network', 11), ('using', 10), ('injury', 10), ('mining', 9)]


Topic modeling with title and abstract

In [3]:
ref_abs = pd.read_csv(os.path.join('..','results','paper_refs_abstracts.csv'))

ref_abs

Unnamed: 0,Author,Title,Abstract,Year
0,"Russell S, Norvig P",Artificial Intelligence: a modern approach,From the enraged robots in the 1920 play R.U.R...,2015.0
1,"Witten IH, Frank E, Hall MA, et al",Data Mining: practical Machine Learning tools ...,,
2,"Zaki MJ, Meira Jr, W",Data Mining and analysis: fundamental concepts...,,
3,"Passfield L, Hopker JG",A mine of information: can sports analytics pr...,This paper explores the notion that the availa...,2017.0
4,"Rein R, Memmert D",Big data and tactical analysis in elite soccer...,Until recently tactical analysis in elite socc...,2016.0
...,...,...,...,...
98,"Dalton-Barron NE, McLaren SJ, Black CJ, et al",Identifying contextual influences on training ...,"Dalton-Barron, NE, McLaren, SJ, Black, CJ, Gra...",2021.0
99,"McLaren SJ, Weston M, Smith A, et al",Variability of physical performance and player...,The aims of this study were to establish sourc...,2021.0
100,"Oliveira WK, Jesus K, Andrade AD, et al",Monitoring training load in beach volleyball p...,,
101,"Düking P, Achtzehn S, Holmberg HC, Sperlich B",Integrated framework of load monitoring by a c...,Athletes schedule their training and recovery ...,2018.0


Most Common words in Abstract 

In [19]:
def preprocess_text(texts):
    prepositions = set([
        # ... (list all prepositions here) ...
    ])
    stop_words = set(stopwords.words('english')).union(prepositions)

    cleaned_texts = []
    for text in texts:
        if isinstance(text, str):
            # Replace hyphens and slashes with spaces, then split the text into words
            words = text.lower().translate(str.maketrans('-/', '  ')).split()
            # Split on other punctuations and filter out stop words
            words = [word for part in words for word in part.translate(str.maketrans('', '', string.punctuation)).split() if word not in stop_words]
            cleaned_texts.extend(words)

    return cleaned_texts

cleaned_abstracts = preprocess_text(ref_abs['Abstract'])
word_counts = Counter(cleaned_abstracts)
most_common_words = word_counts.most_common(15)

print(most_common_words)


[('data', 66), ('training', 51), ('team', 50), ('performance', 50), ('match', 31), ('risk', 26), ('injury', 26), ('players', 26), ('used', 25), ('sports', 24), ('based', 23), ('study', 22), ('classification', 22), ('may', 21), ('decision', 21)]


Most Common Words in Abstract reduced to their root (Stemming)

In [21]:
from nltk.stem import PorterStemmer

def preprocess_text(texts):
    prepositions = set([
        # ... (list all prepositions here) ...
    ])
    stop_words = set(stopwords.words('english')).union(prepositions)
    stemmer = PorterStemmer()

    cleaned_texts = []
    for text in texts:
        if isinstance(text, str):
            # Replace hyphens and slashes with spaces
            words = text.lower().translate(str.maketrans('-/', '  ')).split()
            words = [word for part in words for word in part.translate(str.maketrans('', '', string.punctuation)).split() if word not in stop_words]
            stemmed_words = [stemmer.stem(word) for word in words]
            cleaned_texts.extend(stemmed_words)

    return cleaned_texts

cleaned_abstracts = preprocess_text(ref_abs['Abstract'])
word_counts = Counter(cleaned_abstracts)
most_common_words = word_counts.most_common(3)

print(most_common_words)


[('data', 66), ('perform', 62), ('train', 60)]


Most Common word in Abstract reduced to their root defined word (Lematization)

In [16]:
import nltk

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sonayavrumyan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sonayavrumyan/nltk_data...


True

In [20]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to the first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def preprocess_text(texts):
    prepositions = set([
        # ... (list all prepositions here) ...
    ])
    stop_words = set(stopwords.words('english')).union(prepositions)
    lemmatizer = WordNetLemmatizer()

    cleaned_texts = []
    for text in texts:
        if isinstance(text, str):
            # Replace hyphens and slashes with spaces
            words = text.lower().translate(str.maketrans('-/', '  ')).split()
            words = [word for part in words for word in part.translate(str.maketrans('', '', string.punctuation)).split() if word not in stop_words]
            lem_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
            cleaned_texts.extend(lem_words)

    return cleaned_texts

cleaned_abstracts = preprocess_text(ref_abs['Abstract'])
word_counts = Counter(cleaned_abstracts)
most_common_words = word_counts.most_common(3)

print(most_common_words)


[('data', 66), ('team', 58), ('use', 51)]
